Improve HTML validation

This commit is contained in:
David Bomba 2025-01-04 10:46:21 +11:00
parent 6eaa0ccc41
commit 97ae948618
4 changed files with 950 additions and 531 deletions

View File

@ -79,16 +79,141 @@ class PdfBuilder
/**
 * Sanitize the document and return it serialized as an HTML string.
 *
 * Every "%24" sequence in the serialized markup is turned back into a
 * literal "$" before the string is returned.
 *
 * @param bool $final Whether this is the final compilation; the value is not read by this method.
 * @return string
 */
public function getCompiledHTML($final = false)
{
    // Strip blocklisted elements/attributes before the DOM is serialized.
    $this->cleanHtml();

    return str_replace('%24', '$', $this->document->saveHTML());
}
/**
 * Strip blocklisted elements and attributes from $this->document, in place.
 *
 * Starting at the document element, the tree is walked and:
 *  - every element whose tag name is on the blocklist is removed together with its subtree;
 *  - every attribute whose name starts with "on" is removed;
 *  - data/href/meta/link, src and style attributes are removed when their value
 *    matches the blocklist pattern for that attribute;
 *  - any attribute whose value contains "expression" or a javascript:, vbscript:
 *    or livescript: scheme is removed.
 *
 * Nothing happens when there is no document or the document has no root element.
 *
 * @return self
 * @throws \RuntimeException When the traversal fails. The document is replaced by an
 *                           empty one first, and the original error is attached as
 *                           the previous exception.
 */
private function cleanHtml(): self
{
    if (!$this->document || !$this->document->documentElement) {
        return $this;
    }

    $blocked_elements = [
        'iframe', 'form', 'object', 'embed',
        'applet', 'audio', 'video',
        'frame', 'frameset', 'base', 'svg',
    ];

    // Decides whether one attribute (name and value already lower-cased) must be dropped.
    $isUnsafeAttribute = static function (string $name, string $value): bool {
        // Any attribute starting with "on" is treated as an event handler.
        if (strpos($name, 'on') === 0) {
            return true;
        }

        if (in_array($name, ['data', 'href', 'meta', 'link'], true)) {
            // Dangerous URLs/protocols and loopback references.
            if (preg_match('/(javascript|data|file|ftp|jar|dict|gopher|ldap|smb|php|alert|prompt|confirm):|\/\/\/\/+|127\.0\.0\.1|localhost/i', $value)) {
                return true;
            }
        } elseif ($name === 'src') {
            // Same idea as above, but "data:" is not rejected outright here.
            if (preg_match('/(javascript|file|ftp|jar|dict|gopher|ldap|smb|php):|\/\/\/\/+|127\.0\.0\.1|localhost/i', $value)) {
                return true;
            }
            // data: URLs are only accepted for image types.
            if (strpos($value, 'data:') === 0 && !preg_match('/^data:image\//i', $value)) {
                return true;
            }
            // Loopback references. This is an unanchored substring match, so any
            // value that merely contains one of these fragments is rejected too.
            if (preg_match('/localhost|127\.|0\.0\.0\.0|::1|0:0:0:0:0:0:0:1/i', $value)) {
                return true;
            }
        } elseif ($name === 'style') {
            // The (?!ant) lookahead keeps "!important" declarations from being
            // mistaken for an import; "import" and "@import" are still rejected.
            if (preg_match('/(expression|javascript|behavior|vbscript):|url\s*\(|import(?!ant)|eval\s*\(|-moz-binding|behavior|expression/i', $value)) {
                return true;
            }
        }

        // Catch-all applied to every attribute, whatever its name.
        return (bool) preg_match('/expression|javascript:|vbscript:|livescript:/i', $value);
    };

    // Recursively sanitizes $node and everything below it.
    $sanitize = static function (\DOMNode $node) use (&$sanitize, $blocked_elements, $isUnsafeAttribute) {
        if ($node instanceof \DOMElement) {
            if (in_array(strtolower($node->tagName), $blocked_elements, true)) {
                // Removing the element discards its whole subtree, so there is
                // no point in descending into it.
                if ($node->parentNode) {
                    $node->parentNode->removeChild($node);
                }

                return;
            }

            // Collect names first: removing attributes while iterating the
            // live attribute map would disturb the iteration.
            $unsafe_names = [];
            foreach ($node->attributes as $attr) {
                if ($isUnsafeAttribute(strtolower($attr->name), strtolower($attr->value))) {
                    $unsafe_names[] = $attr->name;
                }
            }

            foreach ($unsafe_names as $unsafe_name) {
                $node->removeAttribute($unsafe_name);
            }
        }

        if ($node->hasChildNodes()) {
            // Snapshot the live child list because children may be removed while we walk it.
            foreach (iterator_to_array($node->childNodes) as $child) {
                $sanitize($child);
            }
        }
    };

    try {
        $sanitize($this->document->documentElement);
    } catch (\Throwable $e) {
        info('Error cleaning HTML: ' . $e->getMessage());
        // Fail closed: clear the document so unsanitized content cannot be emitted,
        // whether the failure was an Exception or an Error.
        $this->document = new \DOMDocument();

        // Generic message for callers; the cause stays reachable via getPrevious().
        throw new \RuntimeException('HTML sanitization failed', 0, $e);
    }

    return $this;
}
/**
* Generate the template
*

View File

@ -129,8 +129,140 @@ class PdfMaker
*/
public function getCompiledHTML($final = false)
{
    // Sanitize the DOM before it is serialized; $final is not read here.
    $this->cleanHtml();

    // Turn "%24" sequences in the serialized markup back into a literal "$".
    return str_replace('%24', '$', $this->document->saveHTML());
}
/**
 * Strip blocklisted elements and attributes from $this->document, in place.
 *
 * Starting at the document element, the tree is walked and:
 *  - every element whose tag name is on the blocklist is removed together with its subtree;
 *  - every attribute whose name starts with "on" is removed;
 *  - data/href/meta/link, src and style attributes are removed when their value
 *    matches the blocklist pattern for that attribute;
 *  - any attribute whose value contains "expression" or a javascript:, vbscript:
 *    or livescript: scheme is removed.
 *
 * Nothing happens when there is no document or the document has no root element.
 *
 * @return self
 * @throws \RuntimeException When the traversal fails. The document is replaced by an
 *                           empty one first, and the original error is attached as
 *                           the previous exception.
 */
private function cleanHtml(): self
{
    if (!$this->document || !$this->document->documentElement) {
        return $this;
    }

    $blocked_elements = [
        'iframe', 'form', 'object', 'embed',
        'applet', 'audio', 'video',
        'frame', 'frameset', 'base', 'svg',
    ];

    // Decides whether one attribute (name and value already lower-cased) must be dropped.
    $isUnsafeAttribute = static function (string $name, string $value): bool {
        // Any attribute starting with "on" is treated as an event handler.
        if (strpos($name, 'on') === 0) {
            return true;
        }

        if (in_array($name, ['data', 'href', 'meta', 'link'], true)) {
            // Dangerous URLs/protocols and loopback references.
            if (preg_match('/(javascript|data|file|ftp|jar|dict|gopher|ldap|smb|php|alert|prompt|confirm):|\/\/\/\/+|127\.0\.0\.1|localhost/i', $value)) {
                return true;
            }
        } elseif ($name === 'src') {
            // Same idea as above, but "data:" is not rejected outright here.
            if (preg_match('/(javascript|file|ftp|jar|dict|gopher|ldap|smb|php):|\/\/\/\/+|127\.0\.0\.1|localhost/i', $value)) {
                return true;
            }
            // data: URLs are only accepted for image types.
            if (strpos($value, 'data:') === 0 && !preg_match('/^data:image\//i', $value)) {
                return true;
            }
            // Loopback references. This is an unanchored substring match, so any
            // value that merely contains one of these fragments is rejected too.
            if (preg_match('/localhost|127\.|0\.0\.0\.0|::1|0:0:0:0:0:0:0:1/i', $value)) {
                return true;
            }
        } elseif ($name === 'style') {
            // The (?!ant) lookahead keeps "!important" declarations from being
            // mistaken for an import; "import" and "@import" are still rejected.
            if (preg_match('/(expression|javascript|behavior|vbscript):|url\s*\(|import(?!ant)|eval\s*\(|-moz-binding|behavior|expression/i', $value)) {
                return true;
            }
        }

        // Catch-all applied to every attribute, whatever its name.
        return (bool) preg_match('/expression|javascript:|vbscript:|livescript:/i', $value);
    };

    // Recursively sanitizes $node and everything below it.
    $sanitize = static function (\DOMNode $node) use (&$sanitize, $blocked_elements, $isUnsafeAttribute) {
        if ($node instanceof \DOMElement) {
            if (in_array(strtolower($node->tagName), $blocked_elements, true)) {
                // Removing the element discards its whole subtree, so there is
                // no point in descending into it.
                if ($node->parentNode) {
                    $node->parentNode->removeChild($node);
                }

                return;
            }

            // Collect names first: removing attributes while iterating the
            // live attribute map would disturb the iteration.
            $unsafe_names = [];
            foreach ($node->attributes as $attr) {
                if ($isUnsafeAttribute(strtolower($attr->name), strtolower($attr->value))) {
                    $unsafe_names[] = $attr->name;
                }
            }

            foreach ($unsafe_names as $unsafe_name) {
                $node->removeAttribute($unsafe_name);
            }
        }

        if ($node->hasChildNodes()) {
            // Snapshot the live child list because children may be removed while we walk it.
            foreach (iterator_to_array($node->childNodes) as $child) {
                $sanitize($child);
            }
        }
    };

    try {
        $sanitize($this->document->documentElement);
    } catch (\Throwable $e) {
        info('Error cleaning HTML: ' . $e->getMessage());
        // Fail closed: clear the document so unsanitized content cannot be emitted,
        // whether the failure was an Exception or an Error.
        $this->document = new \DOMDocument();

        // Generic message for callers; the cause stays reachable via getPrevious().
        throw new \RuntimeException('HTML sanitization failed', 0, $e);
    }

    return $this;
}
}

View File

@ -44,7 +44,7 @@
"aws/aws-sdk-php": "^3.319",
"babenkoivan/elastic-scout-driver": "^4.0",
"bacon/bacon-qr-code": "^2.0",
"beganovich/snappdf": "dev-master",
"beganovich/snappdf": "^5.0",
"braintree/braintree_php": "^6.0",
"btcpayserver/btcpayserver-greenfield-php": "^2.6",
"checkout/checkout-sdk-php": "^3.0",
@ -218,10 +218,6 @@
{
"type": "vcs",
"url": "https://github.com/beganovich/php-ansible"
},
{
"type": "vcs",
"url": "https://github.com/turbo124/snappdf"
}
],
"minimum-stability": "dev",

1216
composer.lock generated

File diff suppressed because it is too large Load Diff