429 lines
14 KiB
PHP
429 lines
14 KiB
PHP
<?php
|
|
|
|
namespace App\Services\Pdf;
|
|
|
|
/**
 * Allow-list based HTML sanitizer.
 *
 * The markup is parsed with DOMDocument, every element that is not on the
 * element allow-list is removed together with its subtree, and the attributes
 * of the remaining elements are reduced to those on the attribute allow-list.
 * Inline `style` attributes are additionally run through a CSS filter.
 */
class Purify
{
    /**
     * Lower-case tag names that survive sanitization.
     *
     * 'title' is listed once (document structure) and also covers the SVG
     * <title> element.
     */
    private static array $allowed_elements = [
        // Document structure
        'html', 'head', 'body', 'meta', 'title', 'style',

        // Root element
        'root',

        // Block Elements
        'div', 'p', 'section', 'header', 'footer',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'blockquote', 'pre',

        // Text Elements
        'span', 'strong', 'em', 'b', 'i', 'u', 'small',
        'sub', 'sup', 'del', 'ins',

        // Line Breaks
        'br', 'hr',

        // Lists
        'ul', 'ol', 'li', 'dl', 'dt', 'dd',

        // Tables
        'table', 'thead', 'tbody', 'tfoot', 'tr', 'th', 'td',

        // Media & Links
        'img', 'a',

        // Template specific
        'ninja',

        // SVG Elements
        'svg', 'path', 'rect', 'circle', 'ellipse', 'line', 'polyline',
        'polygon', 'g', 'text', 'tspan', 'defs', 'use',
    ];

    /**
     * Attribute allow-list: attribute name => permitted values.
     *
     * ['*'] permits any value. Keys MUST be lower-case because attribute names
     * are lower-cased before the lookup. Wildcard value patterns (containing
     * '*') are only interpreted for the attributes in $pattern_attributes;
     * for every other attribute the values are compared literally.
     */
    private static array $allowed_attributes = [
        // Global Attributes
        'class' => ['*'],
        'id' => ['*'],
        'style' => ['*'],
        'title' => ['*'],
        'lang' => ['*'],
        'dir' => ['*'], // Allow all dir values
        'tabindex' => ['*'],
        'data-*' => ['*'], // Custom data attributes
        'data-ref' => ['*'],
        'data-element' => ['*'],
        'data-state' => ['*'],

        // SVG
        'd' => ['*'],
        'viewbox' => ['*'],
        'xmlns' => ['http://www.w3.org/2000/svg'],
        'fill' => ['*'],
        'stroke' => ['*'],
        'stroke-width' => ['*'],
        'cx' => ['*'],
        'cy' => ['*'],
        'r' => ['*'],
        'x' => ['*'],
        'y' => ['*'],
        'transform' => ['*'],
        'points' => ['*'],
        'preserveaspectratio' => ['*'],
        'version' => ['*'],
        'xlink:href' => ['#*'], // Only allow internal references
        'fill-rule' => ['nonzero', 'evenodd'],

        // Layout & Presentation
        'align' => ['left', 'center', 'right', 'justify'],
        'valign' => ['top', 'middle', 'bottom', 'baseline'],
        'width' => ['*'],
        'height' => ['*'],
        'cellspacing' => ['*'],
        'cellpadding' => ['*'],
        'border' => ['*'],
        'min-width' => ['*'],
        'max-width' => ['*'],

        // Table-specific
        'colspan' => ['*'],
        'rowspan' => ['*'],
        'scope' => ['row', 'col', 'rowgroup', 'colgroup'],
        'headers' => ['*'],

        // Links & Media
        'href' => ['http://*', 'https://*', 'data:image/*', '${*}', '$*.*'],
        'src' => ['http://*', 'https://*', 'data:image/*', '${*}', '$*.*'],
        'alt' => ['*'],
        'target' => ['_blank', '_self'],
        'rel' => ['nofollow', 'noopener', 'noreferrer'],

        // Lists
        'type' => ['1', 'A', 'a', 'I', 'i', 'disc', 'circle', 'square'],
        'start' => ['*'],

        // Accessibility
        'aria-*' => ['*'],
        'role' => ['*'],

        // Template specific
        'hidden' => ['*'],
        'zoom' => ['*'],
        'size' => ['*'],

        // Meta tag attributes
        'charset' => ['*'],
        'name' => ['*'],
        'content' => ['*'],
        'http-equiv' => ['cache-control'],
        'viewport' => ['*'],
    ];

    /**
     * Attributes whose allowed values are wildcard patterns rather than literals.
     */
    private static array $pattern_attributes = ['src', 'href', 'xlink:href'];

    /**
     * Regexes that, when matched anywhere in a style value, cause the whole
     * value to be discarded. They are applied to a lower-cased copy of the CSS,
     * so every pattern must be lower-case or carry the `i` modifier.
     */
    private static array $dangerous_css_patterns = [
        // JavaScript execution patterns
        '/expression\s*\(/', // CSS expressions
        '/javascript\s*:/', // JavaScript protocol
        '/behaviou?r\s*:/', // IE behavior (either spelling)
        '/-moz-binding\s*:/', // Mozilla binding

        // URL patterns that might lead to script execution
        '/url\s*\(\s*[^)]*(?:javascript|data|vbscript)/i',

        // Import directives
        '/@import\s/',

        // Other potentially dangerous properties
        '/-o-link\s*:/',
        '/-o-link-source\s*:/',
        '/-o-replace\s*:/',
        '/call\s*\(/',
        '/position\s*:\s*fixed/i',

        // Common attack vectors
        '/background(?:-image)?\s*:\s*[^;]*(?:url|expression|javascript|data|vbscript)/i',

        // IE-specific expressions
        '/progid\s*:/',
        '/setExpression\s*\(/i',
        '/AlphaImageLoader\s*\(/i',
        '/chrome-extension\s*:/',
        '/file\s*:/',
        '/ftp\s*:/',
        '/gopher\s*:/',
        // Word boundary so that e.g. "grid-template-rows:" is not mistaken for "ws:"
        '/\bws\s*:/',
        '/\bwss\s*:/',
    ];

    /**
     * Lower-case CSS property names whose declarations are silently dropped.
     */
    private static array $dangerous_css_properties = [
        'behavior',
        '-moz-binding',
        'pointer-events',
        'expression',
        'clip-path',
        'mask',
        'filter',
        'backdrop-filter',
    ];

    /**
     * Lower-case names treated as dangerous in SVG context. The list mixes
     * element names and event-handler attribute names; it is consulted both by
     * isDangerousSvgElement() and when stripping attributes from an <svg> root.
     */
    private static array $dangerous_svg_elements = [
        'script',
        'handler',
        'foreignobject',
        'annotation-xml',
        'color-profile',
        'style', // or carefully sanitize if needed
        'onload',
        'onerror',
        'onunload',
        'onabort',
    ];

    /**
     * Filter the contents of a `style` attribute.
     *
     * @param string $css raw declaration list, e.g. "color: red; width: 10px"
     *
     * @return string the safe declarations joined with "; ", or an empty
     *                string when a dangerous pattern is present or nothing
     *                safe remains
     */
    private static function filterCssStyles(string $css): string
    {
        // Remove comments that might hide malicious code
        $css = preg_replace('/\/\*.*?\*\//s', '', $css);
        if ($css === null) {
            // The regex engine failed (e.g. backtrack limit); treat as unsafe.
            return '';
        }

        // Patterns are checked against a lower-cased copy for consistent matching
        $css_lower = strtolower($css);

        foreach (self::$dangerous_css_patterns as $pattern) {
            if (preg_match($pattern, $css_lower)) {
                return ''; // Reject the whole value if a dangerous pattern is found
            }
        }

        $safe_declarations = [];

        foreach (array_filter(array_map('trim', explode(';', $css))) as $declaration) {
            // Split into "property" and "value" at the first colon only
            $parts = array_map('trim', explode(':', $declaration, 2));
            if (count($parts) !== 2) {
                continue;
            }

            [$property, $value] = $parts;
            $property = strtolower($property);

            if (in_array($property, self::$dangerous_css_properties, true)) {
                continue;
            }

            // A url() reference is only kept when it points at http(s)
            if (
                stripos($value, 'url(') !== false
                && !preg_match('/url\s*\(\s*[\'"]?(https?:\/\/[^"\'\)]+)[\'"]?\s*\)/i', $value)
            ) {
                continue;
            }

            $safe_declarations[] = $property . ': ' . $value;
        }

        return implode('; ', $safe_declarations);
    }

    /**
     * Tell whether a tag name is on the dangerous SVG list (case-insensitive).
     */
    private static function isDangerousSvgElement(string $tagName): bool
    {
        return in_array(strtolower($tagName), self::$dangerous_svg_elements, true);
    }

    /**
     * Test a value against one allow-list pattern of a URL-like attribute.
     *
     * The three well-known patterns map to dedicated regexes; any other
     * pattern is quoted, has its `*` turned into `.*`, and is anchored at both
     * ends so that the WHOLE value has to match. (Unanchored, a value such as
     * "javascript:alert(1)//${x}" would satisfy the '${*}' pattern.)
     *
     * @param string $pattern entry from $allowed_attributes, e.g. 'https://*'
     * @param string $value   attribute value to test
     */
    private static function matchesUrlPattern(string $pattern, string $value): bool
    {
        if ($pattern === 'http://*') {
            $regex = '^http\:\/\/[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,}(\/\S*)?$';
        } elseif ($pattern === 'https://*') {
            $regex = '^https\:\/\/[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,}(\/\S*)?$';
        } elseif ($pattern === 'data:image/*') {
            $regex = '^data\:image\/[a-zA-Z0-9\+]+;base64,.*$';
        } else {
            $regex = '^' . str_replace('\*', '.*', preg_quote($pattern, '/')) . '$';
        }

        return preg_match('/' . $regex . '/i', $value) === 1;
    }

    /**
     * Remove event-handler and otherwise dangerous attributes from an <svg>
     * root element; all other attributes of that element are left as they are.
     */
    private static function stripDangerousSvgAttributes(\DOMElement $svg): void
    {
        // Snapshot the names first: removing from the live map while
        // iterating it can skip entries.
        $names = [];
        foreach ($svg->attributes as $attr) {
            $names[] = $attr->name;
        }

        foreach ($names as $name) {
            $lower = strtolower($name);
            if (strncmp($lower, 'on', 2) === 0 || in_array($lower, self::$dangerous_svg_elements, true)) {
                $svg->removeAttribute($name);
            }
        }
    }

    /**
     * Re-add one previously removed attribute if the allow-list permits it.
     *
     * @param \DOMElement $node  element that had all its attributes removed
     * @param string      $name  attribute name as it appeared in the document
     * @param string      $value attribute value as it appeared in the document
     */
    private static function restoreAttributeIfAllowed(\DOMElement $node, string $name, string $value): void
    {
        $attr_name = strtolower($name);

        // Inline styles go through the CSS filter and are dropped when nothing safe remains
        if ($attr_name === 'style') {
            $filtered_css = self::filterCssStyles($value);
            if (!empty($filtered_css)) {
                $node->setAttribute($name, $filtered_css);
            }

            return;
        }

        // data-* and aria-* families are accepted wholesale while their wildcard key is configured
        foreach (['data-', 'aria-'] as $prefix) {
            if (strpos($attr_name, $prefix) === 0 && isset(self::$allowed_attributes[$prefix . '*'])) {
                $node->setAttribute($name, $value);

                return;
            }
        }

        if (!isset(self::$allowed_attributes[$attr_name])) {
            return;
        }

        $allowed_values = self::$allowed_attributes[$attr_name];

        // URL-like attributes: the value has to match one of the wildcard patterns
        if (in_array($attr_name, self::$pattern_attributes, true)) {
            foreach ($allowed_values as $pattern) {
                if (self::matchesUrlPattern($pattern, $value)) {
                    $node->setAttribute($name, $value);
                    break;
                }
            }

            return;
        }

        // Any value permitted, or the value is one of the enumerated literals
        if ($allowed_values === ['*'] || in_array($value, $allowed_values, true)) {
            $node->setAttribute($name, $value);
        }
    }

    /**
     * Sanitize a node and, depth-first, all of its descendants.
     *
     * Children are processed before the node itself; a disallowed element is
     * detached from its parent (taking its subtree with it).
     */
    private static function sanitizeNode(?\DOMNode $node): void
    {
        if (!$node) {
            return;
        }

        // Copy the child list first so removals do not disturb the iteration
        $children = [];
        if ($node->hasChildNodes()) {
            foreach ($node->childNodes as $child) {
                $children[] = $child;
            }
        }

        foreach ($children as $child) {
            self::sanitizeNode($child);
        }

        // Text, comment and other non-element nodes are left untouched
        if (!$node instanceof \DOMElement) {
            return;
        }

        if (!in_array(strtolower($node->tagName), self::$allowed_elements, true)) {
            if ($node->parentNode) {
                $node->parentNode->removeChild($node);
            }

            return;
        }

        // The <svg> root keeps its attributes apart from the dangerous ones
        if ($node->tagName === 'svg') {
            self::stripDangerousSvgAttributes($node);

            return;
        }

        // Every other element: remember the attributes, wipe them, then
        // add back only what the allow-list permits.
        $current_attributes = [];
        $attributes_to_remove = [];
        foreach ($node->attributes as $attr) {
            $current_attributes[$attr->name] = $attr->value;
            $attributes_to_remove[] = $attr->nodeName;
        }

        foreach ($attributes_to_remove as $attr_name) {
            $node->removeAttribute($attr_name);
        }

        foreach ($current_attributes as $name => $value) {
            // Purely numeric names arrive as integer array keys
            self::restoreAttributeIfAllowed($node, (string) $name, $value);
        }
    }

    /**
     * Sanitize an HTML string.
     *
     * When the `ninja.disable_purify_html` config flag is truthy, or the input
     * is at most one byte long, the input is returned with only the `%24` ->
     * `$` replacement applied.
     *
     * @param string $html untrusted markup
     *
     * @return string serialized document as produced by DOMDocument::saveHTML(),
     *                with `%24` turned back into `$`
     *
     * @throws \RuntimeException when an exception occurs while sanitizing
     */
    public static function clean(string $html): string
    {
        // NOTE(review): presumably `%24` is restored so that `$variable`
        // placeholders survive URL-encoding - confirm against the template engine.
        $html = str_replace('%24', '$', $html);

        if (config('ninja.disable_purify_html') || strlen($html) <= 1) {
            return $html;
        }

        // Collect libxml parse errors internally instead of emitting warnings
        libxml_use_internal_errors(true);

        $document = new \DOMDocument();
        // LIBXML_NONET: never fetch external resources while parsing
        @$document->loadHTML(htmlspecialchars_decode(htmlspecialchars($html, ENT_QUOTES, 'UTF-8')), LIBXML_NONET);

        try {
            self::sanitizeNode($document->documentElement);

            // nlog("post purify => {$html}");
            return str_replace('%24', '$', $document->saveHTML());
        } catch (\Exception $e) {
            nlog('Error cleaning HTML: ' . $e->getMessage());

            // Keep the original failure reachable through getPrevious()
            throw new \RuntimeException('HTML sanitization failed', 0, $e);
        } finally {
            libxml_clear_errors();
        }
    }
}
|