= 8 && $length % 4 === 0) {
            $nullCount = 0;
            for ($i = 0; $i < min(100, $length); $i += 4) {
                if ($data[$i] === "\x00" && $data[$i + 1] === "\x00" && $data[$i + 2] === "\x00") {
                    $nullCount++;
                }
            }
            if ($nullCount > 5) {
                // Likely UTF-32LE
                // NOTE(review): three leading NUL bytes per 4-byte unit is the layout
                // UTF-32BE uses for code points below U+0100 (UTF-32LE puts the NULs
                // last) - confirm that 'UTF-32LE' is really the intended source encoding.
                return mb_convert_encoding($data, 'UTF-8', 'UTF-32LE');
            }
        }

        // UTF-16 detection (every 2nd byte pattern)
        if ($length >= 4 && $length % 2 === 0) {
            $nullCount = 0;
            for ($i = 0; $i < min(100, $length); $i += 2) {
                if ($data[$i + 1] === "\x00") {
                    $nullCount++;
                }
            }
            if ($nullCount > 10) {
                // Likely UTF-16LE
                return mb_convert_encoding($data, 'UTF-8', 'UTF-16LE');
            }

            // Check for UTF-16BE
            $nullCount = 0;
            for ($i = 0; $i < min(100, $length); $i += 2) {
                if ($data[$i] === "\x00") {
                    $nullCount++;
                }
            }
            if ($nullCount > 10) {
                // Likely UTF-16BE
                return mb_convert_encoding($data, 'UTF-8', 'UTF-16BE');
            }
        }

        return null;
    }

    /**
     * Remove a BOM (Byte Order Mark) from the beginning of a string.
     *
     * At most one mark is stripped. The four-byte UTF-32 marks are tested before
     * the two-byte UTF-16 marks because the UTF-32 LE BOM (FF FE 00 00) starts
     * with the UTF-16 LE BOM (FF FE): testing the short mark first would leave
     * two stray NUL bytes at the front of UTF-32 LE input.
     *
     * Trade-off: UTF-16 LE text whose first character after the BOM is U+0000 is
     * byte-identical to a UTF-32 LE BOM and is treated as such.
     *
     * @param string $data Raw bytes that may start with a BOM.
     *
     * @return string The input without its leading BOM, or unchanged if none is present.
     */
    private static function removeBOM(string $data): string
    {
        // Ordered so that no mark is shadowed by a shorter mark that is its prefix.
        $marks = [
            "\x00\x00\xFE\xFF", // UTF-32 BE
            "\xFF\xFE\x00\x00", // UTF-32 LE
            "\xEF\xBB\xBF",     // UTF-8
            "\xFE\xFF",         // UTF-16 BE
            "\xFF\xFE",         // UTF-16 LE
        ];

        foreach ($marks as $mark) {
            if (str_starts_with($data, $mark)) {
                return substr($data, strlen($mark));
            }
        }

        return $data;
    }

    /**
     * Report whether the raw bytes contain any value from the 0x80-0x9F range
     * that Windows-1252 assigns to a character.
     *
     * 0x81, 0x8D, 0x8F, 0x90 and 0x9D are not searched for.
     *
     * NOTE(review): the scan is purely byte-wise, so valid UTF-8 text also
     * matches whenever a multi-byte sequence has a continuation byte in this
     * range (e.g. U+2013 is E2 80 93) - presumably callers only use this on data
     * already known not to be UTF-8; confirm.
     *
     * @param string $data Raw bytes to inspect.
     *
     * @return bool True if at least one of the listed bytes occurs in $data.
     */
    private static function containsWindows1252Bytes(string $data): bool
    {
        $windows1252Bytes = "\x80\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8E"
            . "\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9E\x9F";

        // strcspn() returns the length of the leading run that contains none of
        // the listed bytes; a run shorter than the input means one was found.
        // This is a single pass instead of one strpos() scan per candidate byte.
        return strcspn($data, $windows1252Bytes) < strlen($data);
    }

    /**
     * Replace UTF-8 replacement characters with the character they most likely
     * stood for.
     *
     * This is a lossy heuristic: every U+FFFD in the input becomes U+2019 (right
     * single quotation mark), whatever byte was originally lost.
     *
     * @param string $data UTF-8 text that may contain U+FFFD.
     *
     * @return string The text with each mapped sequence substituted.
     */
    private static function fixCorruptedWindows1252(string $data): string
    {
        // Map of UTF-8 replacement sequences back to proper characters
        $replacements = [
            "\xEF\xBF\xBD" => "\u{2019}", // Most common: right single quote (0x92) - use smart quote
            // Add more mappings as needed based on your data
        ];

        return strtr($data, $replacements);
    }

    /**
     * Check whether a conversion produced clean UTF-8.
     *
     * The data is accepted only if all of the following hold:
     *  1. it is well-formed UTF-8;
     *  2. it contains no U+FFFD replacement character (EF BF BD);
     *  3. it contains no double-encoded U+FFFD, i.e. the bytes EF BF BD read as
     *     single-byte characters and encoded to UTF-8 again (C3 AF C2 BF C2 BD).
     *
     * Both markers are written as byte escapes so the checks do not depend on
     * the encoding this source file is saved in.
     *
     * @param string $data Converted text to validate.
     *
     * @return bool True if the data passes all three checks.
     */
    private static function isValidConversion(string $data): bool
    {
        if (!mb_check_encoding($data, 'UTF-8')) {
            return false;
        }

        // UTF-8 replacement character bytes
        if (str_contains($data, "\xEF\xBF\xBD")) {
            return false;
        }

        // Double-encoded replacement character
        return !str_contains($data, "\xC3\xAF\xC2\xBF\xC2\xBD");
    }
}