problematicSubject;

        // This is what would happen without the Encode class - forcing conversion through Windows-1252
        $corrupted = mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252');

        // Should NOT be equal to original (emoji gets corrupted)
        $this->assertNotEquals($original, $corrupted);

        // Emoji should be lost/corrupted
        $this->assertStringNotContainsString('🚀', $corrupted);

        // Should contain corruption artifacts
        $this->assertTrue(
            str_contains($corrupted, "\xEF\xBF\xBD") || // Replacement character
            str_contains($corrupted, '?') || // Question mark replacement
            str_contains($corrupted, 'é') || // Double-encoded é
            str_contains($corrupted, '🚀') || // Corrupted emoji
            strlen($corrupted) < strlen($original), // Characters lost
            "Expected emoji corruption but content seems intact. Original: {$original}, Corrupted: {$corrupted}"
        );
    }

    /**
     * Test that naive iconv usage fails with emojis.
     *
     * The subject is handed to iconv() declared as ISO-8859-1, so the
     * multi-byte sequences are converted byte by byte instead of as characters.
     */
    public function testIconvFailsWithEmojis()
    {
        $original = $this->problematicSubject;

        // Common mistake: declaring UTF-8 input as ISO-8859-1 when converting to UTF-8
        $result = iconv('ISO-8859-1', 'UTF-8//IGNORE', $original);

        // iconv() returns false on failure; fail with a readable message instead of a TypeError below
        $this->assertIsString($result, 'iconv() failed to convert the subject');

        // The content must have been altered
        $this->assertNotSame($original, $result);

        // The emoji must no longer be present
        $this->assertStringNotContainsString('🚀', $result);
    }

    /**
     * Test that forcing through ASCII destroys international characters.
     */
    public function testAsciiConversionDestroysInternationalChars()
    {
        $original = $this->problematicSubject;

        // Naive approach: force to ASCII, discarding whatever cannot be represented
        $asciiAttempt = iconv('UTF-8', 'ASCII//IGNORE', $original);

        // iconv() returns false on failure; fail with a readable message instead of a TypeError below
        $this->assertIsString($asciiAttempt, 'iconv() failed to convert the subject to ASCII');

        // Both the emoji and the accented word are gone
        $this->assertNotSame($original, $asciiAttempt);
        $this->assertStringNotContainsString('🚀', $asciiAttempt);
        $this->assertStringNotContainsString('impayée', $asciiAttempt);

        // The "é" is dropped outright (not transliterated to "e"), leaving "impaye"
        $this->assertStringContainsString('impaye', $asciiAttempt);
    }

    /**
     * Test that manual character replacement approach is inadequate.
     */
    public function testManualReplacementInadequate()
    {
        $original = $this->problematicSubject;

        // Naive manual approach that many developers try: a hand-written accent map
        $accented = ['é', 'à', 'ç', 'ù'];
        $plain = ['e', 'a', 'c', 'u'];
        $manualAttempt = str_replace($accented, $plain, $original);

        // The text is altered...
        $this->assertNotSame($original, $manualAttempt);

        // ...because the "é" in "impayée" has been replaced by "e"
        $this->assertStringNotContainsString('impayée', $manualAttempt);
        $this->assertStringContainsString('impayee', $manualAttempt);

        // The emoji is untouched: a fixed map cannot cover all of Unicode
        $this->assertStringContainsString('🚀', $manualAttempt);
    }

    /**
     * Test simulated database storage/retrieval corruption.
     */
    public function testDatabaseStorageCorruption()
    {
        $original = $this->problematicSubject;

        // Simulate storing in a Latin1 column, then reading the value back as UTF-8
        $latin1Encoded = mb_convert_encoding($original, 'ISO-8859-1', 'UTF-8');
        $retrievedBack = mb_convert_encoding($latin1Encoded, 'UTF-8', 'ISO-8859-1');

        // The round-trip is lossy
        $this->assertNotSame($original, $retrievedBack);

        // The emoji has no ISO-8859-1 representation, so it cannot survive
        $this->assertStringNotContainsString('🚀', $retrievedBack);
    }

    /**
     * Test simulated file read/write corruption.
     */
    public function testFileHandlingCorruption()
    {
        $original = $this->problematicSubject;

        // Create a temporary file to write with the wrong encoding assumption
        $tempFile = tempnam(sys_get_temp_dir(), 'encoding_fail_test_');
        $this->assertIsString($tempFile, 'Could not create a temporary file');

        try {
            // Simulate writing as Windows-1252
            $windows1252Content = mb_convert_encoding($original, 'WINDOWS-1252', 'UTF-8');
            $this->assertNotFalse(
                file_put_contents($tempFile, $windows1252Content),
                'Could not write the temporary file'
            );

            // Now read it back assuming UTF-8 (common mistake)
            $corruptedRead = file_get_contents($tempFile);
            $this->assertIsString($corruptedRead, 'Could not read the temporary file');

            // Should be corrupted
            $this->assertNotSame($original, $corruptedRead);

            // Should not be valid UTF-8
            $this->assertFalse(mb_check_encoding($corruptedRead, 'UTF-8'));
        } finally {
            // Clean up even when an assertion above fails
            if (is_file($tempFile)) {
                unlink($tempFile);
            }
        }
    }

    /**
     * Test what happens with common "sanitization" approaches.
     */
    public function testCommonSanitizationBreaksContent()
    {
        $original = $this->problematicSubject;

        // Common "sanitization": strip every byte above 127.
        // FILTER_UNSAFE_RAW is used because FILTER_SANITIZE_STRING is deprecated as of PHP 8.1;
        // FILTER_FLAG_STRIP_HIGH is the part that damages the text.
        $sanitized = filter_var($original, FILTER_UNSAFE_RAW, FILTER_FLAG_STRIP_HIGH);

        $this->assertIsString($sanitized, 'filter_var() failed to process the subject');

        // High-bit characters (including emoji and accents) are removed
        $this->assertNotSame($original, $sanitized);
        $this->assertStringNotContainsString('🚀', $sanitized);
        $this->assertStringNotContainsString('impayée', $sanitized);
    }

    /**
     * Test naive regular expression replacement.
     */
    public function testRegexReplacementBreaksUnicode()
    {
        $original = $this->problematicSubject;

        // Naive attempt to "clean" the string: anything outside printable ASCII becomes "?"
        $regexCleaned = preg_replace('/[^\x20-\x7E]/', '?', $original);

        // preg_replace() returns null on error
        $this->assertIsString($regexCleaned, 'preg_replace() failed on the subject');

        // All non-ASCII characters are gone
        $this->assertNotSame($original, $regexCleaned);
        $this->assertStringNotContainsString('🚀', $regexCleaned);
        $this->assertStringNotContainsString('impayée', $regexCleaned);

        // Question marks were put in their place
        $this->assertStringContainsString('?', $regexCleaned);
    }

    /**
     * Test double-encoding problems.
     *
     * Double encoding happens when text that is already UTF-8 is treated as a
     * single-byte charset and converted to UTF-8 a second time.
     */
    public function testDoubleEncodingProblems()
    {
        $original = $this->problematicSubject;

        // Simulate double-encoding (common web application bug):
        // the UTF-8 bytes are declared as ISO-8859-1 and encoded to UTF-8 again
        $doubleEncoded = mb_convert_encoding($original, 'UTF-8', 'ISO-8859-1');

        // Should be different and corrupted
        $this->assertNotSame($original, $doubleEncoded);

        // The emoji is no longer readable as an emoji
        $this->assertStringNotContainsString('🚀', $doubleEncoded);

        // The accented letter shows up as its typical double-encoded artifact
        $doubleEncodedAccent = mb_convert_encoding('é', 'UTF-8', 'ISO-8859-1');
        $this->assertStringContainsString(
            $doubleEncodedAccent,
            $doubleEncoded,
            "Expected double-encoding artifacts but got: " . $doubleEncoded
        );

        // Undoing the extra encoding step yields the original bytes again
        $this->assertSame($original, mb_convert_encoding($doubleEncoded, 'ISO-8859-1', 'UTF-8'));
    }

    /**
     * Test CSV export/import corruption.
     */
    public function testCsvCorruption()
    {
        $original = $this->problematicSubject;

        // Simulate CSV export without proper encoding
        $csvLine = '"' . $original . '"';

        $tempFile = tempnam(sys_get_temp_dir(), 'csv_fail_test_');
        $this->assertIsString($tempFile, 'Could not create a temporary file');

        try {
            $this->assertNotFalse(
                file_put_contents($tempFile, $csvLine, LOCK_EX),
                'Could not write the temporary file'
            );

            $contents = file_get_contents($tempFile);
            $this->assertIsString($contents, 'Could not read the temporary file');

            // Parse CSV (simplified)
            $parsed = str_replace('"', '', $contents);

            // NOTE(review): the bytes are written and read back without any transcoding,
            // so the first branch is only reached if the subject itself is not valid UTF-8 - confirm intent.
            if (!mb_check_encoding($parsed, 'UTF-8')) {
                $this->assertNotEquals($original, $parsed);
            } else {
                // Even if it's valid UTF-8, it might still be different due to CSV processing
                $this->assertTrue(true, "CSV processing completed");
            }
        } finally {
            // Clean up even when an assertion above fails
            if (is_file($tempFile)) {
                unlink($tempFile);
            }
        }
    }

    /**
     * Test JSON encoding/decoding issues.
     */
    public function testJsonEncodingIssues()
    {
        $original = $this->problematicSubject;

        // Create array with the subject
        $data = ['subject' => $original];

        // Encode to JSON
        $json = json_encode($data);
        $this->assertNotFalse($json, "JSON encoding should work with UTF-8");

        // Decode back
        $decoded = json_decode($json, true);
        $this->assertNotNull($decoded, "JSON decoding should work");

        // Plain JSON handling round-trips the subject unchanged
        $this->assertSame($original, $decoded['subject'], "Proper JSON handling preserves Unicode");

        // Flags people reach for when trying to "fix" JSON output
        $flaggedJson = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_IGNORE);
        $this->assertNotFalse($flaggedJson, "JSON encoding with extra flags should work with UTF-8");

        $flaggedDecoded = json_decode($flaggedJson, true);
        $this->assertNotNull($flaggedDecoded, "JSON decoding should work");

        // The first encode above succeeded, so the subject is valid UTF-8 and there is
        // nothing for JSON_INVALID_UTF8_IGNORE to drop: the subject must be intact here too
        $this->assertSame($original, $flaggedDecoded['subject'], "Proper JSON handling preserves Unicode");
    }

    /**
     * Test email header encoding issues.
     */
    public function testEmailHeaderEncodingIssues()
    {
        $original = $this->problematicSubject;

        // Naive attempt to create email header without proper encoding
        $naiveHeader = "Subject: " . $original;

        // Email headers with non-ASCII characters need RFC 2047 encoding.
        // Simulate a server that replaces everything outside printable ASCII in an unencoded header.
        $serverProcessed = preg_replace('/[^\x20-\x7E]/', '?', $naiveHeader);

        // preg_replace() returns null on error
        $this->assertIsString($serverProcessed, 'preg_replace() failed on the header');

        $this->assertNotSame($naiveHeader, $serverProcessed);
        $this->assertStringNotContainsString('🚀', $serverProcessed);
        $this->assertStringNotContainsString('impayée', $serverProcessed);

        // Should contain replacement characters
        $this->assertStringContainsString('?', $serverProcessed);
    }

    /**
     * Summary test showing multiple failure modes.
     */
    public function testMultipleFailureModes()
    {
        $original = $this->problematicSubject;

        // Collect all the ways it can fail
        $attempts = [
            'windows1252' => mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252'),
            'ascii' => iconv('UTF-8', 'ASCII//IGNORE', $original),
            'latin1_roundtrip' => mb_convert_encoding(
                mb_convert_encoding($original, 'ISO-8859-1', 'UTF-8'),
                'UTF-8',
                'ISO-8859-1'
            ),
            'regex_strip' => preg_replace('/[^\x20-\x7E]/', '', $original),
            // FILTER_UNSAFE_RAW instead of FILTER_SANITIZE_STRING, which is deprecated as of PHP 8.1
            'filter_sanitize' => filter_var($original, FILTER_UNSAFE_RAW, FILTER_FLAG_STRIP_HIGH),
        ];

        // Keep only attempts that produced a string different from the original.
        // false/null results (conversion errors) are skipped so the string assertions below stay valid.
        $failures = array_filter(
            $attempts,
            static fn ($result): bool => is_string($result) && $result !== $original
        );

        // At least one method must have failed to preserve the original
        $this->assertGreaterThan(0, count($failures), "At least some methods should fail");

        // None of the failed attempts should contain the emoji
        foreach ($failures as $method => $result) {
            $this->assertStringNotContainsString('🚀', $result, "Method {$method} should lose emoji");
        }
    }
}