diff --git a/app/Helpers/Mail/GmailTransport.php b/app/Helpers/Mail/GmailTransport.php index 662712ae67..8242ac5083 100644 --- a/app/Helpers/Mail/GmailTransport.php +++ b/app/Helpers/Mail/GmailTransport.php @@ -39,19 +39,7 @@ class GmailTransport extends AbstractTransport //ensure utf-8 encoding of subject $subject = $message->getSubject(); - if (!mb_check_encoding($subject, 'UTF-8') || preg_match('/Ã.|â.|Â./', $subject)) { - - $possible_encodings = ['Windows-1252', 'ISO-8859-1', 'ISO-8859-15']; - - foreach ($possible_encodings as $encoding) { - $converted = mb_convert_encoding($subject, 'UTF-8', $encoding); - - if (mb_check_encoding($converted, 'UTF-8') && !preg_match('/Ã.|â.|Â./', $converted)) { - $subject = $converted; - break; - } - } - } + $subject = \App\Utils\Encode::convert($subject); $message->subject($subject); diff --git a/app/Http/Controllers/ClientPortal/NinjaPlanController.php b/app/Http/Controllers/ClientPortal/NinjaPlanController.php index e57cd79fac..cedb38cb14 100644 --- a/app/Http/Controllers/ClientPortal/NinjaPlanController.php +++ b/app/Http/Controllers/ClientPortal/NinjaPlanController.php @@ -159,6 +159,7 @@ class NinjaPlanController extends Controller $account->hosted_company_count = 10; $account->trial_started = now(); $account->trial_plan = 'pro'; + $account->created_at = now(); $account->save(); } diff --git a/app/Utils/Encode.php b/app/Utils/Encode.php new file mode 100644 index 0000000000..79ab9c4acc --- /dev/null +++ b/app/Utils/Encode.php @@ -0,0 +1,225 @@ += 8 && $length % 4 === 0) { + $nullCount = 0; + for ($i = 0; $i < min(100, $length); $i += 4) { + if ($data[$i] === "\x00" && $data[$i + 1] === "\x00" && $data[$i + 2] === "\x00") { + $nullCount++; + } + } + if ($nullCount > 5) { // Likely UTF-32LE + return mb_convert_encoding($data, 'UTF-8', 'UTF-32LE'); + } + } + + // UTF-16 detection (every 2nd byte pattern) + if ($length >= 4 && $length % 2 === 0) { + $nullCount = 0; + for ($i = 0; $i < min(100, $length); $i += 2) { + if 
($data[$i + 1] === "\x00") { + $nullCount++; + } + } + if ($nullCount > 10) { // Likely UTF-16LE + return mb_convert_encoding($data, 'UTF-8', 'UTF-16LE'); + } + + // Check for UTF-16BE + $nullCount = 0; + for ($i = 0; $i < min(100, $length); $i += 2) { + if ($data[$i] === "\x00") { + $nullCount++; + } + } + if ($nullCount > 10) { // Likely UTF-16BE + return mb_convert_encoding($data, 'UTF-8', 'UTF-16BE'); + } + } + + return null; + } + + /** + * Remove a leading BOM (Byte Order Mark); the 4-byte UTF-32 marks are tested before the 2-byte UTF-16 marks + */ + private static function removeBOM(string $data): string + { + // UTF-8 BOM + if (substr($data, 0, 3) === "\xEF\xBB\xBF") { + return substr($data, 3); + } + + // UTF-32 BE BOM + if (substr($data, 0, 4) === "\x00\x00\xFE\xFF") { + return substr($data, 4); + } + + // UTF-32 LE BOM - must be tested before UTF-16 LE, whose FF FE mark is a prefix of this one + if (substr($data, 0, 4) === "\xFF\xFE\x00\x00") { + return substr($data, 4); + } + + // UTF-16 BE BOM + if (substr($data, 0, 2) === "\xFE\xFF") { + return substr($data, 2); + } + + // UTF-16 LE BOM + if (substr($data, 0, 2) === "\xFF\xFE") { + return substr($data, 2); + } + + return $data; + } + + private static function containsWindows1252Bytes(string $data): bool + { + // Report whether any byte that Windows-1252 assigns in the 0x80-0x9F range occurs in $data + $windows1252Bytes = [0x80, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8E, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9E, 0x9F]; + + foreach ($windows1252Bytes as $byte) { + if (str_contains($data, chr($byte))) { + return true; + } + } + return false; + } + + private static function fixCorruptedWindows1252(string $data): string + { + // Map of UTF-8 replacement sequences back to proper characters + $replacements = [ + "\xEF\xBF\xBD" => "\u{2019}", // Most common: right single quote (0x92) - use smart quote + // Add more mappings as needed based on your data + ]; + + return str_replace(array_keys($replacements), array_values($replacements), $data); + } + + private static function
isValidConversion(string $data): bool + { + // Check if conversion was successful: + // 1. Must be valid UTF-8 + // 2. Must NOT contain replacement characters (indicating corruption) + // 3. Additional check for double-encoded replacement + return mb_check_encoding($data, 'UTF-8') && + !str_contains($data, "\xEF\xBF\xBD") && // UTF-8 replacement character bytes + !str_contains($data, '�'); // Double-encoded replacement character + } + +} \ No newline at end of file diff --git a/tests/Unit/EncodeClassComparisonTest.php b/tests/Unit/EncodeClassComparisonTest.php new file mode 100644 index 0000000000..30ae070c71 --- /dev/null +++ b/tests/Unit/EncodeClassComparisonTest.php @@ -0,0 +1,202 @@ +problematicSubject; + + // ✅ WITH Encode class - CORRECT approach + $withEncodeClass = Encode::convert($original); + + // ❌ WITHOUT Encode class - Common mistake (forcing through Windows-1252) + $withoutEncodeClass = mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252'); + + // Results comparison + $this->assertEquals($original, $withEncodeClass, "Encode class should preserve original"); + $this->assertNotEquals($original, $withoutEncodeClass, "Direct conversion should corrupt content"); + + // Emoji preservation + $this->assertStringContainsString('🚀', $withEncodeClass, "Encode class preserves emoji"); + $this->assertStringNotContainsString('🚀', $withoutEncodeClass, "Direct conversion corrupts emoji"); + + // Accented character preservation + $this->assertStringContainsString('impayée', $withEncodeClass, "Encode class preserves accents"); + $this->assertStringNotContainsString('impayée', $withoutEncodeClass, "Direct conversion corrupts accents"); + + // Show the actual corruption + $this->assertStringContainsString('🚀', $withoutEncodeClass, "Should contain corrupted emoji"); + $this->assertStringContainsString('é', $withoutEncodeClass, "Should contain corrupted accent"); + + // UTF-8 validity + $this->assertTrue(mb_check_encoding($withEncodeClass, 'UTF-8'), "Encode class result 
is valid UTF-8"); + $this->assertTrue(mb_check_encoding($withoutEncodeClass, 'UTF-8'), "Corrupted result is still UTF-8 but wrong"); + } + + /** + * Show multiple common failure approaches vs the Encode class + */ + public function testMultipleFailureApproachesVsEncodeClass() + { + $original = $this->problematicSubject; + + // ✅ CORRECT: Using Encode class + $correct = Encode::convert($original); + + // ❌ WRONG: Common developer mistakes + $commonMistakes = [ + 'force_windows1252' => mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252'), + 'force_ascii' => iconv('UTF-8', 'ASCII//IGNORE', $original), + 'manual_replace' => str_replace(['é'], ['e'], $original), // Simplistic approach + 'regex_strip' => preg_replace('/[^\x20-\x7E]/', '?', $original), + 'sanitize_filter' => filter_var($original, FILTER_SANITIZE_STRING, FILTER_FLAG_STRIP_HIGH) ?: 'FILTER_FAILED', + ]; + + // The Encode class should preserve the original + $this->assertEquals($original, $correct); + + // All other approaches should fail + foreach ($commonMistakes as $method => $result) { + $this->assertNotEquals($original, $result, "Method '{$method}' should fail to preserve original"); + + // Most should lose the emoji (except manual_replace which only changes accents) + if ($result !== 'FILTER_FAILED' && $method !== 'manual_replace') { + $this->assertStringNotContainsString('🚀', $result, "Method '{$method}' should lose emoji"); + } + } + } + + /** + * Gmail email header compatibility test + */ + public function testGmailHeaderCompatibility() + { + $original = $this->problematicSubject; + + // ✅ CORRECT: Encode class makes it Gmail-compatible + $encodedSubject = Encode::convert($original); + + // Create a proper email header (RFC 2047 encoding would be done by email library) + $properHeader = "Subject: " . $encodedSubject; + + // ❌ WRONG: Direct use without encoding + $corruptedSubject = mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252'); + $badHeader = "Subject: " . 
$corruptedSubject; + + // Proper header should contain correct characters + $this->assertStringContainsString('🚀', $properHeader); + $this->assertStringContainsString('impayée', $properHeader); + + // Bad header should contain corruption + $this->assertStringNotContainsString('🚀', $badHeader); + $this->assertStringNotContainsString('impayée', $badHeader); + $this->assertStringContainsString('🚀', $badHeader); + $this->assertStringContainsString('é', $badHeader); + } + + /** + * Performance comparison: Encode class vs naive approaches + */ + public function testPerformanceComparison() + { + $original = $this->problematicSubject; + + // Time the Encode class + $start = microtime(true); + $result = Encode::convert($original); + $encodeClassTime = microtime(true) - $start; + + // Time a naive approach + $start = microtime(true); + $naiveResult = mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252'); + $naiveTime = microtime(true) - $start; + + // Both should be fast (under 10ms) + $this->assertLessThan(0.01, $encodeClassTime, "Encode class should be fast"); + $this->assertLessThan(0.01, $naiveTime, "Naive approach should also be fast"); + + // But only Encode class preserves content + $this->assertEquals($original, $result); + $this->assertNotEquals($original, $naiveResult); + } + + /** + * Real-world email scenario test + */ + public function testRealWorldEmailScenario() + { + // Simulate various real-world email subjects that would fail without Encode class + $realWorldSubjects = [ + $this->problematicSubject, + "Café Newsletter 📧 March 2024", + "Paiement reçu ✅ Facture #123", + "Señor García - Cotización €1,500 💼", + "Müller GmbH → Status Update 🎯", + ]; + + foreach ($realWorldSubjects as $subject) { + // ✅ With Encode class + $safe = Encode::convert($subject); + + // ❌ Without Encode class (common mistake) + $unsafe = mb_convert_encoding($subject, 'UTF-8', 'WINDOWS-1252'); + + // Encode class should preserve everything + $this->assertEquals($subject, $safe, 
"Encode class failed for: {$subject}"); + + // Direct conversion should corrupt emojis/accents + $this->assertNotEquals($subject, $unsafe, "Direct conversion should fail for: {$subject}"); + + // Should be valid UTF-8 + $this->assertTrue(mb_check_encoding($safe, 'UTF-8')); + } + } + + /** + * Test what happens with edge cases + */ + public function testEdgeCaseComparison() + { + $edgeCases = [ + // Only emoji + "🚀", + // Only accents + "impayée", + // Mixed complex + "🇫🇷 François & José 💼 €500", + // Empty + "", + // ASCII only + "Invoice 123", + ]; + + foreach ($edgeCases as $testCase) { + $encoded = Encode::convert($testCase); + $naive = mb_convert_encoding($testCase, 'UTF-8', 'WINDOWS-1252'); + + // For ASCII-only content, both should work + if (mb_check_encoding($testCase, 'ASCII')) { + $this->assertEquals($testCase, $encoded); + // Naive might still work for ASCII + } else { + // For Unicode content, only Encode class should work correctly + $this->assertEquals($testCase, $encoded, "Encode class should handle: {$testCase}"); + $this->assertNotEquals($testCase, $naive, "Naive approach should fail: {$testCase}"); + } + } + } +} \ No newline at end of file diff --git a/tests/Unit/EncodeEmailSubjectTest.php b/tests/Unit/EncodeEmailSubjectTest.php new file mode 100644 index 0000000000..54854a11ee --- /dev/null +++ b/tests/Unit/EncodeEmailSubjectTest.php @@ -0,0 +1,285 @@ +assertEquals($originalSubject, $convertedSubject); + $this->assertTrue(mb_check_encoding($convertedSubject, 'UTF-8')); + + // Verify emoji is preserved + $this->assertStringContainsString('🚀', $convertedSubject); + + // Verify accented characters are preserved + $this->assertStringContainsString('impayée', $convertedSubject); + + // Verify the string length is correct (emojis are multi-byte) + $this->assertEquals(mb_strlen($originalSubject, 'UTF-8'), mb_strlen($convertedSubject, 'UTF-8')); + } + + /** + * Test various email subject scenarios with emojis + */ + public function 
testEmojiEmailSubjects() + { + $testCases = [ + // Single emoji + "Invoice Ready 📧" => "Invoice Ready 📧", + + // Multiple emojis + "Payment Received ✅ 🎉" => "Payment Received ✅ 🎉", + + // Emoji at start + "🚨 Urgent: Payment Overdue" => "🚨 Urgent: Payment Overdue", + + // Emoji at end + "Welcome to our service! 🎯" => "Welcome to our service! 🎯", + + // Complex emojis (family, skin tones, etc.) + "Team meeting 👨‍💻👩‍💻" => "Team meeting 👨‍💻👩‍💻", + + // Mixed flags and symbols + "Conference in Paris 🇫🇷 ✈️" => "Conference in Paris 🇫🇷 ✈️" + ]; + + foreach ($testCases as $input => $expected) { + $result = Encode::convert($input); + + $this->assertEquals($expected, $result, "Failed for emoji test: {$input}"); + $this->assertTrue(mb_check_encoding($result, 'UTF-8'), "Not valid UTF-8: {$input}"); + } + } + + /** + * Test accented characters common in email subjects + */ + public function testAccentedCharacters() + { + $testCases = [ + // French + "Café résumé naïve façade" => "Café résumé naïve façade", + + // Spanish + "Niño piñata mañana" => "Niño piñata mañana", + + // German + "Größe Weiß Mädchen" => "Größe Weiß Mädchen", + + // Portuguese + "Coração São Paulo" => "Coração São Paulo", + + // Mixed languages + "Café & Niño résumé" => "Café & Niño résumé" + ]; + + foreach ($testCases as $input => $expected) { + $result = Encode::convert($input); + + $this->assertEquals($expected, $result, "Failed for accent test: {$input}"); + $this->assertTrue(mb_check_encoding($result, 'UTF-8'), "Not valid UTF-8: {$input}"); + } + } + + /** + * Test special symbols commonly used in email subjects + */ + public function testSpecialSymbols() + { + $testCases = [ + // Currency symbols + "Invoice €50.00 £25.99 ¥1000" => "Invoice €50.00 £25.99 ¥1000", + + // Smart quotes and dashes + "Company's \"quoted\" text—dash…ellipsis" => "Company's \"quoted\" text—dash…ellipsis", + + // Copyright and trademark + "Product™ Service© Brand®" => "Product™ Service© Brand®", + + // Mathematical symbols + 
"Discount ≥ 20% ± 5%" => "Discount ≥ 20% ± 5%", + + // Arrows and symbols + "Process → Complete ✓" => "Process → Complete ✓" + ]; + + foreach ($testCases as $input => $expected) { + $result = Encode::convert($input); + + $this->assertEquals($expected, $result, "Failed for symbol test: {$input}"); + $this->assertTrue(mb_check_encoding($result, 'UTF-8'), "Not valid UTF-8: {$input}"); + } + } + + /** + * Test email subjects with mixed content (the most realistic scenario) + */ + public function testMixedContentEmailSubjects() + { + $testCases = [ + // User's exact example + "Rappel facture impayée (\$invoice) 🚀" => "Rappel facture impayée (\$invoice) 🚀", + + // Invoice with currency and emoji + "Facture #123 - €150.00 💰" => "Facture #123 - €150.00 💰", + + // Reminder with accents and emoji + "Relance: paiement en retard 📅 ⚠️" => "Relance: paiement en retard 📅 ⚠️", + + // Welcome message + "Bienvenue chez Café ☕ 🥐" => "Bienvenue chez Café ☕ 🥐", + + // Complex business scenario + "Réunion équipe → 15h30 📊 🎯" => "Réunion équipe → 15h30 📊 🎯" + ]; + + foreach ($testCases as $input => $expected) { + $result = Encode::convert($input); + + $this->assertEquals($expected, $result, "Failed for mixed content test: {$input}"); + $this->assertTrue(mb_check_encoding($result, 'UTF-8'), "Not valid UTF-8: {$input}"); + + // Verify character count is preserved (important for emojis) + $this->assertEquals( + mb_strlen($expected, 'UTF-8'), + mb_strlen($result, 'UTF-8'), + "Character count mismatch for: {$input}" + ); + } + } + + /** + * Test corrupted Windows-1252 content that needs conversion + */ + public function testCorruptedEncodingConversion() + { + // Simulate content that was incorrectly encoded as Windows-1252 + $windows1252Input = mb_convert_encoding("Café résumé", 'WINDOWS-1252', 'UTF-8'); + $result = Encode::convert($windows1252Input); + + $this->assertEquals("Café résumé", $result); + $this->assertTrue(mb_check_encoding($result, 'UTF-8')); + } + + /** + * Test Gmail-specific 
email subject requirements + */ + public function testGmailCompatibility() + { + $testCases = [ + // Long subject with emojis (Gmail truncates at ~70 chars in preview) + "This is a long email subject with emojis 🚀 that might get truncated by Gmail 📧", + + // Subject with only emojis + "🚀📧🎉✅⚠️💰", + + // Subject with special characters Gmail handles + "Re: Fw: [URGENT] Company's \"Project\" Status—Update ✓", + + // International content + "国际业务 🌍 Négociation €500K 💼" + ]; + + foreach ($testCases as $input) { + $result = Encode::convert($input); + + // Should be valid UTF-8 (Gmail requirement) + $this->assertTrue(mb_check_encoding($result, 'UTF-8'), "Gmail compatibility failed for: {$input}"); + + // Should not contain replacement characters + $this->assertStringNotContainsString("\xEF\xBF\xBD", $result, "Contains replacement characters: {$input}"); + $this->assertStringNotContainsString('�', $result, "Contains double-encoded replacement: {$input}"); + + // Should preserve original content for valid UTF-8 + $this->assertEquals($input, $result, "Content changed unnecessarily: {$input}"); + } + } + + /** + * Test edge cases that might break email clients + */ + public function testEmailClientEdgeCases() + { + $testCases = [ + // Empty string + "" => "", + + // Only spaces + " " => " ", + + // Only special characters + "€£¥" => "€£¥", + + // Only emojis + "🚀🎉📧" => "🚀🎉📧", + + // Mixed spaces and emojis + " 🚀 📧 🎉 " => " 🚀 📧 🎉 ", + + // Newlines and tabs (should be preserved) + "Line 1\nLine 2\tTabbed" => "Line 1\nLine 2\tTabbed" + ]; + + foreach ($testCases as $input => $expected) { + $result = Encode::convert($input); + + $this->assertEquals($expected, $result, "Edge case failed: " . var_export($input, true)); + $this->assertTrue(mb_check_encoding($result, 'UTF-8'), "Not valid UTF-8: " . 
var_export($input, true)); + } + } + + /** + * Test performance with typical email subject lengths + */ + public function testPerformanceWithTypicalSubjects() + { + $baseSubject = "Rappel facture impayée (\$invoice) 🚀"; + + // Test with different subject lengths + $subjects = [ + $baseSubject, // ~40 chars + str_repeat($baseSubject . " ", 2), // ~80 chars + str_repeat($baseSubject . " ", 5), // ~200 chars + ]; + + foreach ($subjects as $subject) { + $startTime = microtime(true); + $result = Encode::convert($subject); + $endTime = microtime(true); + + $executionTime = ($endTime - $startTime) * 1000; // Convert to milliseconds + + // Should complete quickly (under 10ms for email subjects) + $this->assertLessThan(10, $executionTime, "Too slow for subject: " . strlen($subject) . " chars"); + $this->assertTrue(mb_check_encoding($result, 'UTF-8')); + } + } + + /** + * Test that the method is safe to call multiple times + */ + public function testIdempotency() + { + $original = "Rappel facture impayée (\$invoice) 🚀"; + + $first = Encode::convert($original); + $second = Encode::convert($first); + $third = Encode::convert($second); + + // Should be identical after multiple conversions + $this->assertEquals($original, $first); + $this->assertEquals($first, $second); + $this->assertEquals($second, $third); + } +} \ No newline at end of file diff --git a/tests/Unit/EncodeWithoutClassFailureTest.php b/tests/Unit/EncodeWithoutClassFailureTest.php new file mode 100644 index 0000000000..4544345f19 --- /dev/null +++ b/tests/Unit/EncodeWithoutClassFailureTest.php @@ -0,0 +1,334 @@ +problematicSubject; + + // This is what would happen without the Encode class - forcing conversion through Windows-1252 + $corrupted = mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252'); + + // Should NOT be equal to original (emoji gets corrupted) + $this->assertNotEquals($original, $corrupted); + + // Emoji should be lost/corrupted + $this->assertStringNotContainsString('🚀', $corrupted); + + // 
Should contain corruption artifacts + $this->assertTrue( + str_contains($corrupted, "\xEF\xBF\xBD") || // Replacement character + str_contains($corrupted, '?') || // Question mark replacement + str_contains($corrupted, 'é') || // Double-encoded é + str_contains($corrupted, '🚀') || // Corrupted emoji + strlen($corrupted) < strlen($original), // Characters lost + "Expected emoji corruption but content seems intact. Original: {$original}, Corrupted: {$corrupted}" + ); + } + + /** + * Test that naive iconv usage fails with emojis + */ + public function testIconvFailsWithEmojis() + { + $original = $this->problematicSubject; + + // Common mistake: trying to convert UTF-8 through ISO-8859-1 + $result = iconv('ISO-8859-1', 'UTF-8//IGNORE', $original); + + // Should fail or corrupt the content + $this->assertNotEquals($original, $result); + + // Should lose the emoji + $this->assertStringNotContainsString('🚀', $result); + } + + /** + * Test that forcing through ASCII destroys international characters + */ + public function testAsciiConversionDestroysInternationalChars() + { + $original = $this->problematicSubject; + + // Naive approach: force to ASCII + $asciiAttempt = iconv('UTF-8', 'ASCII//IGNORE', $original); + + // Should lose both emoji and accented characters + $this->assertNotEquals($original, $asciiAttempt); + $this->assertStringNotContainsString('🚀', $asciiAttempt); + $this->assertStringNotContainsString('impayée', $asciiAttempt); + + // Should contain "impaye" instead (accent completely removed) + $this->assertStringContainsString('impaye', $asciiAttempt); + } + + /** + * Test that manual character replacement approach is inadequate + */ + public function testManualReplacementInadequate() + { + $original = $this->problematicSubject; + + // Naive manual approach that many developers try + $manualAttempt = str_replace([ + 'é', + 'à', + 'ç', + 'ù' + ], [ + 'e', + 'a', + 'c', + 'u' + ], $original); + + // Still has the emoji problem - can't handle all Unicode + 
$this->assertNotEquals($original, $manualAttempt); + + // Manual replacement changes the é in "impayée" to "e" + $this->assertStringNotContainsString('impayée', $manualAttempt); + $this->assertStringContainsString('impayee', $manualAttempt); + + // Emoji remains but manual approach doesn't solve encoding issues + $this->assertStringContainsString('🚀', $manualAttempt); + } + + /** + * Test simulated database storage/retrieval corruption + */ + public function testDatabaseStorageCorruption() + { + $original = $this->problematicSubject; + + // Simulate what happens when storing in Latin1 database column + $latin1Encoded = mb_convert_encoding($original, 'ISO-8859-1', 'UTF-8'); + $retrievedBack = mb_convert_encoding($latin1Encoded, 'UTF-8', 'ISO-8859-1'); + + // Should be corrupted + $this->assertNotEquals($original, $retrievedBack); + + // Emoji definitely lost + $this->assertStringNotContainsString('🚀', $retrievedBack); + } + + /** + * Test simulated file read/write corruption + */ + public function testFileHandlingCorruption() + { + $original = $this->problematicSubject; + + // Create a temporary file and write with wrong encoding assumption + $tempFile = tempnam(sys_get_temp_dir(), 'encoding_fail_test_'); + + // Simulate writing as Windows-1252 + $windows1252Content = mb_convert_encoding($original, 'WINDOWS-1252', 'UTF-8'); + file_put_contents($tempFile, $windows1252Content); + + // Now read it back assuming UTF-8 (common mistake) + $corruptedRead = file_get_contents($tempFile); + + // Should be corrupted + $this->assertNotEquals($original, $corruptedRead); + + // Should not be valid UTF-8 + $this->assertFalse(mb_check_encoding($corruptedRead, 'UTF-8')); + + // Clean up + unlink($tempFile); + } + + /** + * Test what happens with common "sanitization" approaches + */ + public function testCommonSanitizationBreaksContent() + { + $original = $this->problematicSubject; + + // Common "sanitization" that developers might try + $sanitized = filter_var($original, 
FILTER_SANITIZE_STRING, FILTER_FLAG_STRIP_HIGH); + + if ($sanitized !== false) { + // Should remove high-bit characters (including emoji and accents) + $this->assertNotEquals($original, $sanitized); + $this->assertStringNotContainsString('🚀', $sanitized); + $this->assertStringNotContainsString('impayée', $sanitized); + } else { + // Filter might fail entirely + $this->assertFalse($sanitized); + } + } + + /** + * Test naive regular expression replacement + */ + public function testRegexReplacementBreaksUnicode() + { + $original = $this->problematicSubject; + + // Naive attempt to "clean" the string with regex + $regexCleaned = preg_replace('/[^\x20-\x7E]/', '?', $original); + + // Should replace all non-ASCII characters with ? + $this->assertNotEquals($original, $regexCleaned); + $this->assertStringNotContainsString('🚀', $regexCleaned); + $this->assertStringNotContainsString('impayée', $regexCleaned); + + // Should contain question marks + $this->assertStringContainsString('?', $regexCleaned); + } + + /** + * Test double-encoding problems + */ + public function testDoubleEncodingProblems() + { + $original = $this->problematicSubject; + + // Simulate double-encoding (common web application bug) + $firstEncoding = mb_convert_encoding($original, 'ISO-8859-1', 'UTF-8'); + $doubleEncoded = mb_convert_encoding($firstEncoding, 'UTF-8', 'ISO-8859-1'); + + // Should be different and corrupted + $this->assertNotEquals($original, $doubleEncoded); + + // Common double-encoding artifacts + $this->assertTrue( + str_contains($doubleEncoded, 'é') || // é becomes é + str_contains($doubleEncoded, 'â€') || // Other artifacts + !str_contains($doubleEncoded, '🚀'), // Emoji lost + "Expected double-encoding artifacts but got: " . $doubleEncoded + ); + } + + /** + * Test CSV export/import corruption + */ + public function testCsvCorruption() + { + $original = $this->problematicSubject; + + // Simulate CSV export without proper encoding + $csvLine = '"' . $original . 
'"'; + + // Write to temp file with wrong encoding + $tempFile = tempnam(sys_get_temp_dir(), 'csv_fail_test_'); + file_put_contents($tempFile, $csvLine, LOCK_EX); + + // Read back with wrong encoding assumption + $contents = file_get_contents($tempFile); + + // Parse CSV (simplified) + $parsed = str_replace('"', '', $contents); + + // If the file system or CSV handling messed up encoding + if (!mb_check_encoding($parsed, 'UTF-8')) { + $this->assertNotEquals($original, $parsed); + } else { + // Even if it's valid UTF-8, it might still be different due to CSV processing + $this->assertTrue(true, "CSV processing completed"); + } + + // Clean up + unlink($tempFile); + } + + /** + * Test JSON encoding/decoding issues + */ + public function testJsonEncodingIssues() + { + $original = $this->problematicSubject; + + // Create array with the subject + $data = ['subject' => $original]; + + // Encode to JSON + $json = json_encode($data); + $this->assertNotFalse($json, "JSON encoding should work with UTF-8"); + + // Decode back + $decoded = json_decode($json, true); + $this->assertNotNull($decoded, "JSON decoding should work"); + + // This should actually work correctly with modern PHP + // But let's test what happens if someone tries to "fix" it + $brokenAttempt = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_IGNORE); + $brokenDecoded = json_decode($brokenAttempt, true); + + // The point is that without proper understanding, people might use wrong flags + // and lose data integrity + if ($brokenDecoded !== null && isset($brokenDecoded['subject'])) { + // In some PHP versions or configurations, this might alter the data + $this->assertTrue( + $decoded['subject'] === $original, + "Proper JSON handling preserves Unicode" + ); + } + } + + /** + * Test email header encoding issues + */ + public function testEmailHeaderEncodingIssues() + { + $original = $this->problematicSubject; + + // Naive attempt to create email header without proper encoding + $naiveHeader = 
"Subject: " . $original; + + // Email headers with non-ASCII characters need RFC 2047 encoding + // Without proper encoding, the subject would be corrupted by email servers + + // Simulate what an email server might do with unencoded headers + $serverProcessed = preg_replace('/[^\x20-\x7E]/', '?', $naiveHeader); + + $this->assertNotEquals($naiveHeader, $serverProcessed); + $this->assertStringNotContainsString('🚀', $serverProcessed); + $this->assertStringNotContainsString('impayée', $serverProcessed); + + // Should contain replacement characters + $this->assertStringContainsString('?', $serverProcessed); + } + + /** + * Summary test showing multiple failure modes + */ + public function testMultipleFailureModes() + { + $original = $this->problematicSubject; + $failures = []; + + // Collect all the ways it can fail + $attempts = [ + 'windows1252' => mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252'), + 'ascii' => iconv('UTF-8', 'ASCII//IGNORE', $original), + 'latin1_roundtrip' => mb_convert_encoding(mb_convert_encoding($original, 'ISO-8859-1', 'UTF-8'), 'UTF-8', 'ISO-8859-1'), + 'regex_strip' => preg_replace('/[^\x20-\x7E]/', '', $original), + 'filter_sanitize' => filter_var($original, FILTER_SANITIZE_STRING, FILTER_FLAG_STRIP_HIGH), + ]; + + foreach ($attempts as $method => $result) { + if ($result !== false && $result !== $original) { + $failures[$method] = $result; + } + } + + // All methods should fail to preserve the original + $this->assertGreaterThan(0, count($failures), "At least some methods should fail"); + + // None of the failed attempts should contain the emoji + foreach ($failures as $method => $result) { + $this->assertStringNotContainsString('🚀', $result, "Method {$method} should lose emoji"); + } + } +} \ No newline at end of file