Additional tests

This commit is contained in:
David Bomba 2025-06-05 10:49:38 +10:00
parent 651ec15e22
commit b13258be0e
6 changed files with 1048 additions and 13 deletions

View File

@ -39,19 +39,7 @@ class GmailTransport extends AbstractTransport
//ensure utf-8 encoding of subject
$subject = $message->getSubject();
if (!mb_check_encoding($subject, 'UTF-8') || preg_match('/Ã.|â.|Â./', $subject)) {
$possible_encodings = ['Windows-1252', 'ISO-8859-1', 'ISO-8859-15'];
foreach ($possible_encodings as $encoding) {
$converted = mb_convert_encoding($subject, 'UTF-8', $encoding);
if (mb_check_encoding($converted, 'UTF-8') && !preg_match('/Ã.|â.|Â./', $converted)) {
$subject = $converted;
break;
}
}
}
$subject = \App\Utils\Encode::convert($subject);
$message->subject($subject);

View File

@ -159,6 +159,7 @@ class NinjaPlanController extends Controller
$account->hosted_company_count = 10;
$account->trial_started = now();
$account->trial_plan = 'pro';
$account->created_at = now();
$account->save();
}

225
app/Utils/Encode.php Normal file
View File

@ -0,0 +1,225 @@
<?php
/**
* Invoice Ninja (https://invoiceninja.com).
*
* @link https://github.com/invoiceninja/invoiceninja source repository
*
* @copyright Copyright (c) 2025. Invoice Ninja LLC (https://invoiceninja.com)
*
* @license https://www.elastic.co/licensing/elastic-license
*/
namespace App\Utils;
class Encode
{
    /** UTF-8 byte sequence of U+FFFD REPLACEMENT CHARACTER. */
    private const REPLACEMENT_CHAR = "\xEF\xBF\xBD";

    /**
     * U+FFFD after its UTF-8 bytes were misread as Windows-1252 and re-encoded
     * to UTF-8 (renders as i-diaeresis, inverted question mark, one-half).
     * Written with escapes so the literal cannot be silently normalised by an
     * editor or transport.
     */
    private const MOJIBAKE_REPLACEMENT_CHAR = "\u{00EF}\u{00BF}\u{00BD}";

    /** UTF-8 byte order mark. */
    private const UTF8_BOM = "\xEF\xBB\xBF";

    /**
     * UTF-16/UTF-32 byte order marks mapped to their mbstring encoding name.
     * The 4-byte marks come first because the UTF-32LE mark starts with the
     * UTF-16LE mark.
     */
    private const WIDE_BOMS = [
        "\x00\x00\xFE\xFF" => 'UTF-32BE',
        "\xFF\xFE\x00\x00" => 'UTF-32LE',
        "\xFE\xFF"         => 'UTF-16BE',
        "\xFF\xFE"         => 'UTF-16LE',
    ];

    /** Single-byte encodings tried, in order, after Windows-1252 has been rejected. */
    private const LEGACY_FALLBACK_ENCODINGS = ['ISO-8859-1', 'ISO-8859-15'];

    /** Number of leading bytes inspected by the BOM-less UTF-16/32 heuristics. */
    private const HEURISTIC_WINDOW = 100;

    /**
     * Convert string content to UTF-8.
     *
     * Steps, in order:
     *  1. UTF-16/UTF-32 input (by BOM, or by a null-byte heuristic) is decoded.
     *  2. A leading BOM is stripped.
     *  3. Content that is already valid UTF-8 and free of U+FFFD is returned
     *     unchanged, so emojis and accented characters are never re-encoded.
     *  4. Otherwise the content is decoded as Windows-1252.
     *  5. Otherwise U+FFFD is replaced with a right single quotation mark.
     *  6. Otherwise ISO-8859-1, then ISO-8859-15, are tried.
     *  7. If nothing yields an acceptable result the BOM-stripped input is
     *     returned as-is (it may then still not be valid UTF-8).
     *
     * @param string $contents Raw bytes in an unknown encoding.
     *
     * @return string The UTF-8 text, or the BOM-stripped input when no strategy succeeded.
     */
    public static function convert(string $contents): string
    {
        $decoded = self::detectAndHandleUTFEncoding($contents);
        if ($decoded !== null) {
            return $decoded;
        }

        $contents = self::removeBOM($contents);

        // Fast path: clean UTF-8 must never be pushed through a legacy decoder.
        if (mb_check_encoding($contents, 'UTF-8') && !str_contains($contents, self::REPLACEMENT_CHAR)) {
            return $contents;
        }

        $converted = mb_convert_encoding($contents, 'UTF-8', 'WINDOWS-1252');
        if (self::isValidConversion($converted)) {
            return $converted;
        }

        // Reached when the input already carries U+FFFD: decoding it as a
        // single-byte encoding would only turn the marker into mojibake.
        $repaired = self::fixCorruptedWindows1252($contents);
        if (self::isValidConversion($repaired)) {
            return $repaired;
        }

        foreach (self::LEGACY_FALLBACK_ENCODINGS as $encoding) {
            $converted = mb_convert_encoding($contents, 'UTF-8', $encoding);
            if (self::isValidConversion($converted)) {
                return $converted;
            }
        }

        return $contents;
    }

    /**
     * Decode UTF-16/UTF-32 input to UTF-8.
     *
     * A BOM decides the encoding when present. Without one, the first
     * HEURISTIC_WINDOW bytes are scanned for the null-byte pattern that
     * Latin-range text produces in those encodings.
     *
     * @param string $data Raw bytes.
     *
     * @return string|null The decoded text, or null when the input does not look like UTF-16/32.
     */
    private static function detectAndHandleUTFEncoding(string $data): ?string
    {
        foreach (self::WIDE_BOMS as $bom => $encoding) {
            if (str_starts_with($data, $bom)) {
                return mb_convert_encoding(substr($data, strlen($bom)), 'UTF-8', $encoding);
            }
        }

        $length = strlen($data);
        $window = min(self::HEURISTIC_WINDOW, $length);

        // UTF-32: a code point below U+0100 has three null bytes per 4-byte
        // unit, leading for big-endian and trailing for little-endian.
        if ($length >= 8 && $length % 4 === 0) {
            $bigEndianUnits = 0;
            $littleEndianUnits = 0;
            for ($i = 0; $i < $window; $i += 4) {
                if ($data[$i + 1] !== "\x00" || $data[$i + 2] !== "\x00") {
                    continue;
                }
                if ($data[$i] === "\x00") {
                    $bigEndianUnits++;
                }
                if ($data[$i + 3] === "\x00") {
                    $littleEndianUnits++;
                }
            }
            if ($littleEndianUnits > 5) {
                return mb_convert_encoding($data, 'UTF-8', 'UTF-32LE');
            }
            if ($bigEndianUnits > 5) {
                return mb_convert_encoding($data, 'UTF-8', 'UTF-32BE');
            }
        }

        // UTF-16: one null byte per 2-byte unit, high byte last for
        // little-endian and first for big-endian.
        if ($length >= 4 && $length % 2 === 0) {
            $littleEndianUnits = 0;
            $bigEndianUnits = 0;
            for ($i = 0; $i < $window; $i += 2) {
                if ($data[$i + 1] === "\x00") {
                    $littleEndianUnits++;
                }
                if ($data[$i] === "\x00") {
                    $bigEndianUnits++;
                }
            }
            if ($littleEndianUnits > 10) {
                return mb_convert_encoding($data, 'UTF-8', 'UTF-16LE');
            }
            if ($bigEndianUnits > 10) {
                return mb_convert_encoding($data, 'UTF-8', 'UTF-16BE');
            }
        }

        return null;
    }

    /**
     * Strip one leading byte order mark (UTF-8, UTF-32 or UTF-16), if present.
     *
     * @param string $data Raw bytes.
     *
     * @return string The input without its leading BOM.
     */
    private static function removeBOM(string $data): string
    {
        if (str_starts_with($data, self::UTF8_BOM)) {
            return substr($data, strlen(self::UTF8_BOM));
        }

        // 4-byte marks are tested before 2-byte marks (see WIDE_BOMS).
        foreach (array_keys(self::WIDE_BOMS) as $bom) {
            if (str_starts_with($data, $bom)) {
                return substr($data, strlen($bom));
            }
        }

        return $data;
    }

    /**
     * Replace every U+FFFD with a right single quotation mark (U+2019).
     *
     * The original character is unrecoverable once it has become U+FFFD; the
     * apostrophe (Windows-1252 byte 0x92) is used as the most common culprit.
     *
     * @param string $data UTF-8 text that may contain U+FFFD.
     *
     * @return string The text with U+FFFD substituted.
     */
    private static function fixCorruptedWindows1252(string $data): string
    {
        return str_replace(self::REPLACEMENT_CHAR, "\u{2019}", $data);
    }

    /**
     * Decide whether a conversion result is acceptable.
     *
     * @param string $data Candidate conversion result.
     *
     * @return bool True when the data is valid UTF-8 and contains neither
     *              U+FFFD nor the mojibake form of U+FFFD.
     */
    private static function isValidConversion(string $data): bool
    {
        return mb_check_encoding($data, 'UTF-8')
            && !str_contains($data, self::REPLACEMENT_CHAR)
            && !str_contains($data, self::MOJIBAKE_REPLACEMENT_CHAR);
    }
}

View File

@ -0,0 +1,202 @@
<?php
namespace Tests\Unit;
use Tests\TestCase;
use App\Utils\Encode;
/**
* Direct comparison showing why the Encode class is necessary
* for email subject lines with emojis and accented characters
*/
class EncodeClassComparisonTest extends TestCase
{
    /**
     * The rocket emoji after its UTF-8 bytes (F0 9F 9A 80) were misread as
     * Windows-1252 and re-encoded to UTF-8. Escaped so the literal cannot be
     * normalised back into the real emoji.
     */
    private const MOJIBAKE_ROCKET = "\u{00F0}\u{0178}\u{0161}\u{20AC}";

    /** "e acute" (C3 A9) after the same Windows-1252 round trip. */
    private const MOJIBAKE_E_ACUTE = "\u{00C3}\u{00A9}";

    private string $problematicSubject = "Rappel facture impayée (\$invoice) 🚀";

    /**
     * Encode::convert() leaves a clean UTF-8 subject untouched, whereas
     * forcing it through Windows-1252 produces mojibake.
     */
    public function testWithVsWithoutEncodeClass(): void
    {
        $original = $this->problematicSubject;

        $withEncodeClass = Encode::convert($original);
        // Common mistake: treating UTF-8 input as Windows-1252.
        $withoutEncodeClass = mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252');

        self::assertSame($original, $withEncodeClass, "Encode class should preserve original");
        self::assertNotSame($original, $withoutEncodeClass, "Direct conversion should corrupt content");

        // Emoji preservation
        self::assertStringContainsString('🚀', $withEncodeClass, "Encode class preserves emoji");
        self::assertStringNotContainsString('🚀', $withoutEncodeClass, "Direct conversion corrupts emoji");

        // Accented character preservation
        self::assertStringContainsString('impayée', $withEncodeClass, "Encode class preserves accents");
        self::assertStringNotContainsString('impayée', $withoutEncodeClass, "Direct conversion corrupts accents");

        // The corruption itself
        self::assertStringContainsString(self::MOJIBAKE_ROCKET, $withoutEncodeClass, "Should contain corrupted emoji");
        self::assertStringContainsString(self::MOJIBAKE_E_ACUTE, $withoutEncodeClass, "Should contain corrupted accent");

        // Both results are well-formed UTF-8; only one is correct.
        self::assertTrue(mb_check_encoding($withEncodeClass, 'UTF-8'), "Encode class result is valid UTF-8");
        self::assertTrue(mb_check_encoding($withoutEncodeClass, 'UTF-8'), "Corrupted result is still UTF-8 but wrong");
    }

    /**
     * Several naive "clean-up" approaches all alter the subject, while
     * Encode::convert() does not.
     */
    public function testMultipleFailureApproachesVsEncodeClass(): void
    {
        $original = $this->problematicSubject;

        $correct = Encode::convert($original);

        $commonMistakes = [
            'force_windows1252' => mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252'),
            'force_ascii' => iconv('UTF-8', 'ASCII//IGNORE', $original),
            'manual_replace' => str_replace(['é'], ['e'], $original), // Simplistic approach
            'regex_strip' => preg_replace('/[^\x20-\x7E]/', '?', $original),
            // FILTER_SANITIZE_STRING is deprecated since PHP 8.1; FILTER_UNSAFE_RAW with
            // the same flag still strips every byte above 0x7F.
            'sanitize_filter' => filter_var($original, FILTER_UNSAFE_RAW, FILTER_FLAG_STRIP_HIGH) ?: 'FILTER_FAILED',
        ];

        self::assertSame($original, $correct);

        foreach ($commonMistakes as $method => $result) {
            self::assertNotSame($original, $result, "Method '{$method}' should fail to preserve original");

            // iconv()/preg_replace() may return false/null; manual_replace only touches accents.
            if (is_string($result) && $result !== 'FILTER_FAILED' && $method !== 'manual_replace') {
                self::assertStringNotContainsString('🚀', $result, "Method '{$method}' should lose emoji");
            }
        }
    }

    /**
     * A header built from the converted subject keeps its characters; one
     * built from the Windows-1252-forced subject carries mojibake.
     */
    public function testGmailHeaderCompatibility(): void
    {
        $original = $this->problematicSubject;

        // RFC 2047 encoding of the header is left to the mail library.
        $properHeader = "Subject: " . Encode::convert($original);
        $badHeader = "Subject: " . mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252');

        self::assertStringContainsString('🚀', $properHeader);
        self::assertStringContainsString('impayée', $properHeader);

        self::assertStringNotContainsString('🚀', $badHeader);
        self::assertStringNotContainsString('impayée', $badHeader);
        self::assertStringContainsString(self::MOJIBAKE_ROCKET, $badHeader);
        self::assertStringContainsString(self::MOJIBAKE_E_ACUTE, $badHeader);
    }

    /**
     * Both approaches finish in under 10ms, but only Encode::convert()
     * preserves the content.
     */
    public function testPerformanceComparison(): void
    {
        $original = $this->problematicSubject;

        $start = microtime(true);
        $result = Encode::convert($original);
        $encodeClassTime = microtime(true) - $start;

        $start = microtime(true);
        $naiveResult = mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252');
        $naiveTime = microtime(true) - $start;

        self::assertLessThan(0.01, $encodeClassTime, "Encode class should be fast");
        self::assertLessThan(0.01, $naiveTime, "Naive approach should also be fast");

        self::assertSame($original, $result);
        self::assertNotSame($original, $naiveResult);
    }

    /**
     * Realistic subjects mixing accents, symbols and emojis survive
     * Encode::convert() and are altered by the naive conversion.
     */
    public function testRealWorldEmailScenario(): void
    {
        $realWorldSubjects = [
            $this->problematicSubject,
            "Café Newsletter 📧 March 2024",
            "Paiement reçu ✅ Facture #123",
            "Señor García - Cotización €1,500 💼",
            "Müller GmbH → Status Update 🎯",
        ];

        foreach ($realWorldSubjects as $subject) {
            $safe = Encode::convert($subject);
            $unsafe = mb_convert_encoding($subject, 'UTF-8', 'WINDOWS-1252');

            self::assertSame($subject, $safe, "Encode class failed for: {$subject}");
            self::assertNotSame($subject, $unsafe, "Direct conversion should fail for: {$subject}");
            self::assertTrue(mb_check_encoding($safe, 'UTF-8'));
        }
    }

    /**
     * Edge cases: Encode::convert() is always lossless; the naive conversion
     * is only expected to differ when the input contains non-ASCII characters.
     */
    public function testEdgeCaseComparison(): void
    {
        $edgeCases = [
            // Only emoji
            "🚀",
            // Only accents
            "impayée",
            // Mixed complex
            "🇫🇷 François & José 💼 €500",
            // Empty
            "",
            // ASCII only
            "Invoice 123",
        ];

        foreach ($edgeCases as $testCase) {
            $encoded = Encode::convert($testCase);
            $naive = mb_convert_encoding($testCase, 'UTF-8', 'WINDOWS-1252');

            self::assertSame($testCase, $encoded, "Encode class should handle: {$testCase}");

            // Pure ASCII is identical in Windows-1252 and UTF-8, so no claim is made about it.
            if (!mb_check_encoding($testCase, 'ASCII')) {
                self::assertNotSame($testCase, $naive, "Naive approach should fail: {$testCase}");
            }
        }
    }
}

View File

@ -0,0 +1,285 @@
<?php
namespace Tests\Unit;
use Tests\TestCase;
use App\Utils\Encode;
class EncodeEmailSubjectTest extends TestCase
{
    /** UTF-8 byte sequence of U+FFFD REPLACEMENT CHARACTER. */
    private const REPLACEMENT_CHAR = "\xEF\xBF\xBD";

    /**
     * U+FFFD after its UTF-8 bytes were misread as Windows-1252 and re-encoded
     * to UTF-8. Escaped so the literal cannot be normalised into plain U+FFFD.
     */
    private const MOJIBAKE_REPLACEMENT_CHAR = "\u{00EF}\u{00BF}\u{00BD}";

    /**
     * A valid UTF-8 subject with accents and an emoji is returned unchanged.
     */
    public function testUserSpecificExample(): void
    {
        $originalSubject = "Rappel facture impayée (\$invoice) 🚀";

        $convertedSubject = Encode::convert($originalSubject);

        self::assertSame($originalSubject, $convertedSubject);
        self::assertTrue(mb_check_encoding($convertedSubject, 'UTF-8'));
        self::assertStringContainsString('🚀', $convertedSubject);
        self::assertStringContainsString('impayée', $convertedSubject);
        // Character (not byte) count must match; emojis are multi-byte.
        self::assertSame(mb_strlen($originalSubject, 'UTF-8'), mb_strlen($convertedSubject, 'UTF-8'));
    }

    /**
     * Subjects containing emojis in various positions and forms.
     */
    public function testEmojiEmailSubjects(): void
    {
        $this->assertAllConvertTo('emoji', [
            // Single emoji
            "Invoice Ready 📧" => "Invoice Ready 📧",
            // Multiple emojis
            "Payment Received ✅ 🎉" => "Payment Received ✅ 🎉",
            // Emoji at start
            "🚨 Urgent: Payment Overdue" => "🚨 Urgent: Payment Overdue",
            // Emoji at end
            "Welcome to our service! 🎯" => "Welcome to our service! 🎯",
            // Complex emojis (family, skin tones, etc.)
            "Team meeting 👨‍💻👩‍💻" => "Team meeting 👨‍💻👩‍💻",
            // Mixed flags and symbols
            "Conference in Paris 🇫🇷 ✈️" => "Conference in Paris 🇫🇷 ✈️",
        ]);
    }

    /**
     * Accented characters common in email subjects.
     */
    public function testAccentedCharacters(): void
    {
        $this->assertAllConvertTo('accent', [
            // French
            "Café résumé naïve façade" => "Café résumé naïve façade",
            // Spanish
            "Niño piñata mañana" => "Niño piñata mañana",
            // German
            "Größe Weiß Mädchen" => "Größe Weiß Mädchen",
            // Portuguese
            "Coração São Paulo" => "Coração São Paulo",
            // Mixed languages
            "Café & Niño résumé" => "Café & Niño résumé",
        ]);
    }

    /**
     * Special symbols commonly used in email subjects.
     */
    public function testSpecialSymbols(): void
    {
        $this->assertAllConvertTo('symbol', [
            // Currency symbols
            "Invoice €50.00 £25.99 ¥1000" => "Invoice €50.00 £25.99 ¥1000",
            // Smart quotes and dashes
            "Company's \"quoted\" text—dash…ellipsis" => "Company's \"quoted\" text—dash…ellipsis",
            // Copyright and trademark
            "Product™ Service© Brand®" => "Product™ Service© Brand®",
            // Mathematical symbols
            "Discount ≥ 20% ± 5%" => "Discount ≥ 20% ± 5%",
            // Arrows and symbols
            "Process → Complete ✓" => "Process → Complete ✓",
        ]);
    }

    /**
     * Subjects with mixed content (the most realistic scenario); the
     * character count is checked in addition to equality.
     */
    public function testMixedContentEmailSubjects(): void
    {
        $testCases = [
            // User's exact example
            "Rappel facture impayée (\$invoice) 🚀" => "Rappel facture impayée (\$invoice) 🚀",
            // Invoice with currency and emoji
            "Facture #123 - €150.00 💰" => "Facture #123 - €150.00 💰",
            // Reminder with accents and emoji
            "Relance: paiement en retard 📅 ⚠️" => "Relance: paiement en retard 📅 ⚠️",
            // Welcome message
            "Bienvenue chez Café ☕ 🥐" => "Bienvenue chez Café ☕ 🥐",
            // Complex business scenario
            "Réunion équipe → 15h30 📊 🎯" => "Réunion équipe → 15h30 📊 🎯",
        ];

        foreach ($testCases as $input => $expected) {
            $result = Encode::convert($input);

            self::assertSame($expected, $result, "Failed for mixed content test: {$input}");
            self::assertTrue(mb_check_encoding($result, 'UTF-8'), "Not valid UTF-8: {$input}");
            self::assertSame(
                mb_strlen($expected, 'UTF-8'),
                mb_strlen($result, 'UTF-8'),
                "Character count mismatch for: {$input}"
            );
        }
    }

    /**
     * Windows-1252 bytes are decoded to the equivalent UTF-8 text.
     */
    public function testCorruptedEncodingConversion(): void
    {
        // Produces the Windows-1252 byte form of the text (invalid as UTF-8).
        $windows1252Input = mb_convert_encoding("Café résumé", 'WINDOWS-1252', 'UTF-8');

        $result = Encode::convert($windows1252Input);

        self::assertSame("Café résumé", $result);
        self::assertTrue(mb_check_encoding($result, 'UTF-8'));
    }

    /**
     * Valid UTF-8 subjects stay valid, unchanged and free of replacement
     * characters in both their plain and mojibake form.
     */
    public function testGmailCompatibility(): void
    {
        $testCases = [
            // Long subject with emojis (Gmail truncates at ~70 chars in preview)
            "This is a long email subject with emojis 🚀 that might get truncated by Gmail 📧",
            // Subject with only emojis
            "🚀📧🎉✅⚠️💰",
            // Subject with special characters Gmail handles
            "Re: Fw: [URGENT] Company's \"Project\" Status—Update ✓",
            // International content
            "国际业务 🌍 Négociation €500K 💼",
        ];

        foreach ($testCases as $input) {
            $result = Encode::convert($input);

            self::assertTrue(mb_check_encoding($result, 'UTF-8'), "Gmail compatibility failed for: {$input}");
            self::assertStringNotContainsString(self::REPLACEMENT_CHAR, $result, "Contains replacement characters: {$input}");
            self::assertStringNotContainsString(self::MOJIBAKE_REPLACEMENT_CHAR, $result, "Contains double-encoded replacement: {$input}");
            self::assertSame($input, $result, "Content changed unnecessarily: {$input}");
        }
    }

    /**
     * Edge cases that might break email clients.
     */
    public function testEmailClientEdgeCases(): void
    {
        $testCases = [
            // Empty string
            "" => "",
            // Only spaces
            " " => " ",
            // Only special characters
            "€£¥" => "€£¥",
            // Only emojis
            "🚀🎉📧" => "🚀🎉📧",
            // Mixed spaces and emojis
            " 🚀 📧 🎉 " => " 🚀 📧 🎉 ",
            // Newlines and tabs (should be preserved)
            "Line 1\nLine 2\tTabbed" => "Line 1\nLine 2\tTabbed",
        ];

        foreach ($testCases as $input => $expected) {
            $result = Encode::convert($input);

            self::assertSame($expected, $result, "Edge case failed: " . var_export($input, true));
            self::assertTrue(mb_check_encoding($result, 'UTF-8'), "Not valid UTF-8: " . var_export($input, true));
        }
    }

    /**
     * Conversion of typical subject lengths completes in under 10ms.
     */
    public function testPerformanceWithTypicalSubjects(): void
    {
        $baseSubject = "Rappel facture impayée (\$invoice) 🚀";

        $subjects = [
            $baseSubject, // ~40 chars
            str_repeat($baseSubject . " ", 2), // ~80 chars
            str_repeat($baseSubject . " ", 5), // ~200 chars
        ];

        foreach ($subjects as $subject) {
            $startTime = microtime(true);
            $result = Encode::convert($subject);
            $elapsedMs = (microtime(true) - $startTime) * 1000;

            self::assertLessThan(10, $elapsedMs, "Too slow for subject: " . strlen($subject) . " chars");
            self::assertTrue(mb_check_encoding($result, 'UTF-8'));
        }
    }

    /**
     * Converting an already-converted subject again changes nothing.
     */
    public function testIdempotency(): void
    {
        $original = "Rappel facture impayée (\$invoice) 🚀";

        $first = Encode::convert($original);
        $second = Encode::convert($first);
        $third = Encode::convert($second);

        self::assertSame($original, $first);
        self::assertSame($first, $second);
        self::assertSame($second, $third);
    }

    /**
     * Assert that every input converts to its expected value and that the
     * result is valid UTF-8.
     *
     * @param string                $label Word used in the failure message ("emoji", "accent", ...).
     * @param array<string, string> $cases Map of input subject to expected output.
     */
    private function assertAllConvertTo(string $label, array $cases): void
    {
        foreach ($cases as $input => $expected) {
            // Array keys that look numeric are stored as ints; normalise back to string.
            $input = (string) $input;
            $result = Encode::convert($input);

            self::assertSame($expected, $result, "Failed for {$label} test: {$input}");
            self::assertTrue(mb_check_encoding($result, 'UTF-8'), "Not valid UTF-8: {$input}");
        }
    }
}

View File

@ -0,0 +1,334 @@
<?php
namespace Tests\Unit;
use Tests\TestCase;
class EncodeWithoutClassFailureTest extends TestCase
{
    /**
     * The rocket emoji after its UTF-8 bytes (F0 9F 9A 80) were misread as
     * Windows-1252 and re-encoded to UTF-8. Escaped so the literal cannot be
     * normalised back into the real emoji.
     */
    private const MOJIBAKE_ROCKET = "\u{00F0}\u{0178}\u{0161}\u{20AC}";

    /** "e acute" (C3 A9) after the same round trip. */
    private const MOJIBAKE_E_ACUTE = "\u{00C3}\u{00A9}";

    /** Common prefix (E2 80 ..) of mojibake'd typographic punctuation. */
    private const MOJIBAKE_PUNCTUATION_PREFIX = "\u{00E2}\u{20AC}";

    private string $problematicSubject = "Rappel facture impayée (\$invoice) 🚀";

    /**
     * Forcing a UTF-8 subject through Windows-1252 corrupts the emoji.
     */
    public function testDirectConversionCorruptsEmojis(): void
    {
        $original = $this->problematicSubject;

        $corrupted = mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252');

        self::assertNotSame($original, $corrupted);
        self::assertStringNotContainsString('🚀', $corrupted);

        // Any one corruption artifact is enough.
        self::assertTrue(
            str_contains($corrupted, "\xEF\xBF\xBD") || // Replacement character
            str_contains($corrupted, '?') || // Question mark replacement
            str_contains($corrupted, self::MOJIBAKE_E_ACUTE) || // Double-encoded accent
            str_contains($corrupted, self::MOJIBAKE_ROCKET) || // Corrupted emoji
            strlen($corrupted) < strlen($original), // Characters lost
            "Expected emoji corruption but content seems intact. Original: {$original}, Corrupted: {$corrupted}"
        );
    }

    /**
     * Treating UTF-8 input as ISO-8859-1 with iconv() loses the emoji.
     */
    public function testIconvFailsWithEmojis(): void
    {
        $original = $this->problematicSubject;

        $result = iconv('ISO-8859-1', 'UTF-8//IGNORE', $original);

        self::assertNotSame($original, $result);
        self::assertStringNotContainsString('🚀', $result);
    }

    /**
     * Forcing to ASCII drops both the emoji and the accents.
     */
    public function testAsciiConversionDestroysInternationalChars(): void
    {
        $original = $this->problematicSubject;

        $asciiAttempt = iconv('UTF-8', 'ASCII//IGNORE', $original);

        self::assertNotSame($original, $asciiAttempt);
        self::assertStringNotContainsString('🚀', $asciiAttempt);
        self::assertStringNotContainsString('impayée', $asciiAttempt);
        // The accented letter is removed outright, leaving "impaye".
        self::assertStringContainsString('impaye', $asciiAttempt);
    }

    /**
     * Hand-written accent replacement alters the text and ignores the rest
     * of Unicode.
     */
    public function testManualReplacementInadequate(): void
    {
        $original = $this->problematicSubject;

        $manualAttempt = str_replace(['é', 'à', 'ç', 'ù'], ['e', 'a', 'c', 'u'], $original);

        self::assertNotSame($original, $manualAttempt);
        self::assertStringNotContainsString('impayée', $manualAttempt);
        self::assertStringContainsString('impayee', $manualAttempt);
        // The emoji is untouched: the approach does nothing about encoding.
        self::assertStringContainsString('🚀', $manualAttempt);
    }

    /**
     * A round trip through ISO-8859-1 (e.g. a Latin1 column) loses the emoji.
     */
    public function testDatabaseStorageCorruption(): void
    {
        $original = $this->problematicSubject;

        $latin1Encoded = mb_convert_encoding($original, 'ISO-8859-1', 'UTF-8');
        $retrievedBack = mb_convert_encoding($latin1Encoded, 'UTF-8', 'ISO-8859-1');

        self::assertNotSame($original, $retrievedBack);
        self::assertStringNotContainsString('🚀', $retrievedBack);
    }

    /**
     * Content written as Windows-1252 and read back as-is is not valid UTF-8.
     */
    public function testFileHandlingCorruption(): void
    {
        $original = $this->problematicSubject;

        $tempFile = tempnam(sys_get_temp_dir(), 'encoding_fail_test_');
        self::assertIsString($tempFile, "Could not create temporary file");

        try {
            file_put_contents($tempFile, mb_convert_encoding($original, 'WINDOWS-1252', 'UTF-8'));

            // Read back assuming UTF-8 (common mistake).
            $corruptedRead = file_get_contents($tempFile);

            self::assertNotSame($original, $corruptedRead);
            self::assertFalse(mb_check_encoding($corruptedRead, 'UTF-8'));
        } finally {
            // Remove the file even when an assertion above fails.
            unlink($tempFile);
        }
    }

    /**
     * Stripping high bytes as "sanitization" removes emoji and accents.
     */
    public function testCommonSanitizationBreaksContent(): void
    {
        $original = $this->problematicSubject;

        // FILTER_SANITIZE_STRING is deprecated since PHP 8.1; FILTER_UNSAFE_RAW with
        // the same flag still strips every byte above 0x7F.
        $sanitized = filter_var($original, FILTER_UNSAFE_RAW, FILTER_FLAG_STRIP_HIGH);

        if ($sanitized !== false) {
            self::assertNotSame($original, $sanitized);
            self::assertStringNotContainsString('🚀', $sanitized);
            self::assertStringNotContainsString('impayée', $sanitized);
        } else {
            // Filter might fail entirely
            self::assertFalse($sanitized);
        }
    }

    /**
     * Replacing every non-printable-ASCII byte with "?" destroys Unicode.
     */
    public function testRegexReplacementBreaksUnicode(): void
    {
        $original = $this->problematicSubject;

        $regexCleaned = preg_replace('/[^\x20-\x7E]/', '?', $original);

        self::assertNotSame($original, $regexCleaned);
        self::assertStringNotContainsString('🚀', $regexCleaned);
        self::assertStringNotContainsString('impayée', $regexCleaned);
        self::assertStringContainsString('?', $regexCleaned);
    }

    /**
     * An ISO-8859-1 round trip leaves corruption artifacts or loses the emoji.
     */
    public function testDoubleEncodingProblems(): void
    {
        $original = $this->problematicSubject;

        $firstEncoding = mb_convert_encoding($original, 'ISO-8859-1', 'UTF-8');
        $doubleEncoded = mb_convert_encoding($firstEncoding, 'UTF-8', 'ISO-8859-1');

        self::assertNotSame($original, $doubleEncoded);

        self::assertTrue(
            str_contains($doubleEncoded, self::MOJIBAKE_E_ACUTE) || // Double-encoded accent
            str_contains($doubleEncoded, self::MOJIBAKE_PUNCTUATION_PREFIX) || // Other artifacts
            !str_contains($doubleEncoded, '🚀'), // Emoji lost
            "Expected double-encoding artifacts but got: " . $doubleEncoded
        );
    }

    /**
     * Write a quoted CSV field to disk and read it back.
     */
    public function testCsvCorruption(): void
    {
        $original = $this->problematicSubject;

        $csvLine = '"' . $original . '"';

        $tempFile = tempnam(sys_get_temp_dir(), 'csv_fail_test_');
        self::assertIsString($tempFile, "Could not create temporary file");

        try {
            file_put_contents($tempFile, $csvLine, LOCK_EX);
            $contents = file_get_contents($tempFile);

            // Simplified CSV parsing: drop the quotes.
            $parsed = str_replace('"', '', $contents);

            if (!mb_check_encoding($parsed, 'UTF-8')) {
                // The file system or CSV handling damaged the encoding.
                self::assertNotSame($original, $parsed);
            } else {
                self::assertTrue(true, "CSV processing completed");
            }
        } finally {
            // Remove the file even when an assertion above fails.
            unlink($tempFile);
        }
    }

    /**
     * JSON encoding and decoding of the subject preserves it.
     */
    public function testJsonEncodingIssues(): void
    {
        $original = $this->problematicSubject;
        $data = ['subject' => $original];

        $json = json_encode($data);
        self::assertNotFalse($json, "JSON encoding should work with UTF-8");

        $decoded = json_decode($json, true);
        self::assertNotNull($decoded, "JSON decoding should work");

        // A second encode with flags someone might add while trying to "fix" encoding.
        $brokenAttempt = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_IGNORE);
        $brokenDecoded = json_decode($brokenAttempt, true);

        if ($brokenDecoded !== null && isset($brokenDecoded['subject'])) {
            self::assertSame($original, $decoded['subject'], "Proper JSON handling preserves Unicode");
        }
    }

    /**
     * Replacing non-ASCII bytes in a raw header with "?" destroys the subject.
     */
    public function testEmailHeaderEncodingIssues(): void
    {
        $original = $this->problematicSubject;

        // Non-ASCII header text needs RFC 2047 encoding; this header has none.
        $naiveHeader = "Subject: " . $original;

        // Simulates a server replacing every non-ASCII byte.
        $serverProcessed = preg_replace('/[^\x20-\x7E]/', '?', $naiveHeader);

        self::assertNotSame($naiveHeader, $serverProcessed);
        self::assertStringNotContainsString('🚀', $serverProcessed);
        self::assertStringNotContainsString('impayée', $serverProcessed);
        self::assertStringContainsString('?', $serverProcessed);
    }

    /**
     * Summary: of several naive approaches, those that alter the subject all
     * lose the emoji.
     */
    public function testMultipleFailureModes(): void
    {
        $original = $this->problematicSubject;

        $attempts = [
            'windows1252' => mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252'),
            'ascii' => iconv('UTF-8', 'ASCII//IGNORE', $original),
            'latin1_roundtrip' => mb_convert_encoding(mb_convert_encoding($original, 'ISO-8859-1', 'UTF-8'), 'UTF-8', 'ISO-8859-1'),
            'regex_strip' => preg_replace('/[^\x20-\x7E]/', '', $original),
            // FILTER_SANITIZE_STRING is deprecated since PHP 8.1.
            'filter_sanitize' => filter_var($original, FILTER_UNSAFE_RAW, FILTER_FLAG_STRIP_HIGH),
        ];

        // Keep only attempts that produced a string different from the original;
        // iconv()/filter_var() may return false and preg_replace() null.
        $failures = array_filter(
            $attempts,
            static fn ($result): bool => is_string($result) && $result !== $original
        );

        // At least one approach must have altered the subject.
        self::assertGreaterThan(0, count($failures), "At least some methods should fail");

        foreach ($failures as $method => $result) {
            self::assertStringNotContainsString('🚀', $result, "Method {$method} should lose emoji");
        }
    }
}