334 lines
12 KiB
PHP
334 lines
12 KiB
PHP
<?php
|
|
|
|
namespace Tests\Unit;
|
|
|
|
use Tests\TestCase;
|
|
|
|
class EncodeWithoutClassFailureTest extends TestCase
|
|
{
|
|
private string $problematicSubject = "Rappel facture impayée (\$invoice) 🚀";
|
|
|
|
/**
|
|
* Test that direct mb_convert_encoding through Windows-1252 corrupts emojis
|
|
*/
|
|
public function testDirectConversionCorruptsEmojis()
|
|
{
|
|
$original = $this->problematicSubject;
|
|
|
|
// This is what would happen without the Encode class - forcing conversion through Windows-1252
|
|
$corrupted = mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252');
|
|
|
|
// Should NOT be equal to original (emoji gets corrupted)
|
|
$this->assertNotEquals($original, $corrupted);
|
|
|
|
// Emoji should be lost/corrupted
|
|
$this->assertStringNotContainsString('🚀', $corrupted);
|
|
|
|
// Should contain corruption artifacts
|
|
$this->assertTrue(
|
|
str_contains($corrupted, "\xEF\xBF\xBD") || // Replacement character
|
|
str_contains($corrupted, '?') || // Question mark replacement
|
|
str_contains($corrupted, 'é') || // Double-encoded é
|
|
str_contains($corrupted, '🚀') || // Corrupted emoji
|
|
strlen($corrupted) < strlen($original), // Characters lost
|
|
"Expected emoji corruption but content seems intact. Original: {$original}, Corrupted: {$corrupted}"
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Test that naive iconv usage fails with emojis
|
|
*/
|
|
public function testIconvFailsWithEmojis()
|
|
{
|
|
$original = $this->problematicSubject;
|
|
|
|
// Common mistake: trying to convert UTF-8 through ISO-8859-1
|
|
$result = iconv('ISO-8859-1', 'UTF-8//IGNORE', $original);
|
|
|
|
// Should fail or corrupt the content
|
|
$this->assertNotEquals($original, $result);
|
|
|
|
// Should lose the emoji
|
|
$this->assertStringNotContainsString('🚀', $result);
|
|
}
|
|
|
|
/**
|
|
* Test that forcing through ASCII destroys international characters
|
|
*/
|
|
public function testAsciiConversionDestroysInternationalChars()
|
|
{
|
|
$original = $this->problematicSubject;
|
|
|
|
// Naive approach: force to ASCII
|
|
$asciiAttempt = iconv('UTF-8', 'ASCII//IGNORE', $original);
|
|
|
|
// Should lose both emoji and accented characters
|
|
$this->assertNotEquals($original, $asciiAttempt);
|
|
$this->assertStringNotContainsString('🚀', $asciiAttempt);
|
|
$this->assertStringNotContainsString('impayée', $asciiAttempt);
|
|
|
|
// Should contain "impaye" instead (accent completely removed)
|
|
$this->assertStringContainsString('impaye', $asciiAttempt);
|
|
}
|
|
|
|
/**
|
|
* Test that manual character replacement approach is inadequate
|
|
*/
|
|
public function testManualReplacementInadequate()
|
|
{
|
|
$original = $this->problematicSubject;
|
|
|
|
// Naive manual approach that many developers try
|
|
$manualAttempt = str_replace([
|
|
'é',
|
|
'à',
|
|
'ç',
|
|
'ù'
|
|
], [
|
|
'e',
|
|
'a',
|
|
'c',
|
|
'u'
|
|
], $original);
|
|
|
|
// Still has the emoji problem - can't handle all Unicode
|
|
$this->assertNotEquals($original, $manualAttempt);
|
|
|
|
// Manual replacement changes the é in "impayée" to "e"
|
|
$this->assertStringNotContainsString('impayée', $manualAttempt);
|
|
$this->assertStringContainsString('impayee', $manualAttempt);
|
|
|
|
// Emoji remains but manual approach doesn't solve encoding issues
|
|
$this->assertStringContainsString('🚀', $manualAttempt);
|
|
}
|
|
|
|
/**
|
|
* Test simulated database storage/retrieval corruption
|
|
*/
|
|
public function testDatabaseStorageCorruption()
|
|
{
|
|
$original = $this->problematicSubject;
|
|
|
|
// Simulate what happens when storing in Latin1 database column
|
|
$latin1Encoded = mb_convert_encoding($original, 'ISO-8859-1', 'UTF-8');
|
|
$retrievedBack = mb_convert_encoding($latin1Encoded, 'UTF-8', 'ISO-8859-1');
|
|
|
|
// Should be corrupted
|
|
$this->assertNotEquals($original, $retrievedBack);
|
|
|
|
// Emoji definitely lost
|
|
$this->assertStringNotContainsString('🚀', $retrievedBack);
|
|
}
|
|
|
|
/**
|
|
* Test simulated file read/write corruption
|
|
*/
|
|
public function testFileHandlingCorruption()
|
|
{
|
|
$original = $this->problematicSubject;
|
|
|
|
// Create a temporary file and write with wrong encoding assumption
|
|
$tempFile = tempnam(sys_get_temp_dir(), 'encoding_fail_test_');
|
|
|
|
// Simulate writing as Windows-1252
|
|
$windows1252Content = mb_convert_encoding($original, 'WINDOWS-1252', 'UTF-8');
|
|
file_put_contents($tempFile, $windows1252Content);
|
|
|
|
// Now read it back assuming UTF-8 (common mistake)
|
|
$corruptedRead = file_get_contents($tempFile);
|
|
|
|
// Should be corrupted
|
|
$this->assertNotEquals($original, $corruptedRead);
|
|
|
|
// Should not be valid UTF-8
|
|
$this->assertFalse(mb_check_encoding($corruptedRead, 'UTF-8'));
|
|
|
|
// Clean up
|
|
unlink($tempFile);
|
|
}
|
|
|
|
/**
|
|
* Test what happens with common "sanitization" approaches
|
|
*/
|
|
public function testCommonSanitizationBreaksContent()
|
|
{
|
|
$original = $this->problematicSubject;
|
|
|
|
// Common "sanitization" that developers might try
|
|
$sanitized = filter_var($original, FILTER_SANITIZE_STRING, FILTER_FLAG_STRIP_HIGH);
|
|
|
|
if ($sanitized !== false) {
|
|
// Should remove high-bit characters (including emoji and accents)
|
|
$this->assertNotEquals($original, $sanitized);
|
|
$this->assertStringNotContainsString('🚀', $sanitized);
|
|
$this->assertStringNotContainsString('impayée', $sanitized);
|
|
} else {
|
|
// Filter might fail entirely
|
|
$this->assertFalse($sanitized);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Test naive regular expression replacement
|
|
*/
|
|
public function testRegexReplacementBreaksUnicode()
|
|
{
|
|
$original = $this->problematicSubject;
|
|
|
|
// Naive attempt to "clean" the string with regex
|
|
$regexCleaned = preg_replace('/[^\x20-\x7E]/', '?', $original);
|
|
|
|
// Should replace all non-ASCII characters with ?
|
|
$this->assertNotEquals($original, $regexCleaned);
|
|
$this->assertStringNotContainsString('🚀', $regexCleaned);
|
|
$this->assertStringNotContainsString('impayée', $regexCleaned);
|
|
|
|
// Should contain question marks
|
|
$this->assertStringContainsString('?', $regexCleaned);
|
|
}
|
|
|
|
/**
|
|
* Test double-encoding problems
|
|
*/
|
|
public function testDoubleEncodingProblems()
|
|
{
|
|
$original = $this->problematicSubject;
|
|
|
|
// Simulate double-encoding (common web application bug)
|
|
$firstEncoding = mb_convert_encoding($original, 'ISO-8859-1', 'UTF-8');
|
|
$doubleEncoded = mb_convert_encoding($firstEncoding, 'UTF-8', 'ISO-8859-1');
|
|
|
|
// Should be different and corrupted
|
|
$this->assertNotEquals($original, $doubleEncoded);
|
|
|
|
// Common double-encoding artifacts
|
|
$this->assertTrue(
|
|
str_contains($doubleEncoded, 'é') || // é becomes é
|
|
str_contains($doubleEncoded, 'â€') || // Other artifacts
|
|
!str_contains($doubleEncoded, '🚀'), // Emoji lost
|
|
"Expected double-encoding artifacts but got: " . $doubleEncoded
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Test CSV export/import corruption
|
|
*/
|
|
public function testCsvCorruption()
|
|
{
|
|
$original = $this->problematicSubject;
|
|
|
|
// Simulate CSV export without proper encoding
|
|
$csvLine = '"' . $original . '"';
|
|
|
|
// Write to temp file with wrong encoding
|
|
$tempFile = tempnam(sys_get_temp_dir(), 'csv_fail_test_');
|
|
file_put_contents($tempFile, $csvLine, LOCK_EX);
|
|
|
|
// Read back with wrong encoding assumption
|
|
$contents = file_get_contents($tempFile);
|
|
|
|
// Parse CSV (simplified)
|
|
$parsed = str_replace('"', '', $contents);
|
|
|
|
// If the file system or CSV handling messed up encoding
|
|
if (!mb_check_encoding($parsed, 'UTF-8')) {
|
|
$this->assertNotEquals($original, $parsed);
|
|
} else {
|
|
// Even if it's valid UTF-8, it might still be different due to CSV processing
|
|
$this->assertTrue(true, "CSV processing completed");
|
|
}
|
|
|
|
// Clean up
|
|
unlink($tempFile);
|
|
}
|
|
|
|
/**
|
|
* Test JSON encoding/decoding issues
|
|
*/
|
|
public function testJsonEncodingIssues()
|
|
{
|
|
$original = $this->problematicSubject;
|
|
|
|
// Create array with the subject
|
|
$data = ['subject' => $original];
|
|
|
|
// Encode to JSON
|
|
$json = json_encode($data);
|
|
$this->assertNotFalse($json, "JSON encoding should work with UTF-8");
|
|
|
|
// Decode back
|
|
$decoded = json_decode($json, true);
|
|
$this->assertNotNull($decoded, "JSON decoding should work");
|
|
|
|
// This should actually work correctly with modern PHP
|
|
// But let's test what happens if someone tries to "fix" it
|
|
$brokenAttempt = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_IGNORE);
|
|
$brokenDecoded = json_decode($brokenAttempt, true);
|
|
|
|
// The point is that without proper understanding, people might use wrong flags
|
|
// and lose data integrity
|
|
if ($brokenDecoded !== null && isset($brokenDecoded['subject'])) {
|
|
// In some PHP versions or configurations, this might alter the data
|
|
$this->assertTrue(
|
|
$decoded['subject'] === $original,
|
|
"Proper JSON handling preserves Unicode"
|
|
);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Test email header encoding issues
|
|
*/
|
|
public function testEmailHeaderEncodingIssues()
|
|
{
|
|
$original = $this->problematicSubject;
|
|
|
|
// Naive attempt to create email header without proper encoding
|
|
$naiveHeader = "Subject: " . $original;
|
|
|
|
// Email headers with non-ASCII characters need RFC 2047 encoding
|
|
// Without proper encoding, the subject would be corrupted by email servers
|
|
|
|
// Simulate what an email server might do with unencoded headers
|
|
$serverProcessed = preg_replace('/[^\x20-\x7E]/', '?', $naiveHeader);
|
|
|
|
$this->assertNotEquals($naiveHeader, $serverProcessed);
|
|
$this->assertStringNotContainsString('🚀', $serverProcessed);
|
|
$this->assertStringNotContainsString('impayée', $serverProcessed);
|
|
|
|
// Should contain replacement characters
|
|
$this->assertStringContainsString('?', $serverProcessed);
|
|
}
|
|
|
|
/**
|
|
* Summary test showing multiple failure modes
|
|
*/
|
|
public function testMultipleFailureModes()
|
|
{
|
|
$original = $this->problematicSubject;
|
|
$failures = [];
|
|
|
|
// Collect all the ways it can fail
|
|
$attempts = [
|
|
'windows1252' => mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252'),
|
|
'ascii' => iconv('UTF-8', 'ASCII//IGNORE', $original),
|
|
'latin1_roundtrip' => mb_convert_encoding(mb_convert_encoding($original, 'ISO-8859-1', 'UTF-8'), 'UTF-8', 'ISO-8859-1'),
|
|
'regex_strip' => preg_replace('/[^\x20-\x7E]/', '', $original),
|
|
'filter_sanitize' => filter_var($original, FILTER_SANITIZE_STRING, FILTER_FLAG_STRIP_HIGH),
|
|
];
|
|
|
|
foreach ($attempts as $method => $result) {
|
|
if ($result !== false && $result !== $original) {
|
|
$failures[$method] = $result;
|
|
}
|
|
}
|
|
|
|
// All methods should fail to preserve the original
|
|
$this->assertGreaterThan(0, count($failures), "At least some methods should fail");
|
|
|
|
// None of the failed attempts should contain the emoji
|
|
foreach ($failures as $method => $result) {
|
|
$this->assertStringNotContainsString('🚀', $result, "Method {$method} should lose emoji");
|
|
}
|
|
}
|
|
}
|