Additional tests
This commit is contained in:
parent
651ec15e22
commit
b13258be0e
|
|
@ -39,19 +39,7 @@ class GmailTransport extends AbstractTransport
|
|||
//ensure utf-8 encoding of subject
|
||||
$subject = $message->getSubject();
|
||||
|
||||
if (!mb_check_encoding($subject, 'UTF-8') || preg_match('/Ã.|â.|Â./', $subject)) {
|
||||
|
||||
$possible_encodings = ['Windows-1252', 'ISO-8859-1', 'ISO-8859-15'];
|
||||
|
||||
foreach ($possible_encodings as $encoding) {
|
||||
$converted = mb_convert_encoding($subject, 'UTF-8', $encoding);
|
||||
|
||||
if (mb_check_encoding($converted, 'UTF-8') && !preg_match('/Ã.|â.|Â./', $converted)) {
|
||||
$subject = $converted;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
$subject = \App\Utils\Encode::convert($subject);
|
||||
|
||||
$message->subject($subject);
|
||||
|
||||
|
|
|
|||
|
|
@ -159,6 +159,7 @@ class NinjaPlanController extends Controller
|
|||
$account->hosted_company_count = 10;
|
||||
$account->trial_started = now();
|
||||
$account->trial_plan = 'pro';
|
||||
$account->created_at = now();
|
||||
$account->save();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,225 @@
|
|||
<?php
|
||||
/**
|
||||
* Invoice Ninja (https://invoiceninja.com).
|
||||
*
|
||||
* @link https://github.com/invoiceninja/invoiceninja source repository
|
||||
*
|
||||
* @copyright Copyright (c) 2025. Invoice Ninja LLC (https://invoiceninja.com)
|
||||
*
|
||||
* @license https://www.elastic.co/licensing/elastic-license
|
||||
*/
|
||||
|
||||
namespace App\Utils;
|
||||
|
||||
class Encode
{
    /** UTF-8 byte sequence of U+FFFD REPLACEMENT CHARACTER. */
    private const REPLACEMENT_CHARACTER = "\xEF\xBF\xBD";

    /**
     * U+FFFD after its UTF-8 bytes were misread as Latin-1/Windows-1252 and
     * re-encoded as UTF-8 (renders as "ï¿½"). Spelled with code-point escapes
     * so the check cannot be altered by re-encoding this source file.
     */
    private const DOUBLE_ENCODED_REPLACEMENT = "\u{00EF}\u{00BF}\u{00BD}";

    /**
     * UTF-16/UTF-32 byte order marks mapped to the encoding they announce.
     * The four-byte marks come first because the UTF-32LE mark begins with
     * the two bytes of the UTF-16LE mark.
     */
    private const WIDE_BOMS = [
        "\x00\x00\xFE\xFF" => 'UTF-32BE',
        "\xFF\xFE\x00\x00" => 'UTF-32LE',
        "\xFE\xFF" => 'UTF-16BE',
        "\xFF\xFE" => 'UTF-16LE',
    ];

    /** Single-byte encodings tried, in order, as a last resort. */
    private const SINGLE_BYTE_FALLBACKS = ['WINDOWS-1252', 'ISO-8859-1', 'ISO-8859-15'];

    /** Number of leading bytes inspected by the BOM-less UTF-16/32 heuristics. */
    private const HEURISTIC_SAMPLE_BYTES = 100;

    /**
     * Convert string content to UTF-8.
     *
     * Steps, in order:
     *  1. UTF-16/UTF-32 input (announced by a BOM, or guessed from NUL-byte
     *     patterns) is decoded to UTF-8 and returned.
     *  2. A leading BOM is stripped; content that is already clean UTF-8
     *     (emojis, accents, ...) is returned untouched.
     *  3. The bytes are reinterpreted as Windows-1252.
     *  4. U+FFFD replacement characters are repaired.
     *  5. A short list of single-byte encodings is tried.
     *  6. If nothing yields clean UTF-8, the BOM-stripped input is returned.
     *
     * @param string $contents Raw bytes in an unknown encoding.
     *
     * @return string The content as UTF-8 (or the BOM-stripped input as a last resort).
     */
    public static function convert(string $contents): string
    {
        $decoded = self::detectAndHandleUTFEncoding($contents);
        if ($decoded !== null) {
            return $decoded;
        }

        $contents = self::removeBOM($contents);

        // Already clean UTF-8: no conversion needed.
        if (self::isValidConversion($contents)) {
            return $contents;
        }

        // Reinterpret as Windows-1252. removeBOM() runs once more here, so a
        // second stacked BOM is dropped before this attempt only.
        $converted = mb_convert_encoding(self::removeBOM($contents), 'UTF-8', 'WINDOWS-1252');
        if (self::isValidConversion($converted)) {
            return $converted;
        }

        // Valid UTF-8 that already carries U+FFFD ends up here: the
        // Windows-1252 pass turns it into the double-encoded form, which
        // isValidConversion() rejects.
        $repaired = self::fixCorruptedWindows1252($contents);
        if (self::isValidConversion($repaired)) {
            return $repaired;
        }

        foreach (self::SINGLE_BYTE_FALLBACKS as $encoding) {
            $converted = mb_convert_encoding($contents, 'UTF-8', $encoding);
            if (self::isValidConversion($converted)) {
                return $converted;
            }
        }

        return $contents;
    }

    /**
     * Detect UTF-16 / UTF-32 input and decode it to UTF-8.
     *
     * A BOM decides the encoding when present. Without one, the first
     * HEURISTIC_SAMPLE_BYTES bytes are scanned for the NUL-byte layout that
     * ASCII-range text produces in each encoding and byte order.
     *
     * @return string|null Decoded UTF-8, or null when the input does not look like UTF-16/32.
     */
    private static function detectAndHandleUTFEncoding(string $data): ?string
    {
        foreach (self::WIDE_BOMS as $bom => $encoding) {
            if (str_starts_with($data, $bom)) {
                return mb_convert_encoding(substr($data, strlen($bom)), 'UTF-8', $encoding);
            }
        }

        $length = strlen($data);
        $sample = min(self::HEURISTIC_SAMPLE_BYTES, $length);

        // UTF-32: ASCII-range code points are 00 00 00 xx (BE) or xx 00 00 00 (LE).
        if ($length >= 8 && $length % 4 === 0) {
            $bigEndian = 0;
            $littleEndian = 0;
            for ($i = 0; $i < $sample; $i += 4) {
                $unit = substr($data, $i, 4);
                if (str_starts_with($unit, "\x00\x00\x00")) {
                    $bigEndian++;
                }
                if (str_ends_with($unit, "\x00\x00\x00")) {
                    $littleEndian++;
                }
            }
            if (max($bigEndian, $littleEndian) > 5) {
                // On a tie (e.g. all-NUL units) stay with little-endian.
                $encoding = $bigEndian > $littleEndian ? 'UTF-32BE' : 'UTF-32LE';

                return mb_convert_encoding($data, 'UTF-8', $encoding);
            }
        }

        // UTF-16: ASCII-range code units are xx 00 (LE) or 00 xx (BE).
        if ($length >= 4 && $length % 2 === 0) {
            $littleEndian = 0;
            $bigEndian = 0;
            for ($i = 0; $i < $sample; $i += 2) {
                if ($data[$i + 1] === "\x00") {
                    $littleEndian++;
                }
                if ($data[$i] === "\x00") {
                    $bigEndian++;
                }
            }
            if ($littleEndian > 10) {
                return mb_convert_encoding($data, 'UTF-8', 'UTF-16LE');
            }
            if ($bigEndian > 10) {
                return mb_convert_encoding($data, 'UTF-8', 'UTF-16BE');
            }
        }

        return null;
    }

    /**
     * Remove one BOM (Byte Order Mark) from the beginning of a string.
     *
     * Handles the UTF-8 mark and the UTF-16/UTF-32 marks; four-byte marks are
     * tested before two-byte ones so a UTF-32LE mark is removed completely.
     */
    private static function removeBOM(string $data): string
    {
        $boms = ["\xEF\xBB\xBF", ...array_keys(self::WIDE_BOMS)];

        foreach ($boms as $bom) {
            if (str_starts_with($data, $bom)) {
                return substr($data, strlen($bom));
            }
        }

        return $data;
    }

    /**
     * Replace every U+FFFD replacement character with a right single
     * quotation mark (U+2019), the character the original author treated as
     * the most common casualty (Windows-1252 byte 0x92).
     */
    private static function fixCorruptedWindows1252(string $data): string
    {
        return str_replace(self::REPLACEMENT_CHARACTER, "\u{2019}", $data);
    }

    /**
     * Report whether $data is clean UTF-8: well-formed, free of U+FFFD, and
     * free of the double-encoded form of U+FFFD.
     */
    private static function isValidConversion(string $data): bool
    {
        return mb_check_encoding($data, 'UTF-8')
            && !str_contains($data, self::REPLACEMENT_CHARACTER)
            && !str_contains($data, self::DOUBLE_ENCODED_REPLACEMENT);
    }
}
|
||||
|
|
@ -0,0 +1,202 @@
|
|||
<?php
|
||||
|
||||
namespace Tests\Unit;
|
||||
|
||||
use Tests\TestCase;
|
||||
use App\Utils\Encode;
|
||||
|
||||
/**
|
||||
* Direct comparison showing why the Encode class is necessary
|
||||
* for email subject lines with emojis and accented characters
|
||||
*/
|
||||
class EncodeClassComparisonTest extends TestCase
{
    /**
     * The rocket emoji (UTF-8 bytes F0 9F 9A 80) after those bytes were
     * decoded as Windows-1252. Spelled with code-point escapes so the fixture
     * cannot be silently "repaired" by an editor or re-encoding step.
     */
    private const MOJIBAKE_ROCKET = "\u{00F0}\u{0178}\u{0161}\u{20AC}";

    /** "é" (UTF-8 bytes C3 A9) after those bytes were decoded as Windows-1252. */
    private const MOJIBAKE_E_ACUTE = "\u{00C3}\u{00A9}";

    private string $problematicSubject = "Rappel facture impayée (\$invoice) 🚀";

    /**
     * Encode::convert() leaves a valid UTF-8 subject alone, whereas forcing
     * it through Windows-1252 produces mojibake.
     */
    public function testWithVsWithoutEncodeClass(): void
    {
        $original = $this->problematicSubject;

        $withEncodeClass = Encode::convert($original);
        $withoutEncodeClass = mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252');

        $this->assertSame($original, $withEncodeClass, "Encode class should preserve original");
        $this->assertNotEquals($original, $withoutEncodeClass, "Direct conversion should corrupt content");

        // Emoji preservation
        $this->assertStringContainsString('🚀', $withEncodeClass, "Encode class preserves emoji");
        $this->assertStringNotContainsString('🚀', $withoutEncodeClass, "Direct conversion corrupts emoji");

        // Accented character preservation
        $this->assertStringContainsString('impayée', $withEncodeClass, "Encode class preserves accents");
        $this->assertStringNotContainsString('impayée', $withoutEncodeClass, "Direct conversion corrupts accents");

        // The corruption takes the well-known mojibake form
        $this->assertStringContainsString(self::MOJIBAKE_ROCKET, $withoutEncodeClass, "Should contain corrupted emoji");
        $this->assertStringContainsString(self::MOJIBAKE_E_ACUTE, $withoutEncodeClass, "Should contain corrupted accent");

        // Both results are well-formed UTF-8; only one of them is correct
        $this->assertTrue(mb_check_encoding($withEncodeClass, 'UTF-8'), "Encode class result is valid UTF-8");
        $this->assertTrue(mb_check_encoding($withoutEncodeClass, 'UTF-8'), "Corrupted result is still UTF-8 but wrong");
    }

    /**
     * Several naive "fixes" all fail to preserve the subject; Encode::convert() does not.
     */
    public function testMultipleFailureApproachesVsEncodeClass(): void
    {
        $original = $this->problematicSubject;

        $this->assertSame($original, Encode::convert($original));

        $commonMistakes = [
            'force_windows1252' => mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252'),
            'force_ascii' => iconv('UTF-8', 'ASCII//IGNORE', $original),
            'manual_replace' => str_replace(['é'], ['e'], $original),
            'regex_strip' => preg_replace('/[^\x20-\x7E]/', '?', $original),
            // FILTER_SANITIZE_STRING is deprecated since PHP 8.1; for this
            // subject (no tags or quotes) FILTER_UNSAFE_RAW with the same
            // flag performs the identical high-byte stripping.
            'sanitize_filter' => filter_var($original, FILTER_UNSAFE_RAW, FILTER_FLAG_STRIP_HIGH),
        ];

        foreach ($commonMistakes as $method => $result) {
            $this->assertNotSame($original, $result, "Method '{$method}' should fail to preserve original");

            // manual_replace only touches accents, so it keeps the emoji.
            // A non-string result means the function itself failed outright.
            if (is_string($result) && $method !== 'manual_replace') {
                $this->assertStringNotContainsString('🚀', $result, "Method '{$method}' should lose emoji");
            }
        }
    }

    /**
     * A Subject header built from the converted value keeps the characters;
     * one built from a forced Windows-1252 conversion carries mojibake.
     */
    public function testGmailHeaderCompatibility(): void
    {
        $original = $this->problematicSubject;

        // RFC 2047 encoding would be applied later by the mail library.
        $properHeader = "Subject: " . Encode::convert($original);
        $badHeader = "Subject: " . mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252');

        $this->assertStringContainsString('🚀', $properHeader);
        $this->assertStringContainsString('impayée', $properHeader);

        $this->assertStringNotContainsString('🚀', $badHeader);
        $this->assertStringNotContainsString('impayée', $badHeader);
        $this->assertStringContainsString(self::MOJIBAKE_ROCKET, $badHeader);
        $this->assertStringContainsString(self::MOJIBAKE_E_ACUTE, $badHeader);
    }

    /**
     * Both approaches are fast for a single subject, but only Encode::convert()
     * preserves the content.
     */
    public function testPerformanceComparison(): void
    {
        $original = $this->problematicSubject;

        // Warm-up call so class autoloading is not part of the measurement.
        Encode::convert('warm-up');

        $start = microtime(true);
        $result = Encode::convert($original);
        $encodeClassTime = microtime(true) - $start;

        $start = microtime(true);
        $naiveResult = mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252');
        $naiveTime = microtime(true) - $start;

        // Both should be fast (under 10ms)
        $this->assertLessThan(0.01, $encodeClassTime, "Encode class should be fast");
        $this->assertLessThan(0.01, $naiveTime, "Naive approach should also be fast");

        $this->assertSame($original, $result);
        $this->assertNotEquals($original, $naiveResult);
    }

    /**
     * Representative real-world subjects survive Encode::convert() unchanged
     * and are corrupted by a forced Windows-1252 conversion.
     */
    public function testRealWorldEmailScenario(): void
    {
        $realWorldSubjects = [
            $this->problematicSubject,
            "Café Newsletter 📧 March 2024",
            "Paiement reçu ✅ Facture #123",
            "Señor García - Cotización €1,500 💼",
            "Müller GmbH → Status Update 🎯",
        ];

        foreach ($realWorldSubjects as $subject) {
            $safe = Encode::convert($subject);
            $unsafe = mb_convert_encoding($subject, 'UTF-8', 'WINDOWS-1252');

            $this->assertSame($subject, $safe, "Encode class failed for: {$subject}");
            $this->assertNotEquals($subject, $unsafe, "Direct conversion should fail for: {$subject}");
            $this->assertTrue(mb_check_encoding($safe, 'UTF-8'));
        }
    }

    /**
     * Edge cases: emoji-only, accents-only, mixed, empty and ASCII-only input.
     */
    public function testEdgeCaseComparison(): void
    {
        $edgeCases = [
            "🚀",                              // Only emoji
            "impayée",                         // Only accents
            "🇫🇷 François & José 💼 €500",      // Mixed complex
            "",                                // Empty
            "Invoice 123",                     // ASCII only
        ];

        foreach ($edgeCases as $testCase) {
            $this->assertSame($testCase, Encode::convert($testCase), "Encode class should handle: {$testCase}");

            // ASCII-only content (including the empty string) is identical in
            // Windows-1252 and UTF-8, so the naive approach is only expected
            // to fail when non-ASCII characters are present.
            if (!mb_check_encoding($testCase, 'ASCII')) {
                $naive = mb_convert_encoding($testCase, 'UTF-8', 'WINDOWS-1252');
                $this->assertNotEquals($testCase, $naive, "Naive approach should fail: {$testCase}");
            }
        }
    }
}
|
||||
|
|
@ -0,0 +1,285 @@
|
|||
<?php
|
||||
|
||||
namespace Tests\Unit;
|
||||
|
||||
use Tests\TestCase;
|
||||
use App\Utils\Encode;
|
||||
|
||||
class EncodeEmailSubjectTest extends TestCase
{
    /** UTF-8 byte sequence of U+FFFD REPLACEMENT CHARACTER. */
    private const REPLACEMENT_CHARACTER = "\xEF\xBF\xBD";

    /**
     * U+FFFD after its UTF-8 bytes were misread as Latin-1/Windows-1252 and
     * re-encoded (renders as "ï¿½"). Spelled with escapes so the fixture
     * survives re-encoding of this file.
     */
    private const DOUBLE_ENCODED_REPLACEMENT = "\u{00EF}\u{00BF}\u{00BD}";

    /**
     * Assert that Encode::convert() returns each subject unchanged and as
     * well-formed UTF-8.
     *
     * @param list<string> $subjects
     * @param string       $failurePrefix Prefix of the failure message, e.g. "Failed for emoji test".
     */
    private function assertSubjectsPreserved(array $subjects, string $failurePrefix): void
    {
        foreach ($subjects as $subject) {
            $result = Encode::convert($subject);

            $this->assertSame($subject, $result, "{$failurePrefix}: {$subject}");
            $this->assertTrue(mb_check_encoding($result, 'UTF-8'), "Not valid UTF-8: {$subject}");
        }
    }

    /**
     * The subject that motivated the Encode class: accents plus an emoji.
     */
    public function testUserSpecificExample(): void
    {
        $originalSubject = "Rappel facture impayée (\$invoice) 🚀";
        $convertedSubject = Encode::convert($originalSubject);

        // Already valid UTF-8, so it must come back unchanged
        $this->assertSame($originalSubject, $convertedSubject);
        $this->assertTrue(mb_check_encoding($convertedSubject, 'UTF-8'));

        $this->assertStringContainsString('🚀', $convertedSubject);
        $this->assertStringContainsString('impayée', $convertedSubject);

        // Character (not byte) count is preserved; emojis are multi-byte
        $this->assertSame(mb_strlen($originalSubject, 'UTF-8'), mb_strlen($convertedSubject, 'UTF-8'));
    }

    /**
     * Subjects containing emojis in various positions and forms.
     */
    public function testEmojiEmailSubjects(): void
    {
        $this->assertSubjectsPreserved([
            "Invoice Ready 📧",                  // Single emoji
            "Payment Received ✅ 🎉",             // Multiple emojis
            "🚨 Urgent: Payment Overdue",        // Emoji at start
            "Welcome to our service! 🎯",        // Emoji at end
            // Complex emojis: zero-width-joiner sequences, written with an
            // explicit escape because the joiner is invisible in source
            "Team meeting 👨\u{200D}💻👩\u{200D}💻",
            "Conference in Paris 🇫🇷 ✈️",        // Flags and symbols
        ], "Failed for emoji test");
    }

    /**
     * Accented characters common in email subjects.
     */
    public function testAccentedCharacters(): void
    {
        $this->assertSubjectsPreserved([
            "Café résumé naïve façade",   // French
            "Niño piñata mañana",         // Spanish
            "Größe Weiß Mädchen",         // German
            "Coração São Paulo",          // Portuguese
            "Café & Niño résumé",         // Mixed languages
        ], "Failed for accent test");
    }

    /**
     * Special symbols commonly used in email subjects.
     */
    public function testSpecialSymbols(): void
    {
        $this->assertSubjectsPreserved([
            "Invoice €50.00 £25.99 ¥1000",      // Currency symbols
            // Smart quotes and dashes, escaped so they cannot be normalised
            // to ASCII quotes by an editor
            "Company\u{2019}s \u{201C}quoted\u{201D} text\u{2014}dash\u{2026}ellipsis",
            "Product™ Service© Brand®",         // Copyright and trademark
            "Discount ≥ 20% ± 5%",              // Mathematical symbols
            "Process → Complete ✓",             // Arrows and symbols
        ], "Failed for symbol test");
    }

    /**
     * Mixed content (the most realistic scenario).
     */
    public function testMixedContentEmailSubjects(): void
    {
        $subjects = [
            "Rappel facture impayée (\$invoice) 🚀",   // The motivating example
            "Facture #123 - €150.00 💰",               // Currency and emoji
            "Relance: paiement en retard 📅 ⚠️",       // Reminder
            "Bienvenue chez Café ☕ 🥐",               // Welcome message
            "Réunion équipe → 15h30 📊 🎯",            // Complex business scenario
        ];

        $this->assertSubjectsPreserved($subjects, "Failed for mixed content test");

        // Character count is preserved (important for emojis)
        foreach ($subjects as $subject) {
            $this->assertSame(
                mb_strlen($subject, 'UTF-8'),
                mb_strlen(Encode::convert($subject), 'UTF-8'),
                "Character count mismatch for: {$subject}"
            );
        }
    }

    /**
     * Windows-1252 bytes are detected and converted to UTF-8.
     */
    public function testCorruptedEncodingConversion(): void
    {
        $windows1252Input = mb_convert_encoding("Café résumé", 'WINDOWS-1252', 'UTF-8');
        $result = Encode::convert($windows1252Input);

        $this->assertSame("Café résumé", $result);
        $this->assertTrue(mb_check_encoding($result, 'UTF-8'));
    }

    /**
     * Gmail-specific requirements: valid UTF-8, no replacement characters,
     * content untouched.
     */
    public function testGmailCompatibility(): void
    {
        $testCases = [
            // Long subject with emojis (Gmail truncates at ~70 chars in preview)
            "This is a long email subject with emojis 🚀 that might get truncated by Gmail 📧",
            // Subject with only emojis
            "🚀📧🎉✅⚠️💰",
            // Subject with special characters Gmail handles
            "Re: Fw: [URGENT] Company's \"Project\" Status—Update ✓",
            // International content
            "国际业务 🌍 Négociation €500K 💼",
        ];

        foreach ($testCases as $input) {
            $result = Encode::convert($input);

            $this->assertTrue(mb_check_encoding($result, 'UTF-8'), "Gmail compatibility failed for: {$input}");

            $this->assertStringNotContainsString(
                self::REPLACEMENT_CHARACTER,
                $result,
                "Contains replacement characters: {$input}"
            );
            $this->assertStringNotContainsString(
                self::DOUBLE_ENCODED_REPLACEMENT,
                $result,
                "Contains double-encoded replacement: {$input}"
            );

            $this->assertSame($input, $result, "Content changed unnecessarily: {$input}");
        }
    }

    /**
     * Edge cases that might break email clients.
     */
    public function testEmailClientEdgeCases(): void
    {
        $testCases = [
            "",                              // Empty string
            "   ",                           // Only spaces
            "€£¥",                           // Only special characters
            "🚀🎉📧",                         // Only emojis
            " 🚀 📧 🎉 ",                     // Mixed spaces and emojis
            "Line 1\nLine 2\tTabbed",        // Newlines and tabs (should be preserved)
        ];

        foreach ($testCases as $input) {
            $result = Encode::convert($input);

            $this->assertSame($input, $result, "Edge case failed: " . var_export($input, true));
            $this->assertTrue(mb_check_encoding($result, 'UTF-8'), "Not valid UTF-8: " . var_export($input, true));
        }
    }

    /**
     * Conversion of typical subject lengths completes quickly.
     */
    public function testPerformanceWithTypicalSubjects(): void
    {
        $baseSubject = "Rappel facture impayée (\$invoice) 🚀";

        $subjects = [
            $baseSubject,                       // ~40 chars
            str_repeat($baseSubject . " ", 2),  // ~80 chars
            str_repeat($baseSubject . " ", 5),  // ~200 chars
        ];

        // Warm-up call so class autoloading is not part of the first measurement.
        Encode::convert('warm-up');

        foreach ($subjects as $subject) {
            $startTime = microtime(true);
            $result = Encode::convert($subject);
            $executionTime = (microtime(true) - $startTime) * 1000; // milliseconds

            // Should complete quickly (under 10ms for email subjects)
            $this->assertLessThan(10, $executionTime, "Too slow for subject: " . strlen($subject) . " chars");
            $this->assertTrue(mb_check_encoding($result, 'UTF-8'));
        }
    }

    /**
     * Converting an already converted subject again changes nothing.
     */
    public function testIdempotency(): void
    {
        $original = "Rappel facture impayée (\$invoice) 🚀";

        $first = Encode::convert($original);
        $second = Encode::convert($first);
        $third = Encode::convert($second);

        $this->assertSame($original, $first);
        $this->assertSame($first, $second);
        $this->assertSame($second, $third);
    }
}
|
||||
|
|
@ -0,0 +1,334 @@
|
|||
<?php
|
||||
|
||||
namespace Tests\Unit;
|
||||
|
||||
use Tests\TestCase;
|
||||
|
||||
class EncodeWithoutClassFailureTest extends TestCase
|
||||
{
|
||||
private string $problematicSubject = "Rappel facture impayée (\$invoice) 🚀";
|
||||
|
||||
/**
|
||||
* Test that direct mb_convert_encoding through Windows-1252 corrupts emojis
|
||||
*/
|
||||
public function testDirectConversionCorruptsEmojis()
{
    // Expected mojibake, spelled with code-point escapes so the fixtures
    // cannot be silently "repaired" by an editor or re-encoding step:
    // rocket emoji bytes F0 9F 9A 80 and "é" bytes C3 A9, each decoded
    // as Windows-1252.
    $mojibakeRocket = "\u{00F0}\u{0178}\u{0161}\u{20AC}";
    $mojibakeEAcute = "\u{00C3}\u{00A9}";

    $original = $this->problematicSubject;

    // What happens without the Encode class: valid UTF-8 is treated as
    // Windows-1252 and converted a second time.
    $corrupted = mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252');

    $this->assertNotEquals($original, $corrupted);
    $this->assertStringNotContainsString('🚀', $corrupted);

    // The corruption must take the well-known mojibake form.
    $this->assertStringContainsString(
        $mojibakeRocket,
        $corrupted,
        "Expected emoji corruption but content seems intact. Original: {$original}, Corrupted: {$corrupted}"
    );
    $this->assertStringContainsString(
        $mojibakeEAcute,
        $corrupted,
        "Expected accent corruption but content seems intact. Original: {$original}, Corrupted: {$corrupted}"
    );
}
|
||||
|
||||
/**
|
||||
* Test that naive iconv usage fails with emojis
|
||||
*/
|
||||
public function testIconvFailsWithEmojis()
{
    $subject = $this->problematicSubject;

    // Classic slip: the input is declared as ISO-8859-1 although it is
    // already UTF-8, so every byte is re-encoded individually.
    $reencoded = iconv('ISO-8859-1', 'UTF-8//IGNORE', $subject);

    // The outcome differs from the input ...
    $this->assertNotEquals($subject, $reencoded);

    // ... and the emoji no longer appears in it.
    $this->assertStringNotContainsString('🚀', $reencoded);
}
|
||||
|
||||
/**
|
||||
* Test that forcing through ASCII destroys international characters
|
||||
*/
|
||||
public function testAsciiConversionDestroysInternationalChars()
{
    $subject = $this->problematicSubject;

    // Blunt approach: squeeze everything into ASCII and drop what does not fit.
    $asciiOnly = iconv('UTF-8', 'ASCII//IGNORE', $subject);

    // Neither the emoji nor the accented word survives.
    $this->assertNotEquals($subject, $asciiOnly);
    $this->assertStringNotContainsString('🚀', $asciiOnly);
    $this->assertStringNotContainsString('impayée', $asciiOnly);

    // The accented letter is dropped, leaving the prefix "impaye" behind.
    $this->assertStringContainsString('impaye', $asciiOnly);
}
|
||||
|
||||
/**
|
||||
* Test that manual character replacement approach is inadequate
|
||||
*/
|
||||
public function testManualReplacementInadequate()
{
    $subject = $this->problematicSubject;

    // Hand-maintained transliteration table of the kind developers often reach for.
    $transliterated = strtr($subject, [
        'é' => 'e',
        'à' => 'a',
        'ç' => 'c',
        'ù' => 'u',
    ]);

    // The text has been altered, so the original is not preserved.
    $this->assertNotEquals($subject, $transliterated);

    // "impayée" has been flattened to "impayee".
    $this->assertStringNotContainsString('impayée', $transliterated);
    $this->assertStringContainsString('impayee', $transliterated);

    // The table has no entry for the emoji, so it passes through untouched.
    $this->assertStringContainsString('🚀', $transliterated);
}
|
||||
|
||||
/**
|
||||
* Test simulated database storage/retrieval corruption
|
||||
*/
|
||||
public function testDatabaseStorageCorruption()
{
    $subject = $this->problematicSubject;

    // Round trip through ISO-8859-1, standing in for a Latin1 column:
    // write as ISO-8859-1, then read back as UTF-8.
    $stored = mb_convert_encoding($subject, 'ISO-8859-1', 'UTF-8');
    $loaded = mb_convert_encoding($stored, 'UTF-8', 'ISO-8859-1');

    // The value that comes back is not the value that went in.
    $this->assertNotEquals($subject, $loaded);

    // ISO-8859-1 has no slot for the emoji, so it cannot survive.
    $this->assertStringNotContainsString('🚀', $loaded);
}
|
||||
|
||||
/**
|
||||
* Test simulated file read/write corruption
|
||||
*/
|
||||
public function testFileHandlingCorruption()
{
    $original = $this->problematicSubject;

    $tempFile = tempnam(sys_get_temp_dir(), 'encoding_fail_test_');
    $this->assertIsString($tempFile, 'Could not create a temporary file');

    try {
        // Write the subject as Windows-1252 ...
        $windows1252Content = mb_convert_encoding($original, 'WINDOWS-1252', 'UTF-8');
        file_put_contents($tempFile, $windows1252Content);

        // ... and read it back assuming UTF-8 (common mistake).
        $corruptedRead = file_get_contents($tempFile);

        // The bytes read back are not the original subject
        $this->assertNotEquals($original, $corruptedRead);

        // and are not well-formed UTF-8 either.
        $this->assertFalse(mb_check_encoding($corruptedRead, 'UTF-8'));
    } finally {
        // Remove the file even when an assertion above fails.
        unlink($tempFile);
    }
}
|
||||
|
||||
/**
|
||||
* Test what happens with common "sanitization" approaches
|
||||
*/
|
||||
public function testCommonSanitizationBreaksContent()
{
    $original = $this->problematicSubject;

    // Common "sanitization": strip every byte above 0x7F.
    // FILTER_SANITIZE_STRING is deprecated since PHP 8.1; FILTER_UNSAFE_RAW
    // with the same flag performs the identical high-byte stripping for this
    // subject, which contains no tags or quotes.
    $sanitized = filter_var($original, FILTER_UNSAFE_RAW, FILTER_FLAG_STRIP_HIGH);

    $this->assertIsString($sanitized);

    // High-bit characters (emoji and accents) are removed
    $this->assertNotEquals($original, $sanitized);
    $this->assertStringNotContainsString('🚀', $sanitized);
    $this->assertStringNotContainsString('impayée', $sanitized);
}
|
||||
|
||||
/**
|
||||
* Test naive regular expression replacement
|
||||
*/
|
||||
public function testRegexReplacementBreaksUnicode()
{
    $subject = $this->problematicSubject;

    // Byte-wise pattern (no "u" modifier): anything outside printable ASCII
    // is swapped for a question mark.
    $asciiOnly = preg_replace('/[^\x20-\x7E]/', '?', $subject);

    // The cleaned string must differ from the input...
    $this->assertNotEquals($subject, $asciiOnly);

    // ...and neither the emoji nor the accented word may survive.
    foreach (['🚀', 'impayée'] as $lostFragment) {
        $this->assertStringNotContainsString($lostFragment, $asciiOnly);
    }

    // The replacement character itself must be present.
    $this->assertStringContainsString('?', $asciiOnly);
}
|
||||
|
||||
/**
|
||||
* Test double-encoding problems
|
||||
*/
|
||||
public function testDoubleEncodingProblems()
{
    // Checks the effect of encoding an already UTF-8 string to UTF-8 a second time.
    $original = $this->problematicSubject;

    // Real double-encoding: the UTF-8 bytes are mistaken for ISO-8859-1 and
    // converted to UTF-8 again, so each byte of a multi-byte sequence becomes
    // its own character. Converting UTF-8 -> ISO-8859-1 -> UTF-8 instead is only
    // a lossy round trip and can never produce mojibake.
    $doubleEncoded = mb_convert_encoding($original, 'UTF-8', 'ISO-8859-1');

    // Should be different and corrupted
    $this->assertNotEquals($original, $doubleEncoded);

    // Common double-encoding artifacts
    $this->assertTrue(
        // U+00E9 (bytes C3 A9) turns into the two characters U+00C3 U+00A9
        str_contains($doubleEncoded, "\u{00C3}\u{00A9}")
        // The emoji's bytes no longer form the emoji
        || !str_contains($doubleEncoded, '🚀'),
        "Expected double-encoding artifacts but got: " . $doubleEncoded
    );

    // Signature of double-encoding: undoing one encoding layer restores the input
    $this->assertSame(
        $original,
        mb_convert_encoding($doubleEncoded, 'ISO-8859-1', 'UTF-8'),
        'Removing one encoding layer should restore the original subject'
    );
}
|
||||
|
||||
/**
|
||||
* Test CSV export/import corruption
|
||||
*/
|
||||
public function testCsvCorruption()
{
    // Writes the subject as a quoted CSV field to a temp file, reads it back,
    // strips the quotes and inspects the encoding of the result.
    $original = $this->problematicSubject;

    // Simulate CSV export without proper encoding
    $csvLine = '"' . $original . '"';

    $tempFile = tempnam(sys_get_temp_dir(), 'csv_fail_test_');
    $this->assertNotFalse($tempFile, 'Unable to create a temporary file');

    try {
        file_put_contents($tempFile, $csvLine, LOCK_EX);

        // Read back the raw bytes
        $contents = file_get_contents($tempFile);

        // Parse CSV (simplified): just drop the quote characters
        $parsed = str_replace('"', '', $contents);

        // NOTE(review): the bytes are written and read without any re-encoding,
        // so the first branch only runs if the subject itself is not valid UTF-8.
        if (!mb_check_encoding($parsed, 'UTF-8')) {
            $this->assertNotEquals($original, $parsed);
        } else {
            // Even if it's valid UTF-8, it might still be different due to CSV processing
            $this->assertTrue(true, "CSV processing completed");
        }
    } finally {
        // Clean up even when an assertion above fails, so no temp file is leaked
        if (is_file($tempFile)) {
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
|
||||
* Test JSON encoding/decoding issues
|
||||
*/
|
||||
public function testJsonEncodingIssues()
{
    // Checks that the subject survives a JSON encode/decode round trip, both
    // with default flags and with the lenient "ignore invalid UTF-8" flags.
    $original = $this->problematicSubject;

    // Create array with the subject
    $data = ['subject' => $original];

    // Encode to JSON
    $json = json_encode($data);
    $this->assertNotFalse($json, "JSON encoding should work with UTF-8");

    // Decode back
    $decoded = json_decode($json, true);
    $this->assertNotNull($decoded, "JSON decoding should work");

    // Assert the round trip unconditionally and strictly; it must not depend
    // on the outcome of the second, unrelated encode below.
    $this->assertSame(
        $original,
        $decoded['subject'] ?? null,
        "Proper JSON handling preserves Unicode"
    );

    // A "fix" people reach for: JSON_INVALID_UTF8_IGNORE drops invalid bytes.
    // The subject encoded successfully with default flags above, so it is valid
    // UTF-8 and nothing should be dropped here.
    $lenientJson = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_IGNORE);
    $this->assertNotFalse($lenientJson, 'Lenient JSON encoding should succeed');

    $lenientDecoded = json_decode($lenientJson, true);
    $this->assertIsArray($lenientDecoded, 'Lenient JSON should decode to an array');
    $this->assertSame(
        $original,
        $lenientDecoded['subject'] ?? null,
        'Lenient flags must not alter a valid UTF-8 subject'
    );
}
|
||||
|
||||
/**
|
||||
* Test email header encoding issues
|
||||
*/
|
||||
public function testEmailHeaderEncodingIssues()
{
    // Build the header naively, with no RFC 2047 encoding of the non-ASCII subject.
    $rawHeader = "Subject: " . $this->problematicSubject;

    // Stand-in for a mail server mangling an unencoded header:
    // every byte outside printable ASCII is replaced with "?".
    $asciiHeader = preg_replace('/[^\x20-\x7E]/', '?', $rawHeader);

    // The processed header must differ from the naive one...
    $this->assertNotEquals($rawHeader, $asciiHeader);

    // ...with both the emoji and the accented word gone.
    foreach (['🚀', 'impayée'] as $lostFragment) {
        $this->assertStringNotContainsString($lostFragment, $asciiHeader);
    }

    // Replacement characters must be present.
    $this->assertStringContainsString('?', $asciiHeader);
}
|
||||
|
||||
/**
|
||||
* Summary test showing multiple failure modes
|
||||
*/
|
||||
public function testMultipleFailureModes()
{
    // Runs several lossy conversions over the subject, collects those whose
    // output differs from the input, and checks that none still holds the emoji.
    $original = $this->problematicSubject;

    // Collect all the ways it can fail
    $attempts = [
        // UTF-8 bytes misread as Windows-1252
        'windows1252' => mb_convert_encoding($original, 'UTF-8', 'WINDOWS-1252'),
        // NOTE(review): iconv with //IGNORE may raise E_NOTICE and return false
        // on some iconv implementations; false results are skipped below.
        'ascii' => iconv('UTF-8', 'ASCII//IGNORE', $original),
        'latin1_roundtrip' => mb_convert_encoding(
            mb_convert_encoding($original, 'ISO-8859-1', 'UTF-8'),
            'UTF-8',
            'ISO-8859-1'
        ),
        'regex_strip' => preg_replace('/[^\x20-\x7E]/', '', $original),
        // FILTER_SANITIZE_STRING is deprecated since PHP 8.1; FILTER_UNSAFE_RAW
        // with FILTER_FLAG_STRIP_HIGH still strips every byte above 0x7F.
        'filter_sanitize' => filter_var($original, FILTER_UNSAFE_RAW, FILTER_FLAG_STRIP_HIGH),
    ];

    // Keep only string results that differ from the input. Non-string results
    // (false from iconv/filter_var, null from preg_replace) are not usable
    // output and would break the string assertion below.
    $failures = array_filter(
        $attempts,
        static fn ($result): bool => is_string($result) && $result !== $original
    );

    $this->assertGreaterThan(0, count($failures), "At least some methods should fail");

    // None of the failed attempts should contain the emoji
    foreach ($failures as $method => $result) {
        $this->assertStringNotContainsString('🚀', $result, "Method {$method} should lose emoji");
    }
}
|
||||
}
|
||||
Loading…
Reference in New Issue