511 lines
19 KiB
PHP
511 lines
19 KiB
PHP
<?php
|
||
|
||
namespace Tests\Unit;
|
||
|
||
use Tests\TestCase;
|
||
use App\Http\Controllers\ImportController;
|
||
use ReflectionClass;
|
||
use ReflectionMethod;
|
||
|
||
class ImportUnicodeEncodingTest extends TestCase
|
||
{
|
||
private ImportController $controller;
|
||
private ReflectionMethod $readFileMethod;
|
||
private ReflectionMethod $isValidConversionMethod;
|
||
private ReflectionMethod $removeBOMMethod;
|
||
|
||
/**
 * Build a fresh controller and expose the non-public encoding helpers that the tests call.
 */
protected function setUp(): void
{
    parent::setUp();

    $this->controller = new ImportController();

    // The helpers under test are not public, so each one is reached through reflection.
    $controllerClass = new ReflectionClass($this->controller);
    $expose = static function (string $methodName) use ($controllerClass): ReflectionMethod {
        $method = $controllerClass->getMethod($methodName);
        $method->setAccessible(true);

        return $method;
    };

    $this->readFileMethod = $expose('readFileWithProperEncoding');
    $this->isValidConversionMethod = $expose('isValidConversion');
    $this->removeBOMMethod = $expose('removeBOM');
}
|
||
|
||
/**
 * Sample strings keyed by a short label, one per script or Unicode feature.
 *
 * The table is assembled section by section; the key order is the iteration
 * order callers see.
 *
 * @return array<string, string>
 */
private function getUnicodeTestData(): array
{
    $samples = [];

    // Latin: plain ASCII and accented letters
    $samples['latin_basic'] = "Hello World! Company's data";
    $samples['latin_extended'] = "Café résumé naïve piñata façade";

    // Other alphabetic scripts (Arabic and Hebrew are right-to-left)
    $samples['greek'] = "Καλημέρα κόσμε! Ελληνικά γράμματα";
    $samples['cyrillic'] = "Привет мир! Русский текст";
    $samples['arabic'] = "مرحبا بالعالم! النص العربي";
    $samples['hebrew'] = "שלום עולם! טקסט עברי";

    // CJK: Chinese (simplified / traditional), Japanese (hiragana, katakana, kanji), Korean
    $samples['chinese_simplified'] = "你好世界!简体中文";
    $samples['chinese_traditional'] = "你好世界!繁體中文";
    $samples['japanese'] = "こんにちは世界!ひらがな・カタカナ・漢字";
    $samples['korean'] = "안녕하세요 세계! 한국어 텍스트";

    // Symbols: mathematics, currency, emoji
    $samples['mathematical'] = "∑∫∞±≤≥≠√∂∇∆";
    $samples['currency'] = "€£¥₹₽₨₩₪₦₡₸";
    $samples['emoji'] = "😀🌍🚀💻📊✨🎉🔥💡⭐";

    // Several scripts in a single string
    $samples['mixed_scripts'] = "Hello мир 世界 🌍 café résumé";

    // Special cases: zero-width code points (including U+FEFF mid-string) and
    // base letters followed by combining marks (é à î õ ü in decomposed form)
    $samples['zero_width'] = "Text\u{200B}with\u{FEFF}zero\u{200C}width\u{200D}chars";
    $samples['combining'] = "e\u{0301}a\u{0300}i\u{0302}o\u{0303}u\u{0308}";

    // Typographic quotation marks, dashes and ellipsis
    $samples['punctuation'] = "«quotes» \u{201C}smart\u{201D} \u{2018}quotes\u{2019} — – … ‚ „";

    return $samples;
}
|
||
|
||
/**
 * Encoding labels exercised by the extended compatibility test.
 *
 * The labels are passed to createTestFile(); 'UTF-8-BOM' is a label that helper
 * handles itself rather than a name known to mbstring.
 *
 * @return list<string>
 */
private function getExtendedEncodings(): array
{
    $unicodeVariants = ['UTF-8', 'UTF-8-BOM', 'UTF-16BE', 'UTF-16LE', 'UTF-32BE', 'UTF-32LE'];

    $isoLatin = [
        'ISO-8859-1',  // Western European
        'ISO-8859-2',  // Central European
        'ISO-8859-5',  // Cyrillic
        'ISO-8859-7',  // Greek
        'ISO-8859-9',  // Turkish
        'ISO-8859-15', // Western European (with Euro)
    ];

    $windowsCodePages = [
        'Windows-1251', // Cyrillic
        'Windows-1252', // Western European
        'CP1252',       // Windows Western
    ];

    // All three groups are plain lists, so merging keeps a 0..n-1 list in the same order.
    return array_merge($unicodeVariants, $isoLatin, $windowsCodePages);
}
|
||
|
||
/**
 * Write $content (supplied as UTF-8) to a new temporary file in the requested encoding.
 *
 * 'UTF-8' is written verbatim. 'UTF-8-BOM' and the UTF-16/UTF-32 variants are written
 * with their byte-order mark prepended. Any other label is handed to mbstring, falling
 * back to the unchanged UTF-8 bytes when mbstring cannot produce that encoding.
 *
 * The caller owns the returned file and is responsible for deleting it.
 *
 * @param string $content  UTF-8 text to store.
 * @param string $encoding Target encoding label.
 *
 * @return string Path of the created temporary file.
 *
 * @throws \RuntimeException When the temporary file cannot be created or written.
 */
private function createTestFile(string $content, string $encoding): string
{
    // Encode first so that a failure here cannot leave an orphaned temporary file behind.
    $bytes = match ($encoding) {
        'UTF-8' => $content,
        'UTF-8-BOM' => "\xEF\xBB\xBF" . $content,
        'UTF-16BE' => "\xFE\xFF" . mb_convert_encoding($content, 'UTF-16BE', 'UTF-8'),
        'UTF-16LE' => "\xFF\xFE" . mb_convert_encoding($content, 'UTF-16LE', 'UTF-8'),
        'UTF-32BE' => "\x00\x00\xFE\xFF" . mb_convert_encoding($content, 'UTF-32BE', 'UTF-8'),
        'UTF-32LE' => "\xFF\xFE\x00\x00" . mb_convert_encoding($content, 'UTF-32LE', 'UTF-8'),
        default => $this->encodeWithMbstringOrKeepUtf8($content, $encoding),
    };

    $tempFile = tempnam(sys_get_temp_dir(), 'unicode_test_');
    if ($tempFile === false) {
        throw new \RuntimeException('Unable to create a temporary file for the encoding fixture');
    }

    if (file_put_contents($tempFile, $bytes) === false) {
        unlink($tempFile);

        throw new \RuntimeException("Unable to write the encoding fixture to {$tempFile}");
    }

    return $tempFile;
}

/**
 * Convert UTF-8 $content to $encoding via mbstring, or return it unchanged when that is not possible.
 *
 * The lookup against mb_list_encodings() is exact and case-sensitive, so a label that is
 * spelled differently from what mbstring reports (different letter case, or an alias) is
 * treated as unsupported and the UTF-8 bytes are returned as-is.
 */
private function encodeWithMbstringOrKeepUtf8(string $content, string $encoding): string
{
    if (!in_array($encoding, mb_list_encodings(), true)) {
        return $content;
    }

    try {
        $encoded = mb_convert_encoding($content, $encoding, 'UTF-8');
    } catch (\Exception | \ValueError $e) {
        // Fully qualified on purpose: this file lives in the Tests\Unit namespace, where the
        // unqualified names would refer to Tests\Unit\Exception / Tests\Unit\ValueError and
        // the catch would never match the global classes.
        return $content;
    }

    return is_string($encoded) ? $encoded : $content;
}
|
||
|
||
/**
 * Test 1: Unicode content preservation across different UTF encodings.
 *
 * Every sample from getUnicodeTestData() is written in each UTF variant, read back
 * through the controller, and must come back byte-for-byte equal to the UTF-8 input.
 */
public function testUnicodeContentPreservation(): void
{
    $unicodeEncodings = ['UTF-8', 'UTF-8-BOM', 'UTF-16BE', 'UTF-16LE', 'UTF-32BE', 'UTF-32LE'];

    foreach ($this->getUnicodeTestData() as $name => $content) {
        foreach ($unicodeEncodings as $encoding) {
            $tempFile = $this->createTestFile($content, $encoding);

            try {
                $result = $this->readFileMethod->invoke($this->controller, $tempFile);

                // assertSame: the comparison must be exact, with no loose string handling.
                $this->assertSame(
                    $content,
                    $result,
                    "Unicode preservation failed for {$name} with {$encoding} encoding"
                );

                $this->assertTrue(
                    $this->isValidConversionMethod->invoke($this->controller, $result),
                    "Validation failed for {$name} with {$encoding} encoding"
                );
            } finally {
                // Runs even when an assertion above throws, so a failing case leaves no fixture behind.
                if (is_file($tempFile)) {
                    unlink($tempFile);
                }
            }
        }
    }
}
|
||
|
||
/**
 * Test 2: BOM handling for different UTF variants.
 *
 * For each variant a fixture that starts with the corresponding byte-order mark is
 * read through the controller and must yield the original text. The UTF-8 BOM
 * stripping helper is additionally called directly on an in-memory string.
 */
public function testBOMHandlingForAllUTF(): void
{
    $testContent = "Hello 世界! Тест العالم";

    // Keys are the labels createTestFile() understands. 'UTF-8-BOM' is the label that
    // prepends the UTF-8 BOM; plain 'UTF-8' writes the content without one and would
    // leave the UTF-8 BOM file path untested.
    $bomTests = [
        'UTF-8-BOM' => "\xEF\xBB\xBF",
        'UTF-16BE' => "\xFE\xFF",
        'UTF-16LE' => "\xFF\xFE",
        'UTF-32BE' => "\x00\x00\xFE\xFF",
        'UTF-32LE' => "\xFF\xFE\x00\x00",
    ];

    foreach ($bomTests as $encoding => $bom) {
        $tempFile = $this->createTestFile($testContent, $encoding);

        try {
            // Check the fixture first, so the read-back assertion really covers a BOM-prefixed file.
            $this->assertStringStartsWith(
                $bom,
                (string) file_get_contents($tempFile),
                "Fixture for {$encoding} should start with its BOM"
            );

            $fileResult = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertSame(
                $testContent,
                $fileResult,
                "File processing with BOM failed for {$encoding}"
            );

            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $fileResult),
                "BOM file validation failed for {$encoding}"
            );
        } finally {
            // Runs even when an assertion above throws.
            if (is_file($tempFile)) {
                unlink($tempFile);
            }
        }
    }

    // Test UTF-8 BOM removal specifically (since that's what the method is designed for)
    $utf8DataWithBOM = "\xEF\xBB\xBF" . $testContent;
    $result = $this->removeBOMMethod->invoke($this->controller, $utf8DataWithBOM);

    $this->assertSame(
        $testContent,
        $result,
        "UTF-8 BOM removal failed"
    );
}
|
||
|
||
/**
 * Test 3: Extended encoding compatibility.
 *
 * Writes a short fixture in every label from getExtendedEncodings() and checks that
 * the controller returns valid UTF-8 with no U+FFFD replacement characters.
 */
public function testExtendedEncodingCompatibility(): void
{
    // Both samples are plain ASCII so that every listed encoding can represent them.
    $basicContent = "Company data with special chars";
    $accentContent = "Cafe resume naive facade"; // Without actual accents for broader compatibility

    foreach ($this->getExtendedEncodings() as $encoding) {
        // No encoding is skipped: the helper only selects which of the two ASCII samples is written.
        $content = $this->isAsciiCompatibleEncoding($encoding) ? $basicContent : $accentContent;

        $tempFile = $this->createTestFile($content, $encoding);

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            // Result should always be valid UTF-8
            $this->assertTrue(
                mb_check_encoding($result, 'UTF-8'),
                "Result should be valid UTF-8 for encoding: {$encoding}"
            );

            // Should not contain replacement characters. The code point is written as an
            // escape so the needle cannot be corrupted by the source file's own encoding.
            $this->assertFalse(
                str_contains($result, "\u{FFFD}"),
                "Result should not contain replacement characters for encoding: {$encoding}"
            );

            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Validation failed for encoding: {$encoding}"
            );
        } finally {
            // Runs even when an assertion above throws.
            if (is_file($tempFile)) {
                unlink($tempFile);
            }
        }
    }
}
|
||
|
||
/**
 * Test 4: Right-to-left (RTL) text handling.
 *
 * Arabic, Hebrew and mixed-direction samples are written as UTF-8 and must be
 * returned unchanged by the controller.
 */
public function testRightToLeftTextHandling(): void
{
    $rtlContent = [
        'arabic' => "مرحبا بالعالم! شركة البيانات",
        'hebrew' => "שלום עולם! חברת הנתונים",
        'mixed_rtl' => "Hello مرحبا World עולם!",
    ];

    foreach ($rtlContent as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertSame($content, $result, "RTL test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "RTL validation failed for: {$name}"
            );
        } finally {
            // Runs even when an assertion above throws.
            if (is_file($tempFile)) {
                unlink($tempFile);
            }
        }
    }
}
|
||
|
||
/**
 * Test 5: Asian character sets (CJK).
 *
 * Chinese, Japanese and Korean samples are written as UTF-8 and must be returned
 * unchanged by the controller.
 */
public function testAsianCharacterSets(): void
{
    $cjkContent = [
        'chinese_simplified' => "公司数据处理系统",
        'chinese_traditional' => "公司資料處理系統",
        'japanese_hiragana' => "かいしゃのでーたしすてむ",
        'japanese_katakana' => "カイシャノデータシステム",
        'japanese_kanji' => "会社のデータシステム",
        'korean' => "회사 데이터 시스템",
        'mixed_cjk' => "Company 公司 会社 회사 Data",
    ];

    foreach ($cjkContent as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertSame($content, $result, "CJK test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "CJK validation failed for: {$name}"
            );
        } finally {
            // Runs even when an assertion above throws.
            if (is_file($tempFile)) {
                unlink($tempFile);
            }
        }
    }
}
|
||
|
||
/**
 * Test 6: Emoji and symbol handling.
 *
 * Emoji, mathematical, currency, technical and arrow symbols are written as UTF-8
 * and must be returned unchanged by the controller.
 */
public function testEmojiAndSymbolHandling(): void
{
    $symbolContent = [
        'basic_emoji' => "Data 📊 Reports 📈 Analysis 🔍",
        'complex_emoji' => "👨💻👩💼🏢💼📋📊📈📉",
        'mathematical' => "∑(x²) ∫f(x)dx ∞ ≠ ≤ ≥ ± √",
        'currency_symbols' => "Price: €100 £80 ¥1000 $75",
        'technical_symbols' => "® © ™ § ¶ † ‡ • ‰ ‱",
        'arrows_symbols' => "← → ↑ ↓ ↔ ↕ ⇐ ⇒ ⇔",
    ];

    foreach ($symbolContent as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertSame($content, $result, "Symbol test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Symbol validation failed for: {$name}"
            );
        } finally {
            // Runs even when an assertion above throws.
            if (is_file($tempFile)) {
                unlink($tempFile);
            }
        }
    }
}
|
||
|
||
/**
 * Test 7: Combining characters and normalization.
 *
 * Precomposed and decomposed spellings of the same words are written as UTF-8.
 * Only validity is asserted, not equality, so the controller is free to normalize.
 */
public function testCombiningCharacters(): void
{
    $combiningContent = [
        'accents_composed' => "café résumé naïve",
        'accents_decomposed' => "cafe\u{0301} re\u{0301}sume\u{0301} nai\u{0308}ve",
        'mixed_normalization' => "café cafe\u{0301} résumé re\u{0301}sume\u{0301}",
    ];

    foreach ($combiningContent as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            // Content should be preserved (normalization might occur but content should be valid)
            $this->assertTrue(
                mb_check_encoding($result, 'UTF-8'),
                "Combining character result should be valid UTF-8 for: {$name}"
            );
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Combining character validation failed for: {$name}"
            );
        } finally {
            // Runs even when an assertion above throws.
            if (is_file($tempFile)) {
                unlink($tempFile);
            }
        }
    }
}
|
||
|
||
/**
 * Test 8: Large Unicode content performance.
 *
 * Reads a fixture made of 1000 repetitions of a multi-script pattern and requires
 * the read to finish in under two seconds of wall-clock time with the content intact.
 */
public function testLargeUnicodeContentPerformance(): void
{
    $unicodePattern = "🌍 Hello 世界 مرحبا Здравствуй שלום こんにちは 안녕하세요 ";
    $largeContent = str_repeat($unicodePattern, 1000); // 1000 repetitions of a ~90-byte pattern

    $tempFile = $this->createTestFile($largeContent, 'UTF-8');

    try {
        // Only the controller call is timed; fixture creation and assertions are excluded.
        $startTime = microtime(true);
        $result = $this->readFileMethod->invoke($this->controller, $tempFile);
        $processingTime = microtime(true) - $startTime;

        $this->assertLessThan(2.0, $processingTime, "Large Unicode content processing should be fast");
        $this->assertSame($largeContent, $result, "Large Unicode content should be preserved");
        $this->assertTrue(
            $this->isValidConversionMethod->invoke($this->controller, $result),
            "Large Unicode content validation failed"
        );
    } finally {
        // Runs even when an assertion above throws.
        if (is_file($tempFile)) {
            unlink($tempFile);
        }
    }
}
|
||
|
||
/**
 * Test 9: Mixed encoding scenarios.
 *
 * Realistic strings (mostly ASCII with a few non-ASCII characters) are written under
 * several encoding labels; the controller's output must be valid UTF-8 in every case.
 */
public function testMixedEncodingScenarios(): void
{
    // Simulate files that might have mixed encoding issues
    $scenarios = [
        'mostly_ascii_with_unicode' => "Regular text with émojis 😀 and symbols ™",
        'csv_with_international' => "Name,Company,Location\n\"José García\",\"Café España\",\"São Paulo\"",
        'business_names' => "McDonald's, L'Oréal, Nestlé, Björk & Co, Müller GmbH",
    ];

    // NOTE(review): createTestFile() looks labels up case-sensitively in mb_list_encodings().
    // 'WINDOWS-1252' appears not to match the spelling mbstring reports, in which case that
    // fixture is written as plain UTF-8 — confirm whether 'Windows-1252' was intended.
    $encodings = ['UTF-8', 'UTF-8-BOM', 'WINDOWS-1252', 'ISO-8859-1'];

    foreach ($scenarios as $name => $content) {
        foreach ($encodings as $encoding) {
            $tempFile = $this->createTestFile($content, $encoding);

            try {
                $result = $this->readFileMethod->invoke($this->controller, $tempFile);

                $this->assertTrue(
                    mb_check_encoding($result, 'UTF-8'),
                    "Mixed encoding result should be valid UTF-8 for {$name} with {$encoding}"
                );
                $this->assertTrue(
                    $this->isValidConversionMethod->invoke($this->controller, $result),
                    "Mixed encoding validation failed for {$name} with {$encoding}"
                );
            } finally {
                // Runs even when an assertion above throws.
                if (is_file($tempFile)) {
                    unlink($tempFile);
                }
            }
        }
    }
}
|
||
|
||
/**
 * Report whether the label is exactly 'ASCII' or 'US-ASCII'.
 *
 * Despite the name, this checks for the ASCII-only labels; any other label,
 * including ASCII-superset encodings such as UTF-8, yields false.
 */
private function isAsciiCompatibleEncoding(string $encoding): bool
{
    return $encoding === 'ASCII' || $encoding === 'US-ASCII';
}
|
||
|
||
/**
 * Test 10: CSV data with international content.
 *
 * A small CSV with Spanish, Chinese, German, Russian and Arabic fields is written under
 * several encoding labels; the output must be valid UTF-8 and still contain the names.
 */
public function testCSVWithInternationalContent(): void
{
    $csvContent = "Name,Company,City,Country,Notes\n" .
        "\"José García\",\"Café España\",\"São Paulo\",\"Brasil\",\"Açaí supplier\"\n" .
        "\"李小明\",\"北京科技公司\",\"北京\",\"中国\",\"Technology partner\"\n" .
        "\"Müller\",\"Bäckerei München\",\"München\",\"Deutschland\",\"Café & Bäckerei\"\n" .
        "\"Иванов\",\"Москва ООО\",\"Москва\",\"Россия\",\"Software development\"\n" .
        "\"محمد أحمد\",\"شركة الرياض\",\"الرياض\",\"السعودية\",\"Trading company\"";

    // NOTE(review): createTestFile() looks labels up case-sensitively in mb_list_encodings().
    // 'WINDOWS-1252' appears not to match the spelling mbstring reports, so that fixture is
    // presumably written as UTF-8. A real Windows-1252 file could not hold the Chinese name
    // asserted below — confirm the intent before changing the label.
    $encodings = ['UTF-8', 'UTF-8-BOM', 'WINDOWS-1252'];

    foreach ($encodings as $encoding) {
        $tempFile = $this->createTestFile($csvContent, $encoding);

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertTrue(
                mb_check_encoding($result, 'UTF-8'),
                "CSV result should be valid UTF-8 for encoding: {$encoding}"
            );

            // Check that it contains expected international content
            $this->assertStringContainsString("José García", $result, "Should contain Spanish names");
            $this->assertStringContainsString("李小明", $result, "Should contain Chinese names");
            $this->assertStringContainsString("Müller", $result, "Should contain German names");
        } finally {
            // Runs even when an assertion above throws.
            if (is_file($tempFile)) {
                unlink($tempFile);
            }
        }
    }
}
|
||
}
|