From daf4391a30bef1f59eb8459862765d75f3eba4c2 Mon Sep 17 00:00:00 2001 From: David Bomba Date: Wed, 4 Jun 2025 10:22:12 +1000 Subject: [PATCH] Improve csv file encoding support for imports --- app/Http/Controllers/BaseController.php | 6 +- app/Http/Controllers/ClientController.php | 12 + app/Http/Controllers/ImportController.php | 273 ++++++++-- app/Http/Controllers/ImportJsonController.php | 144 ++++- app/Import/Providers/BaseImport.php | 2 +- app/Jobs/Invoice/ZipInvoices.php | 2 +- tests/Unit/ImportEncodingTest.php | 480 ++++++++++++++++ tests/Unit/ImportUnicodeEncodingTest.php | 511 ++++++++++++++++++ 8 files changed, 1375 insertions(+), 55 deletions(-) create mode 100644 tests/Unit/ImportEncodingTest.php create mode 100644 tests/Unit/ImportUnicodeEncodingTest.php diff --git a/app/Http/Controllers/BaseController.php b/app/Http/Controllers/BaseController.php index 392c509618..ca6b38de7a 100644 --- a/app/Http/Controllers/BaseController.php +++ b/app/Http/Controllers/BaseController.php @@ -88,9 +88,9 @@ class BaseController extends Controller /* Grouped permissions when we want to hide columns for particular permission groups*/ - private array $client_exclusion_fields = ['balance', 'paid_to_date', 'credit_balance', 'client_hash']; - private array $client_excludable_permissions = ['view_client']; - private array $client_excludable_overrides = ['edit_client', 'edit_all', 'view_invoice', 'view_all', 'edit_invoice']; + protected array $client_exclusion_fields = ['balance', 'paid_to_date', 'credit_balance', 'client_hash']; + protected array $client_excludable_permissions = ['view_client']; + protected array $client_excludable_overrides = ['edit_client', 'edit_all', 'view_invoice', 'view_all', 'edit_invoice']; /* Grouped permissions when we want to hide columns for particular permission groups*/ diff --git a/app/Http/Controllers/ClientController.php b/app/Http/Controllers/ClientController.php index 51d58c8e3b..c7e3d33b0b 100644 --- 
a/app/Http/Controllers/ClientController.php +++ b/app/Http/Controllers/ClientController.php @@ -112,6 +112,12 @@ class ClientController extends BaseController */ public function show(ShowClientRequest $request, Client $client) { + nlog("show"); + if(auth()->user()->hasExcludedPermissions($this->client_excludable_permissions, $this->client_excludable_overrides)){ + nlog('hiding fields'); + $client->makeHidden($this->client_exclusion_fields); + } + return $this->itemResponse($client); } @@ -125,6 +131,12 @@ class ClientController extends BaseController */ public function edit(EditClientRequest $request, Client $client) { + nlog("Edit"); + if (auth()->user()->hasExcludedPermissions($this->client_excludable_permissions, $this->client_excludable_overrides)) { + nlog('hiding fields'); + $client->makeHidden($this->client_exclusion_fields); + } + return $this->itemResponse($client); } diff --git a/app/Http/Controllers/ImportController.php b/app/Http/Controllers/ImportController.php index eb7d834df7..67a4eae04b 100644 --- a/app/Http/Controllers/ImportController.php +++ b/app/Http/Controllers/ImportController.php @@ -83,8 +83,8 @@ class ImportController extends Controller ]; /** @var UploadedFile $file */ foreach ($request->files->get('files') as $entityType => $file) { - $contents = file_get_contents($file->getPathname()); - // Store the csv in cache with an expiry of 10 minutes + $contents = $this->readFileWithProperEncoding($file->getPathname()); + Cache::put($hash.'-'.$entityType, base64_encode($contents), 1200); // Parse CSV @@ -104,6 +104,224 @@ class ImportController extends Controller return response()->json($data); } + private function readFileWithProperEncoding(string $filePath): string + { + // First, read the file and check if it's already clean UTF-8 + $contents = @file_get_contents($filePath); + if ($contents === false) { + return ''; + } + + // Check for different UTF BOMs and handle accordingly + $bomResult = $this->detectAndHandleUTFEncoding($contents); + if 
($bomResult !== null) { + return $bomResult; + } + + // Remove BOM if present (for UTF-8 BOM) + $contents = $this->removeBOM($contents); + + // Check if it's clean UTF-8 first (no conversion needed) + if (mb_check_encoding($contents, 'UTF-8') && $this->isValidConversion($contents)) { + return $contents; + } + + // Method 1: Try reading with explicit Windows-1252 context + $context = stream_context_create([ + 'file' => [ + 'encoding' => 'WINDOWS-1252' + ] + ]); + + $contextContents = @file_get_contents($filePath, false, $context); + if ($contextContents !== false) { + $contextContents = $this->removeBOM($contextContents); + $converted = mb_convert_encoding($contextContents, 'UTF-8', 'WINDOWS-1252'); + if ($this->isValidConversion($converted)) { + return $converted; + } + } + + // Method 2: Binary read with forced Windows-1252 conversion + $handle = @fopen($filePath, 'rb'); + if ($handle) { + $binaryContents = fread($handle, filesize($filePath)); + fclose($handle); + + $binaryContents = $this->removeBOM($binaryContents); + + // Check if this looks like Windows-1252 by looking for problem bytes + if ($this->containsWindows1252Bytes($binaryContents)) { + $converted = mb_convert_encoding($binaryContents, 'UTF-8', 'WINDOWS-1252'); + if ($this->isValidConversion($converted)) { + return $converted; + } + } + } + + // Method 3: Fix corrupted UTF-8 replacement characters + if ($contents !== false) { + $fixed = $this->fixCorruptedWindows1252($contents); + if ($this->isValidConversion($fixed)) { + return $fixed; + } + } + + // Method 4: Try different encoding auto-detection with broader list + if ($contents !== false) { + $encodings = ['WINDOWS-1252', 'ISO-8859-1', 'ISO-8859-15', 'CP1252']; + foreach ($encodings as $encoding) { + $converted = mb_convert_encoding($contents, 'UTF-8', $encoding); + if ($this->isValidConversion($converted)) { + return $converted; + } + } + } + + // Fallback: return original contents + return $contents ?: ''; + } + + /** + * Detect and handle 
UTF-16 and UTF-32 encodings based on BOM + */ + private function detectAndHandleUTFEncoding(string $data): ?string + { + // UTF-32 BE BOM + if (substr($data, 0, 4) === "\x00\x00\xFE\xFF") { + $withoutBOM = substr($data, 4); + return mb_convert_encoding($withoutBOM, 'UTF-8', 'UTF-32BE'); + } + + // UTF-32 LE BOM + if (substr($data, 0, 4) === "\xFF\xFE\x00\x00") { + $withoutBOM = substr($data, 4); + return mb_convert_encoding($withoutBOM, 'UTF-8', 'UTF-32LE'); + } + + // UTF-16 BE BOM + if (substr($data, 0, 2) === "\xFE\xFF") { + $withoutBOM = substr($data, 2); + return mb_convert_encoding($withoutBOM, 'UTF-8', 'UTF-16BE'); + } + + // UTF-16 LE BOM + if (substr($data, 0, 2) === "\xFF\xFE") { + $withoutBOM = substr($data, 2); + return mb_convert_encoding($withoutBOM, 'UTF-8', 'UTF-16LE'); + } + + // Try to detect UTF-16/32 without BOM (heuristic approach) + $length = strlen($data); + + // UTF-32 detection (every 4th byte pattern) + if ($length >= 8 && $length % 4 === 0) { + $nullCount = 0; + for ($i = 0; $i < min(100, $length); $i += 4) { + if ($data[$i] === "\x00" && $data[$i + 1] === "\x00" && $data[$i + 2] === "\x00") { + $nullCount++; + } + } + if ($nullCount > 5) { // Likely UTF-32LE + return mb_convert_encoding($data, 'UTF-8', 'UTF-32LE'); + } + } + + // UTF-16 detection (every 2nd byte pattern) + if ($length >= 4 && $length % 2 === 0) { + $nullCount = 0; + for ($i = 0; $i < min(100, $length); $i += 2) { + if ($data[$i + 1] === "\x00") { + $nullCount++; + } + } + if ($nullCount > 10) { // Likely UTF-16LE + return mb_convert_encoding($data, 'UTF-8', 'UTF-16LE'); + } + + // Check for UTF-16BE + $nullCount = 0; + for ($i = 0; $i < min(100, $length); $i += 2) { + if ($data[$i] === "\x00") { + $nullCount++; + } + } + if ($nullCount > 10) { // Likely UTF-16BE + return mb_convert_encoding($data, 'UTF-8', 'UTF-16BE'); + } + } + + return null; + } + + /** + * Remove BOM (Byte Order Mark) from the beginning of a string + */ + private function removeBOM(string $data): 
string + { + // UTF-8 BOM + if (substr($data, 0, 3) === "\xEF\xBB\xBF") { + return substr($data, 3); + } + + // UTF-16 BE BOM + if (substr($data, 0, 2) === "\xFE\xFF") { + return substr($data, 2); + } + + // UTF-16 LE BOM + if (substr($data, 0, 2) === "\xFF\xFE") { + return substr($data, 2); + } + + // UTF-32 BE BOM + if (substr($data, 0, 4) === "\x00\x00\xFE\xFF") { + return substr($data, 4); + } + + // UTF-32 LE BOM + if (substr($data, 0, 4) === "\xFF\xFE\x00\x00") { + return substr($data, 4); + } + + return $data; + } + + private function containsWindows1252Bytes(string $data): bool + { + // Check for Windows-1252 specific bytes in 0x80-0x9F range + $windows1252Bytes = [0x80, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8E, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9E, 0x9F]; + + foreach ($windows1252Bytes as $byte) { + if (strpos($data, chr($byte)) !== false) { + return true; + } + } + return false; + } + + private function fixCorruptedWindows1252(string $data): string + { + // Map of UTF-8 replacement sequences back to proper characters + $replacements = [ + "\xEF\xBF\xBD" => "\u{2019}", // Most common: right single quote (0x92) - use smart quote + // Add more mappings as needed based on your data + ]; + + return str_replace(array_keys($replacements), array_values($replacements), $data); + } + + private function isValidConversion(string $data): bool + { + // Check if conversion was successful: + // 1. Must be valid UTF-8 + // 2. Must NOT contain replacement characters (indicating corruption) + // 3. 
Additional check for double-encoded replacement + return mb_check_encoding($data, 'UTF-8') && + !str_contains($data, "\xEF\xBF\xBD") && // UTF-8 replacement character bytes + !str_contains($data, '�'); // Double-encoded replacement character + } + private function setImportHints($entity_type, $available_keys, $headers): array { $hints = []; @@ -203,7 +421,9 @@ class ImportController extends Controller /** @var UploadedFile $file */ foreach ($request->files->get('files') as $entityType => $file) { - $contents = file_get_contents($file->getPathname()); + // $contents = file_get_contents($file->getPathname()); + $contents = $this->readFileWithProperEncoding($file->getPathname()); + // Store the csv in cache with an expiry of 10 minutes Cache::put($hash.'-'.$entityType, base64_encode($contents), 600); nlog($hash.'-'.$entityType); @@ -248,54 +468,9 @@ class ImportController extends Controller } } - return $this->convertData($data); + return $data; // Remove the convertData call since we fixed encoding upfront } - - - private function convertData(array $data): array - { - - // List of encodings to check against - $encodings = [ - 'UTF-8', - 'ISO-8859-1', // Latin-1 - 'ISO-8859-2', // Latin-2 - 'WINDOWS-1252', // CP1252 - 'SHIFT-JIS', - 'EUC-JP', - 'GB2312', - 'GBK', - 'BIG5', - 'ISO-2022-JP', - 'KOI8-R', - 'KOI8-U', - 'WINDOWS-1251', // CP1251 - 'UTF-16', - 'UTF-32', - 'ASCII', - 'WINDOWS-1254', // Turkish, which sometimes includes Georgian - 'WINDOWS-1256', // Arabic, which sometimes includes Georgian - 'ISO-8859-10', - ]; - - foreach ($data as $key => $value) { - // Only process strings - if (is_string($value)) { - // Detect the encoding of the string - $detectedEncoding = mb_detect_encoding($value, $encodings, true); - - // If encoding is detected and it's not UTF-8, convert it to UTF-8 - if ($detectedEncoding && $detectedEncoding !== 'UTF-8') { - $array[$key] = mb_convert_encoding($value, 'UTF-8', $detectedEncoding); - } - } - } - - return $data; - } - - /** * 
Returns the best delimiter * diff --git a/app/Http/Controllers/ImportJsonController.php b/app/Http/Controllers/ImportJsonController.php index a42ee2efea..d7239b4045 100644 --- a/app/Http/Controllers/ImportJsonController.php +++ b/app/Http/Controllers/ImportJsonController.php @@ -107,7 +107,7 @@ class ImportJsonController extends BaseController return response()->json(array_merge(['message' => 'Processing','success' => true], $metadata), 200); } - private function handleChunkedUpload(ImportJsonRequest $request) + private function handleChunkedUploadX(ImportJsonRequest $request) { $metadata = json_decode($request->metadata, true); @@ -251,4 +251,146 @@ class ImportJsonController extends BaseController rmdir($dir); } + + private function handleChunkedUpload(ImportJsonRequest $request) + { + $metadata = json_decode($request->metadata, true); + + // Validate metadata structure + if (!isset($metadata['fileHash'], $metadata['fileName'], $metadata['totalChunks'], $metadata['currentChunk'])) { + throw new \InvalidArgumentException('Invalid metadata structure'); + } + + // Sanitize and validate file hash (should be alphanumeric) + if (!preg_match('/^[a-zA-Z0-9]+$/', $metadata['fileHash'])) { + throw new \InvalidArgumentException('Invalid file hash format'); + } + + // Sanitize and validate filename + $safeFileName = basename($metadata['fileName']); + if ($safeFileName !== $metadata['fileName']) { + throw new \InvalidArgumentException('Invalid filename'); + } + + // Validate chunk number format + if (!is_numeric($metadata['currentChunk']) || $metadata['currentChunk'] < 0) { + throw new \InvalidArgumentException('Invalid chunk number'); + } + + // Validate total chunks + if (!is_numeric($metadata['totalChunks']) || $metadata['totalChunks'] <= 0 || $metadata['totalChunks'] > 1000) { + throw new \InvalidArgumentException('Invalid total chunks'); + } + + // Validate file type + $chunk = $request->file('file'); + if (!$chunk || !$chunk->isValid()) { + throw new 
\InvalidArgumentException('Invalid file chunk'); + } + + // Validate file size before saving + $maxChunkSize = 5 * 1024 * 1024; // 5MB + if ($chunk->getSize() > $maxChunkSize) { + throw new \InvalidArgumentException('Chunk size exceeds limit'); + } + + $disk = Ninja::isHosted() ? 'backup' : config('filesystems.default'); + + // Store chunk in S3 with unique path + $chunkKey = "tmp/chunks/{$metadata['fileHash']}/chunk-{$metadata['currentChunk']}"; + + Storage::disk($disk)->put( + $chunkKey, + file_get_contents($chunk->getRealPath()), + ['visibility' => 'private'] + ); + + // Check if all chunks are uploaded by listing S3 objects + $chunkPrefix = "tmp/chunks/{$metadata['fileHash']}/"; + $uploadedChunks = collect(Storage::disk($disk)->files($chunkPrefix)) + ->filter(function($file) { + return str_contains(basename($file), 'chunk-'); + }) + ->count(); + + if ($uploadedChunks >= $metadata['totalChunks']) { + try { + // Combine chunks from S3 + $finalPath = "migrations/{$safeFileName}"; + $this->combineChunksFromS3($disk, $metadata['fileHash'], $metadata['totalChunks'], $finalPath); + + // Clean up + $this->cleanupS3Chunks($disk, $metadata['fileHash']); + + $metadata['uploaded_filepath'] = $finalPath; + return $metadata; + + } catch (\Exception $e) { + // Clean up on error + $this->cleanupS3Chunks($disk, $metadata['fileHash']); + throw $e; + } + } + + return $metadata; + } + + private function combineChunksFromS3(string $disk, string $fileHash, int $totalChunks, string $finalPath): void + { + // Create a temporary local file to combine chunks + $tempFile = tempnam(sys_get_temp_dir(), 'chunk_combine_'); + + try { + $handle = fopen($tempFile, 'wb'); + if ($handle === false) { + throw new \RuntimeException('Failed to create temporary file'); + } + + // Download and combine chunks in order + for ($i = 0; $i < $totalChunks; $i++) { + $chunkKey = "tmp/chunks/{$fileHash}/chunk-{$i}"; + + if (!Storage::disk($disk)->exists($chunkKey)) { + throw new \RuntimeException("Missing 
chunk: {$i}"); + } + + $chunkContent = Storage::disk($disk)->get($chunkKey); + if ($chunkContent === null) { + throw new \RuntimeException("Failed to read chunk: {$i}"); + } + + if (fwrite($handle, $chunkContent) === false) { + throw new \RuntimeException("Failed to write chunk: {$i}"); + } + } + + fclose($handle); + + // Upload combined file to final location + Storage::disk($disk)->put( + $finalPath, + file_get_contents($tempFile), + ['visibility' => 'private'] + ); + + } finally { + // Clean up temporary file + if (file_exists($tempFile)) { + unlink($tempFile); + } + } + } + + private function cleanupS3Chunks(string $disk, string $fileHash): void + { + $chunkPrefix = "tmp/chunks/{$fileHash}/"; + + // Get all chunk files for this upload + $chunkFiles = Storage::disk($disk)->files($chunkPrefix); + + // Delete all chunk files + if (!empty($chunkFiles)) { + Storage::disk($disk)->delete($chunkFiles); + } + } } diff --git a/app/Import/Providers/BaseImport.php b/app/Import/Providers/BaseImport.php index f6e34cc82b..d9ab941b37 100644 --- a/app/Import/Providers/BaseImport.php +++ b/app/Import/Providers/BaseImport.php @@ -108,7 +108,7 @@ class BaseImport nlog("found {$entity_type}"); $csv = base64_decode($base64_encoded_csv); - $csv = mb_convert_encoding($csv, 'UTF-8', 'UTF-8'); + // $csv = mb_convert_encoding($csv, 'UTF-8', 'UTF-8'); $csv = Reader::createFromString($csv); $csvdelimiter = self::detectDelimiter($csv); diff --git a/app/Jobs/Invoice/ZipInvoices.php b/app/Jobs/Invoice/ZipInvoices.php index b2e4afb7eb..96323a6d23 100644 --- a/app/Jobs/Invoice/ZipInvoices.php +++ b/app/Jobs/Invoice/ZipInvoices.php @@ -38,7 +38,7 @@ class ZipInvoices implements ShouldQueue public $tries = 1; - public $timeout = 3600; + public $timeout = 10800; /** * @param $invoices diff --git a/tests/Unit/ImportEncodingTest.php b/tests/Unit/ImportEncodingTest.php new file mode 100644 index 0000000000..cfa8721107 --- /dev/null +++ b/tests/Unit/ImportEncodingTest.php @@ -0,0 +1,480 @@ +controller 
= new ImportController(); + + // Use reflection to access private methods + $reflection = new ReflectionClass($this->controller); + $this->readFileMethod = $reflection->getMethod('readFileWithProperEncoding'); + $this->readFileMethod->setAccessible(true); + + $this->containsWindows1252Method = $reflection->getMethod('containsWindows1252Bytes'); + $this->containsWindows1252Method->setAccessible(true); + + $this->fixCorruptedMethod = $reflection->getMethod('fixCorruptedWindows1252'); + $this->fixCorruptedMethod->setAccessible(true); + + $this->isValidConversionMethod = $reflection->getMethod('isValidConversion'); + $this->isValidConversionMethod->setAccessible(true); + } + + /** + * Test data for various encoding scenarios + */ + private function getTestData(): array + { + return [ + // Test string with common problematic characters + 'basic' => "Company's text with quotes", + 'apostrophes' => "Sya's Ian Le Led", + 'quotes' => '"Smart quotes" and \'single quotes\'', + 'currency' => "Price: 50.00, 25.99", // Simplified to avoid currency symbols in basic test + 'symbols' => "Trademark and copyright symbols", + 'accents' => "Cafe resume naive facade", // Simplified accents + ]; + } + + /** + * Get complex test data with full Unicode characters (for specific encoding tests) + */ + private function getComplexTestData(): array + { + return [ + 'complex' => "Company's «quoted» text—dash…ellipsis", + 'currency' => "Price: €50.00, £25.99", + 'symbols' => "Trademark™ and copyright© symbols", + 'accents' => "Café résumé naïve piñata façade", + ]; + } + + /** + * Windows-1252 special characters (0x80-0x9F range) + */ + private function getWindows1252SpecialChars(): array + { + return [ + 0x80 => '€', // Euro sign + 0x82 => '‚', // Single low-9 quotation mark + 0x83 => 'ƒ', // Latin small letter f with hook + 0x84 => '„', // Double low-9 quotation mark + 0x85 => '…', // Horizontal ellipsis + 0x86 => '†', // Dagger + 0x87 => '‡', // Double dagger + 0x88 => 'ˆ', // Modifier letter 
circumflex accent + 0x89 => '‰', // Per mille sign + 0x8A => 'Š', // Latin capital letter S with caron + 0x8B => '‹', // Single left-pointing angle quotation mark + 0x8C => 'Œ', // Latin capital ligature OE + 0x8E => 'Ž', // Latin capital letter Z with caron + 0x91 => "\u{2018}", // Left single quotation mark (smart quote) + 0x92 => "\u{2019}", // Right single quotation mark (smart quote) + 0x93 => "\u{201C}", // Left double quotation mark + 0x94 => "\u{201D}", // Right double quotation mark + 0x95 => '•', // Bullet + 0x96 => '–', // En dash + 0x97 => '—', // Em dash + 0x98 => '˜', // Small tilde + 0x99 => '™', // Trade mark sign + 0x9A => 'š', // Latin small letter s with caron + 0x9B => '›', // Single right-pointing angle quotation mark + 0x9C => 'œ', // Latin small ligature oe + 0x9E => 'ž', // Latin small letter z with caron + 0x9F => 'Ÿ', // Latin capital letter Y with diaeresis + ]; + } + + /** + * Create a temporary file with specific encoding + */ + private function createTestFile(string $content, string $encoding): string + { + $tempFile = tempnam(sys_get_temp_dir(), 'encoding_test_'); + + if ($encoding === 'UTF-8-BOM') { + $content = "\xEF\xBB\xBF" . 
$content; + file_put_contents($tempFile, $content); + } elseif ($encoding === 'UTF-8-CORRUPTED') { + // Simulate corrupted UTF-8 with replacement characters + $content = str_replace("'", "\xEF\xBF\xBD", $content); + file_put_contents($tempFile, $content); + } elseif ($encoding === 'UTF-8') { + file_put_contents($tempFile, $content); + } else { + // Convert to target encoding + $encoded = mb_convert_encoding($content, $encoding, 'UTF-8'); + file_put_contents($tempFile, $encoded); + } + + return $tempFile; + } + + /** + * Test 1: UTF-8 clean files (should pass through unchanged) + */ + public function testCleanUtf8Files() + { + foreach ($this->getTestData() as $name => $content) { + $tempFile = $this->createTestFile($content, 'UTF-8'); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + $this->assertEquals($content, $result, "Clean UTF-8 test failed for: {$name}"); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Validation failed for clean UTF-8: {$name}" + ); + + unlink($tempFile); + } + } + + /** + * Test 2: UTF-8 with BOM + */ + public function testUtf8WithBom() + { + foreach ($this->getTestData() as $name => $content) { + $tempFile = $this->createTestFile($content, 'UTF-8-BOM'); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + // Should remove BOM and return clean content + $this->assertEquals($content, $result, "UTF-8 BOM test failed for: {$name}"); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Validation failed for UTF-8 BOM: {$name}" + ); + + unlink($tempFile); + } + } + + /** + * Test 3: Windows-1252 files + */ + public function testWindows1252Files() + { + // Test with complex Unicode characters for Windows-1252 + foreach ($this->getComplexTestData() as $name => $content) { + $tempFile = $this->createTestFile($content, 'WINDOWS-1252'); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); 
+ + $this->assertEquals($content, $result, "Windows-1252 test failed for: {$name}"); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Validation failed for Windows-1252: {$name}" + ); + + unlink($tempFile); + } + } + + /** + * Test 3.5: Complex UTF-8 files with Unicode characters + */ + public function testComplexUtf8Files() + { + foreach ($this->getComplexTestData() as $name => $content) { + $tempFile = $this->createTestFile($content, 'UTF-8'); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + $this->assertEquals($content, $result, "Complex UTF-8 test failed for: {$name}"); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Validation failed for complex UTF-8: {$name}" + ); + + unlink($tempFile); + } + } + + /** + * Test 4: ISO-8859-1 files + */ + public function testIso88591Files() + { + // Use only characters that exist in ISO-8859-1 + $testData = [ + 'basic' => "Company's text", + 'accents' => "Café résumé naïve façade", + ]; + + foreach ($testData as $name => $content) { + $tempFile = $this->createTestFile($content, 'ISO-8859-1'); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + $this->assertEquals($content, $result, "ISO-8859-1 test failed for: {$name}"); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Validation failed for ISO-8859-1: {$name}" + ); + + unlink($tempFile); + } + } + + /** + * Test 5: Corrupted UTF-8 with replacement characters + */ + public function testCorruptedUtf8Files() + { + foreach ($this->getTestData() as $name => $content) { + $tempFile = $this->createTestFile($content, 'UTF-8-CORRUPTED'); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + // Expected result should have smart quotes instead of straight apostrophes + $expectedContent = str_replace("'", "\u{2019}", $content); + $this->assertEquals($expectedContent, 
$result, "Corrupted UTF-8 test failed for: {$name}"); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Validation failed for corrupted UTF-8: {$name}" + ); + + unlink($tempFile); + } + } + + /** + * Test 6: All Windows-1252 special characters + */ + public function testAllWindows1252SpecialCharacters() + { + $specialChars = $this->getWindows1252SpecialChars(); + + foreach ($specialChars as $byte => $expectedChar) { + // Create content with the specific byte + $content = "Test " . chr($byte) . " character"; + $tempFile = tempnam(sys_get_temp_dir(), 'char_test_'); + + // Write raw bytes including the Windows-1252 character + $rawContent = "Test " . chr($byte) . " character"; + file_put_contents($tempFile, $rawContent); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + $expectedResult = "Test {$expectedChar} character"; + $this->assertEquals( + $expectedResult, + $result, + "Windows-1252 character test failed for byte 0x" . dechex($byte) . " ({$expectedChar})" + ); + + unlink($tempFile); + } + } + + /** + * Test 7: containsWindows1252Bytes method + */ + public function testContainsWindows1252Bytes() + { + // Test with Windows-1252 bytes + $dataWithWindows1252 = "Test " . chr(0x92) . 
" content"; + $this->assertTrue( + $this->containsWindows1252Method->invoke($this->controller, $dataWithWindows1252), + "Should detect Windows-1252 bytes" + ); + + // Test without Windows-1252 bytes + $cleanData = "Test clean content"; + $this->assertFalse( + $this->containsWindows1252Method->invoke($this->controller, $cleanData), + "Should not detect Windows-1252 bytes in clean data" + ); + + // Test with UTF-8 replacement characters + $corruptedData = "Test \xEF\xBF\xBD content"; + $this->assertFalse( + $this->containsWindows1252Method->invoke($this->controller, $corruptedData), + "Should not detect Windows-1252 bytes in corrupted UTF-8" + ); + } + + /** + * Test 8: fixCorruptedWindows1252 method + */ + public function testFixCorruptedWindows1252() + { + $corruptedData = "Sya\xEF\xBF\xBDs In Le"; + $expectedResult = "Sya\u{2019}s In Le"; + + $result = $this->fixCorruptedMethod->invoke($this->controller, $corruptedData); + + $this->assertEquals($expectedResult, $result, "Failed to fix corrupted Windows-1252 data"); + } + + /** + * Test 9: isValidConversion method + */ + public function testIsValidConversion() + { + // Valid UTF-8 without replacement characters + $validData = "Clean UTF-8 content with apostrophe's"; + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $validData), + "Should validate clean UTF-8 content" + ); + + // Invalid - contains replacement character bytes + $invalidData1 = "Content with \xEF\xBF\xBD replacement"; + $this->assertFalse( + $this->isValidConversionMethod->invoke($this->controller, $invalidData1), + "Should reject content with UTF-8 replacement bytes" + ); + + // Invalid - contains double-encoded replacement + $invalidData2 = "Content with � replacement"; + $this->assertFalse( + $this->isValidConversionMethod->invoke($this->controller, $invalidData2), + "Should reject content with double-encoded replacement" + ); + + // Invalid UTF-8 + $invalidUtf8 = "Invalid \xFF UTF-8"; + $this->assertFalse( + 
$this->isValidConversionMethod->invoke($this->controller, $invalidUtf8), + "Should reject invalid UTF-8" + ); + } + + /** + * Test 10: Multiple encoding types comprehensive test + */ + public function testMultipleEncodingTypes() + { + $encodings = [ + 'UTF-8', + 'WINDOWS-1252', + 'ISO-8859-1', + 'ISO-8859-15', + 'ASCII', + ]; + + $testContent = "Company's «test» data—with symbols"; + + foreach ($encodings as $encoding) { + if ($encoding === 'ASCII') { + // ASCII can't handle special characters, use simpler content + $content = "Company data test"; + } else { + $content = $testContent; + } + + $tempFile = $this->createTestFile($content, $encoding); + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + // Result should always be valid UTF-8 + $this->assertTrue( + mb_check_encoding($result, 'UTF-8'), + "Result should be valid UTF-8 for encoding: {$encoding}" + ); + + // Should not contain replacement characters + $this->assertFalse( + str_contains($result, '�'), + "Result should not contain replacement characters for encoding: {$encoding}" + ); + + unlink($tempFile); + } + } + + /** + * Test 11: Backward compatibility - existing functionality should not break + */ + public function testBackwardCompatibility() + { + // Test that normal CSV content still works + $csvContent = "Name,Amount,Date\n\"John's Company\",100.50,2024-01-01\n\"Mary's Store\",250.75,2024-01-02"; + + $tempFile = $this->createTestFile($csvContent, 'UTF-8'); + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + $this->assertEquals($csvContent, $result, "Backward compatibility test failed for CSV content"); + + // Test that it contains expected structure + $this->assertStringContainsString("John's Company", $result, "CSV should contain original apostrophes"); + $this->assertStringContainsString("Mary's Store", $result, "CSV should contain original apostrophes"); + + unlink($tempFile); + } + + /** + * Test 12: Edge cases and error handling + */ + public 
function testEdgeCases() + { + // Empty file + $tempFile = tempnam(sys_get_temp_dir(), 'empty_test_'); + file_put_contents($tempFile, ''); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + $this->assertEquals('', $result, "Empty file should return empty string"); + + unlink($tempFile); + + // Non-existent file + $result = $this->readFileMethod->invoke($this->controller, '/non/existent/file.csv'); + $this->assertEquals('', $result, "Non-existent file should return empty string"); + + // Very large content with mixed characters + $largeContent = str_repeat("Test's data with special chars—", 1000); + $tempFile = $this->createTestFile($largeContent, 'WINDOWS-1252'); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Large file conversion should be valid" + ); + + unlink($tempFile); + } + + /** + * Test 13: Performance test to ensure no significant regression + */ + public function testPerformance() + { + $content = str_repeat("Company's data with special characters test\n", 10000); + $tempFile = $this->createTestFile($content, 'WINDOWS-1252'); + + $startTime = microtime(true); + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + $endTime = microtime(true); + + $processingTime = $endTime - $startTime; + + // Should process reasonably fast (less than 1 second for 10k lines) + $this->assertLessThan(1.0, $processingTime, "Processing should be reasonably fast"); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Performance test result should be valid" + ); + + unlink($tempFile); + } +} \ No newline at end of file diff --git a/tests/Unit/ImportUnicodeEncodingTest.php b/tests/Unit/ImportUnicodeEncodingTest.php new file mode 100644 index 0000000000..989f4e0f61 --- /dev/null +++ b/tests/Unit/ImportUnicodeEncodingTest.php @@ -0,0 +1,511 @@ +controller = new 
ImportController(); + + // Use reflection to access private methods + $reflection = new ReflectionClass($this->controller); + $this->readFileMethod = $reflection->getMethod('readFileWithProperEncoding'); + $this->readFileMethod->setAccessible(true); + + $this->isValidConversionMethod = $reflection->getMethod('isValidConversion'); + $this->isValidConversionMethod->setAccessible(true); + + $this->removeBOMMethod = $reflection->getMethod('removeBOM'); + $this->removeBOMMethod->setAccessible(true); + } + + /** + * Test data with various Unicode blocks and international content + */ + private function getUnicodeTestData(): array + { + return [ + // Basic Latin and Latin Extended + 'latin_basic' => "Hello World! Company's data", + 'latin_extended' => "Café résumé naïve piñata façade", + + // Greek + 'greek' => "Καλημέρα κόσμε! Ελληνικά γράμματα", + + // Cyrillic + 'cyrillic' => "Привет мир! Русский текст", + + // Arabic (RTL) + 'arabic' => "مرحبا بالعالم! النص العربي", + + // Hebrew (RTL) + 'hebrew' => "שלום עולם! טקסט עברי", + + // Chinese Simplified + 'chinese_simplified' => "你好世界!简体中文", + + // Chinese Traditional + 'chinese_traditional' => "你好世界!繁體中文", + + // Japanese (Hiragana, Katakana, Kanji) + 'japanese' => "こんにちは世界!ひらがな・カタカナ・漢字", + + // Korean + 'korean' => "안녕하세요 세계! 
한국어 텍스트", + + // Mathematical symbols + 'mathematical' => "∑∫∞±≤≥≠√∂∇∆", + + // Currency symbols + 'currency' => "€£¥₹₽₨₩₪₦₡₸", + + // Emoji and symbols + 'emoji' => "😀🌍🚀💻📊✨🎉🔥💡⭐", + + // Mixed scripts + 'mixed_scripts' => "Hello мир 世界 🌍 café résumé", + + // Special Unicode cases + 'zero_width' => "Text\u{200B}with\u{FEFF}zero\u{200C}width\u{200D}chars", + 'combining' => "e\u{0301}a\u{0300}i\u{0302}o\u{0303}u\u{0308}", // é à î õ ü + + // Quotation marks and dashes + 'punctuation' => "«quotes» \u{201C}smart\u{201D} \u{2018}quotes\u{2019} — – … ‚ „", + ]; + } + + /** + * Extended encoding list for comprehensive testing + */ + private function getExtendedEncodings(): array + { + return [ + // Unicode variants + 'UTF-8', + 'UTF-8-BOM', + 'UTF-16BE', + 'UTF-16LE', + 'UTF-32BE', + 'UTF-32LE', + + // ISO Latin variants (commonly supported) + 'ISO-8859-1', // Western European + 'ISO-8859-2', // Central European + 'ISO-8859-5', // Cyrillic + 'ISO-8859-7', // Greek + 'ISO-8859-9', // Turkish + 'ISO-8859-15', // Western European (with Euro) + + // Windows code pages (commonly supported) + 'Windows-1251', // Cyrillic + 'Windows-1252', // Western European + + // Other commonly supported encodings + 'CP1252', // Windows Western + ]; + } + + /** + * Create a test file with specific content and encoding + */ + private function createTestFile(string $content, string $encoding): string + { + $tempFile = tempnam(sys_get_temp_dir(), 'unicode_test_'); + + switch ($encoding) { + case 'UTF-8-BOM': + $content = "\xEF\xBB\xBF" . $content; + file_put_contents($tempFile, $content); + break; + + case 'UTF-16BE': + $content = "\xFE\xFF" . mb_convert_encoding($content, 'UTF-16BE', 'UTF-8'); + file_put_contents($tempFile, $content); + break; + + case 'UTF-16LE': + $content = "\xFF\xFE" . mb_convert_encoding($content, 'UTF-16LE', 'UTF-8'); + file_put_contents($tempFile, $content); + break; + + case 'UTF-32BE': + $content = "\x00\x00\xFE\xFF" . 
mb_convert_encoding($content, 'UTF-32BE', 'UTF-8'); + file_put_contents($tempFile, $content); + break; + + case 'UTF-32LE': + $content = "\xFF\xFE\x00\x00" . mb_convert_encoding($content, 'UTF-32LE', 'UTF-8'); + file_put_contents($tempFile, $content); + break; + + case 'UTF-8': + file_put_contents($tempFile, $content); + break; + + default: + // Try to convert using mb_convert_encoding + try { + // Check if encoding is supported + if (!in_array($encoding, mb_list_encodings())) { + // If encoding not supported, use UTF-8 fallback + file_put_contents($tempFile, $content); + break; + } + + $encoded = mb_convert_encoding($content, $encoding, 'UTF-8'); + file_put_contents($tempFile, $encoded); + } catch (Exception | ValueError $e) { + // If conversion fails, use UTF-8 fallback + file_put_contents($tempFile, $content); + } + break; + } + + return $tempFile; + } + + /** + * Test 1: Unicode content preservation across different UTF encodings + */ + public function testUnicodeContentPreservation() + { + $unicodeEncodings = ['UTF-8', 'UTF-8-BOM', 'UTF-16BE', 'UTF-16LE', 'UTF-32BE', 'UTF-32LE']; + + foreach ($this->getUnicodeTestData() as $name => $content) { + foreach ($unicodeEncodings as $encoding) { + $tempFile = $this->createTestFile($content, $encoding); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + $this->assertEquals( + $content, + $result, + "Unicode preservation failed for {$name} with {$encoding} encoding" + ); + + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Validation failed for {$name} with {$encoding} encoding" + ); + + unlink($tempFile); + } + } + } + + /** + * Test 2: BOM handling for different UTF variants + */ + public function testBOMHandlingForAllUTF() + { + $testContent = "Hello 世界! 
Тест العالم"; + + $bomTests = [ + 'UTF-8' => "\xEF\xBB\xBF", + 'UTF-16BE' => "\xFE\xFF", + 'UTF-16LE' => "\xFF\xFE", + 'UTF-32BE' => "\x00\x00\xFE\xFF", + 'UTF-32LE' => "\xFF\xFE\x00\x00", + ]; + + foreach ($bomTests as $encoding => $bom) { + // Create file with BOM using the createTestFile method + $tempFile = $this->createTestFile($testContent, $encoding); + + // Test file processing with BOM + $fileResult = $this->readFileMethod->invoke($this->controller, $tempFile); + + $this->assertEquals( + $testContent, + $fileResult, + "File processing with BOM failed for {$encoding}" + ); + + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $fileResult), + "BOM file validation failed for {$encoding}" + ); + + unlink($tempFile); + } + + // Test UTF-8 BOM removal specifically (since that's what the method is designed for) + $utf8DataWithBOM = "\xEF\xBB\xBF" . $testContent; + $result = $this->removeBOMMethod->invoke($this->controller, $utf8DataWithBOM); + + $this->assertEquals( + $testContent, + $result, + "UTF-8 BOM removal failed" + ); + } + + /** + * Test 3: Extended encoding compatibility + */ + public function testExtendedEncodingCompatibility() + { + // Use content that's compatible with most encodings + $basicContent = "Company data with special chars"; + $accentContent = "Cafe resume naive facade"; // Without actual accents for broader compatibility + + foreach ($this->getExtendedEncodings() as $encoding) { + // Skip encodings that are known to not support certain characters + $content = $this->isAsciiCompatibleEncoding($encoding) ? 
$basicContent : $accentContent; + + $tempFile = $this->createTestFile($content, $encoding); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + // Result should always be valid UTF-8 + $this->assertTrue( + mb_check_encoding($result, 'UTF-8'), + "Result should be valid UTF-8 for encoding: {$encoding}" + ); + + // Should not contain replacement characters + $this->assertFalse( + str_contains($result, '�'), + "Result should not contain replacement characters for encoding: {$encoding}" + ); + + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Validation failed for encoding: {$encoding}" + ); + + unlink($tempFile); + } + } + + /** + * Test 4: Right-to-left (RTL) text handling + */ + public function testRightToLeftTextHandling() + { + $rtlContent = [ + 'arabic' => "مرحبا بالعالم! شركة البيانات", + 'hebrew' => "שלום עולם! חברת הנתונים", + 'mixed_rtl' => "Hello مرحبا World עולם!", + ]; + + foreach ($rtlContent as $name => $content) { + $tempFile = $this->createTestFile($content, 'UTF-8'); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + $this->assertEquals($content, $result, "RTL test failed for: {$name}"); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "RTL validation failed for: {$name}" + ); + + unlink($tempFile); + } + } + + /** + * Test 5: Asian character sets (CJK) + */ + public function testAsianCharacterSets() + { + $cjkContent = [ + 'chinese_simplified' => "公司数据处理系统", + 'chinese_traditional' => "公司資料處理系統", + 'japanese_hiragana' => "かいしゃのでーたしすてむ", + 'japanese_katakana' => "カイシャノデータシステム", + 'japanese_kanji' => "会社のデータシステム", + 'korean' => "회사 데이터 시스템", + 'mixed_cjk' => "Company 公司 会社 회사 Data", + ]; + + foreach ($cjkContent as $name => $content) { + $tempFile = $this->createTestFile($content, 'UTF-8'); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + $this->assertEquals($content, $result, "CJK 
test failed for: {$name}"); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "CJK validation failed for: {$name}" + ); + + unlink($tempFile); + } + } + + /** + * Test 6: Emoji and symbol handling + */ + public function testEmojiAndSymbolHandling() + { + $symbolContent = [ + 'basic_emoji' => "Data 📊 Reports 📈 Analysis 🔍", + 'complex_emoji' => "👨‍💻👩‍💼🏢💼📋📊📈📉", + 'mathematical' => "∑(x²) ∫f(x)dx ∞ ≠ ≤ ≥ ± √", + 'currency_symbols' => "Price: €100 £80 ¥1000 $75", + 'technical_symbols' => "® © ™ § ¶ † ‡ • ‰ ‱", + 'arrows_symbols' => "← → ↑ ↓ ↔ ↕ ⇐ ⇒ ⇔", + ]; + + foreach ($symbolContent as $name => $content) { + $tempFile = $this->createTestFile($content, 'UTF-8'); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + $this->assertEquals($content, $result, "Symbol test failed for: {$name}"); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Symbol validation failed for: {$name}" + ); + + unlink($tempFile); + } + } + + /** + * Test 7: Combining characters and normalization + */ + public function testCombiningCharacters() + { + $combiningContent = [ + 'accents_composed' => "café résumé naïve", + 'accents_decomposed' => "cafe\u{0301} re\u{0301}sume\u{0301} nai\u{0308}ve", + 'mixed_normalization' => "café cafe\u{0301} résumé re\u{0301}sume\u{0301}", + ]; + + foreach ($combiningContent as $name => $content) { + $tempFile = $this->createTestFile($content, 'UTF-8'); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + // Content should be preserved (normalization might occur but content should be valid) + $this->assertTrue( + mb_check_encoding($result, 'UTF-8'), + "Combining character result should be valid UTF-8 for: {$name}" + ); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Combining character validation failed for: {$name}" + ); + + unlink($tempFile); + } + } + + /** + * Test 8: Large Unicode 
content performance + */ + public function testLargeUnicodeContentPerformance() + { + $unicodePattern = "🌍 Hello 世界 مرحبا Здравствуй שלום こんにちは 안녕하세요 "; + $largeContent = str_repeat($unicodePattern, 1000); // ~50KB of Unicode content + + $tempFile = $this->createTestFile($largeContent, 'UTF-8'); + + $startTime = microtime(true); + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + $endTime = microtime(true); + + $processingTime = $endTime - $startTime; + + $this->assertLessThan(2.0, $processingTime, "Large Unicode content processing should be fast"); + $this->assertEquals($largeContent, $result, "Large Unicode content should be preserved"); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Large Unicode content validation failed" + ); + + unlink($tempFile); + } + + /** + * Test 9: Mixed encoding scenarios + */ + public function testMixedEncodingScenarios() + { + // Simulate files that might have mixed encoding issues + $scenarios = [ + 'mostly_ascii_with_unicode' => "Regular text with émojis 😀 and symbols ™", + 'csv_with_international' => "Name,Company,Location\n\"José García\",\"Café España\",\"São Paulo\"", + 'business_names' => "McDonald's, L'Oréal, Nestlé, Björk & Co, Müller GmbH", + ]; + + foreach ($scenarios as $name => $content) { + // Test with multiple encodings + $encodings = ['UTF-8', 'UTF-8-BOM', 'WINDOWS-1252', 'ISO-8859-1']; + + foreach ($encodings as $encoding) { + $tempFile = $this->createTestFile($content, $encoding); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + $this->assertTrue( + mb_check_encoding($result, 'UTF-8'), + "Mixed encoding result should be valid UTF-8 for {$name} with {$encoding}" + ); + $this->assertTrue( + $this->isValidConversionMethod->invoke($this->controller, $result), + "Mixed encoding validation failed for {$name} with {$encoding}" + ); + + unlink($tempFile); + } + } + } + + /** + * Helper method to determine if an encoding is 
ASCII-only, i.e. 'ASCII' or 'US-ASCII' (despite the method name, every other encoding returns false) + */ + private function isAsciiCompatibleEncoding(string $encoding): bool + { + $asciiOnlyEncodings = ['ASCII', 'US-ASCII']; + return in_array($encoding, $asciiOnlyEncodings, true); // strict: compare encoding names as exact strings + } + + /** + * Test 10: CSV data with international content (Unicode encodings only: the CJK, Cyrillic and Arabic rows cannot be represented in Windows-1252, so a single-byte code page can never satisfy the assertions below) + */ + public function testCSVWithInternationalContent() + { + $csvContent = "Name,Company,City,Country,Notes\n" . + "\"José García\",\"Café España\",\"São Paulo\",\"Brasil\",\"Açaí supplier\"\n" . + "\"李小明\",\"北京科技公司\",\"北京\",\"中国\",\"Technology partner\"\n" . + "\"Müller\",\"Bäckerei München\",\"München\",\"Deutschland\",\"Café & Bäckerei\"\n" . + "\"Иванов\",\"Москва ООО\",\"Москва\",\"Россия\",\"Software development\"\n" . + "\"محمد أحمد\",\"شركة الرياض\",\"الرياض\",\"السعودية\",\"Trading company\""; + + $encodings = ['UTF-8', 'UTF-8-BOM', 'UTF-16LE']; // every entry must be able to encode the whole fixture + + foreach ($encodings as $encoding) { + $tempFile = $this->createTestFile($csvContent, $encoding); + + $result = $this->readFileMethod->invoke($this->controller, $tempFile); + + $this->assertTrue( + mb_check_encoding($result, 'UTF-8'), + "CSV result should be valid UTF-8 for encoding: {$encoding}" + ); + + // Check that it contains expected international content + $this->assertStringContainsString("José García", $result, "Should contain Spanish names"); + $this->assertStringContainsString("李小明", $result, "Should contain Chinese names"); + $this->assertStringContainsString("Müller", $result, "Should contain German names"); + + unlink($tempFile); + } + } +} \ No newline at end of file