Improve csv file encoding support for imports

This commit is contained in:
David Bomba 2025-06-04 10:22:12 +10:00
parent f4533421f1
commit daf4391a30
8 changed files with 1375 additions and 55 deletions

View File

@ -88,9 +88,9 @@ class BaseController extends Controller
/* Grouped permissions when we want to hide columns for particular permission groups*/ /* Grouped permissions when we want to hide columns for particular permission groups*/
private array $client_exclusion_fields = ['balance', 'paid_to_date', 'credit_balance', 'client_hash']; protected array $client_exclusion_fields = ['balance', 'paid_to_date', 'credit_balance', 'client_hash'];
private array $client_excludable_permissions = ['view_client']; protected array $client_excludable_permissions = ['view_client'];
private array $client_excludable_overrides = ['edit_client', 'edit_all', 'view_invoice', 'view_all', 'edit_invoice']; protected array $client_excludable_overrides = ['edit_client', 'edit_all', 'view_invoice', 'view_all', 'edit_invoice'];
/* Grouped permissions when we want to hide columns for particular permission groups*/ /* Grouped permissions when we want to hide columns for particular permission groups*/

View File

@ -112,6 +112,12 @@ class ClientController extends BaseController
*/ */
public function show(ShowClientRequest $request, Client $client) public function show(ShowClientRequest $request, Client $client)
{ {
nlog("show");
if(auth()->user()->hasExcludedPermissions($this->client_excludable_permissions, $this->client_excludable_overrides)){
nlog('hiding fields');
$client->makeHidden($this->client_exclusion_fields);
}
return $this->itemResponse($client); return $this->itemResponse($client);
} }
@ -125,6 +131,12 @@ class ClientController extends BaseController
*/ */
public function edit(EditClientRequest $request, Client $client) public function edit(EditClientRequest $request, Client $client)
{ {
nlog("Edit");
if (auth()->user()->hasExcludedPermissions($this->client_excludable_permissions, $this->client_excludable_overrides)) {
nlog('hiding fields');
$client->makeHidden($this->client_exclusion_fields);
}
return $this->itemResponse($client); return $this->itemResponse($client);
} }

View File

@ -83,8 +83,8 @@ class ImportController extends Controller
]; ];
/** @var UploadedFile $file */ /** @var UploadedFile $file */
foreach ($request->files->get('files') as $entityType => $file) { foreach ($request->files->get('files') as $entityType => $file) {
$contents = file_get_contents($file->getPathname()); $contents = $this->readFileWithProperEncoding($file->getPathname());
// Store the csv in cache with an expiry of 10 minutes
Cache::put($hash.'-'.$entityType, base64_encode($contents), 1200); Cache::put($hash.'-'.$entityType, base64_encode($contents), 1200);
// Parse CSV // Parse CSV
@ -104,6 +104,224 @@ class ImportController extends Controller
return response()->json($data); return response()->json($data);
} }
/**
 * Read a file once and return its contents as UTF-8.
 *
 * Steps, in order:
 *  1. UTF-16 / UTF-32 input (as recognised by detectAndHandleUTFEncoding) is decoded and returned.
 *  2. A leading BOM is stripped.
 *  3. Bytes that already form valid UTF-8 are never re-interpreted as a legacy
 *     code page (that can only produce mojibake). They are returned as-is, or
 *     passed through fixCorruptedWindows1252() when they carry U+FFFD.
 *  4. Anything else is treated as a single-byte legacy encoding: Windows-1252
 *     first, then ISO-8859-1 and ISO-8859-15; the first result accepted by
 *     isValidConversion() wins.
 *  5. If nothing validates, the BOM-stripped bytes are returned unchanged.
 *
 * @param string $filePath Path of the file to read.
 * @return string The converted contents, or '' when the file cannot be read.
 */
private function readFileWithProperEncoding(string $filePath): string
{
    // Suppression is deliberate: an unreadable upload yields '' rather than a warning.
    $contents = @file_get_contents($filePath);
    if ($contents === false) {
        return '';
    }

    $decodedUnicode = $this->detectAndHandleUTFEncoding($contents);
    if ($decodedUnicode !== null) {
        return $decodedUnicode;
    }

    $contents = $this->removeBOM($contents);

    if (mb_check_encoding($contents, 'UTF-8')) {
        // Valid UTF-8 that fails validation carries replacement characters;
        // repair those in place instead of decoding the text a second time.
        return $this->isValidConversion($contents)
            ? $contents
            : $this->fixCorruptedWindows1252($contents);
    }

    // The bytes were read in binary already; a stream context cannot change
    // their encoding, so every candidate is converted from the same buffer.
    foreach (['WINDOWS-1252', 'ISO-8859-1', 'ISO-8859-15'] as $encoding) {
        $converted = mb_convert_encoding($contents, 'UTF-8', $encoding);
        if (is_string($converted) && $this->isValidConversion($converted)) {
            return $converted;
        }
    }

    // Fallback: hand back what was read.
    return $contents;
}
/**
 * Decode UTF-16 / UTF-32 input to UTF-8.
 *
 * A byte order mark decides the encoding when present. Without one, the first
 * 100 bytes are sampled for the NUL-byte layout that ASCII-range text has in
 * each encoding:
 *   UTF-32BE  00 00 00 xx      UTF-32LE  xx 00 00 00
 *   UTF-16BE  00 xx            UTF-16LE  xx 00
 * UTF-32 is tested before UTF-16 because UTF-32 text also satisfies the
 * UTF-16 pattern.
 *
 * @param string $data Raw file bytes.
 * @return string|null The UTF-8 text, or null when the data matches none of the above.
 */
private function detectAndHandleUTFEncoding(string $data): ?string
{
    // 4-byte marks come first: the UTF-32LE BOM begins with the UTF-16LE BOM.
    $byteOrderMarks = [
        "\x00\x00\xFE\xFF" => 'UTF-32BE',
        "\xFF\xFE\x00\x00" => 'UTF-32LE',
        "\xFE\xFF" => 'UTF-16BE',
        "\xFF\xFE" => 'UTF-16LE',
    ];

    foreach ($byteOrderMarks as $mark => $encoding) {
        if (str_starts_with($data, $mark)) {
            return mb_convert_encoding(substr($data, strlen($mark)), 'UTF-8', $encoding);
        }
    }

    $length = strlen($data);
    $sampleEnd = min(100, $length);

    // UTF-32 heuristic: count 4-byte groups whose three high-order bytes are NUL.
    if ($length >= 8 && $length % 4 === 0) {
        $bigEndianGroups = 0;
        $littleEndianGroups = 0;

        for ($i = 0; $i < $sampleEnd; $i += 4) {
            if ($data[$i + 1] !== "\x00" || $data[$i + 2] !== "\x00") {
                continue;
            }
            if ($data[$i] === "\x00") {
                $bigEndianGroups++;
            }
            if ($data[$i + 3] === "\x00") {
                $littleEndianGroups++;
            }
        }

        if ($bigEndianGroups > 5) {
            return mb_convert_encoding($data, 'UTF-8', 'UTF-32BE');
        }
        if ($littleEndianGroups > 5) {
            return mb_convert_encoding($data, 'UTF-8', 'UTF-32LE');
        }
    }

    // UTF-16 heuristic: count 2-byte units whose high-order byte is NUL.
    if ($length >= 4 && $length % 2 === 0) {
        $littleEndianUnits = 0;
        $bigEndianUnits = 0;

        for ($i = 0; $i < $sampleEnd; $i += 2) {
            if ($data[$i + 1] === "\x00") {
                $littleEndianUnits++;
            }
            if ($data[$i] === "\x00") {
                $bigEndianUnits++;
            }
        }

        if ($littleEndianUnits > 10) {
            return mb_convert_encoding($data, 'UTF-8', 'UTF-16LE');
        }
        if ($bigEndianUnits > 10) {
            return mb_convert_encoding($data, 'UTF-8', 'UTF-16BE');
        }
    }

    return null;
}
/**
 * Remove a single byte order mark from the beginning of a string.
 *
 * Recognises the UTF-8, UTF-16 (BE/LE) and UTF-32 (BE/LE) marks. The 4-byte
 * UTF-32 marks are tested before the 2-byte UTF-16 ones because the UTF-32LE
 * mark (FF FE 00 00) starts with the UTF-16LE mark (FF FE); testing the short
 * one first would strip only half of it.
 *
 * @param string $data Raw bytes.
 * @return string The input without its leading BOM, or unchanged when it has none.
 */
private function removeBOM(string $data): string
{
    $byteOrderMarks = [
        "\x00\x00\xFE\xFF", // UTF-32 BE
        "\xFF\xFE\x00\x00", // UTF-32 LE
        "\xEF\xBB\xBF",     // UTF-8
        "\xFE\xFF",         // UTF-16 BE
        "\xFF\xFE",         // UTF-16 LE
    ];

    foreach ($byteOrderMarks as $mark) {
        if (str_starts_with($data, $mark)) {
            return substr($data, strlen($mark));
        }
    }

    return $data;
}
/**
 * Report whether the data holds at least one byte from the 0x80-0x9F range
 * that is listed below (0x81, 0x8D, 0x8F, 0x90 and 0x9D are not in the list).
 *
 * The scan is byte-wise; no character decoding is attempted.
 *
 * @param string $data Raw bytes to inspect.
 * @return bool True when any listed byte occurs.
 */
private function containsWindows1252Bytes(string $data): bool
{
    $byteMask = "\x80\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8E"
        . "\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9E\x9F";

    // strpbrk() yields false when none of the mask bytes occurs in $data.
    return strpbrk($data, $byteMask) !== false;
}
/**
 * Swap every UTF-8 replacement character (U+FFFD) for a right single
 * quotation mark (U+2019).
 *
 * The original byte behind a replacement character cannot be recovered; the
 * smart apostrophe is used as the substitute for all of them.
 *
 * @param string $data Text that may contain U+FFFD.
 * @return string The text with each U+FFFD replaced.
 */
private function fixCorruptedWindows1252(string $data): string
{
    $replacementCharacter = "\xEF\xBF\xBD";
    $rightSingleQuote = "\u{2019}";

    return str_replace($replacementCharacter, $rightSingleQuote, $data);
}
/**
 * Decide whether a string is acceptable as converted import data.
 *
 * Accepted means: well-formed UTF-8 that contains none of the marker
 * sequences below.
 *
 * @param string $data Candidate text.
 * @return bool True when the text passes both checks.
 */
private function isValidConversion(string $data): bool
{
    if (!mb_check_encoding($data, 'UTF-8')) {
        return false;
    }

    $corruptionMarkers = [
        "\xEF\xBF\xBD", // UTF-8 replacement character bytes
        // NOTE(review): labelled "double-encoded" upstream, but as written this
        // literal looks like the same U+FFFD character as the entry above - confirm.
        '�',
    ];

    foreach ($corruptionMarkers as $marker) {
        if (str_contains($data, $marker)) {
            return false;
        }
    }

    return true;
}
private function setImportHints($entity_type, $available_keys, $headers): array private function setImportHints($entity_type, $available_keys, $headers): array
{ {
$hints = []; $hints = [];
@ -203,7 +421,9 @@ class ImportController extends Controller
/** @var UploadedFile $file */ /** @var UploadedFile $file */
foreach ($request->files->get('files') as $entityType => $file) { foreach ($request->files->get('files') as $entityType => $file) {
$contents = file_get_contents($file->getPathname()); // $contents = file_get_contents($file->getPathname());
$contents = $this->readFileWithProperEncoding($file->getPathname());
// Store the csv in cache with an expiry of 10 minutes // Store the csv in cache with an expiry of 10 minutes
Cache::put($hash.'-'.$entityType, base64_encode($contents), 600); Cache::put($hash.'-'.$entityType, base64_encode($contents), 600);
nlog($hash.'-'.$entityType); nlog($hash.'-'.$entityType);
@ -248,54 +468,9 @@ class ImportController extends Controller
} }
} }
return $this->convertData($data); return $data; // Remove the convertData call since we fixed encoding upfront
} }
private function convertData(array $data): array
{
// List of encodings to check against
$encodings = [
'UTF-8',
'ISO-8859-1', // Latin-1
'ISO-8859-2', // Latin-2
'WINDOWS-1252', // CP1252
'SHIFT-JIS',
'EUC-JP',
'GB2312',
'GBK',
'BIG5',
'ISO-2022-JP',
'KOI8-R',
'KOI8-U',
'WINDOWS-1251', // CP1251
'UTF-16',
'UTF-32',
'ASCII',
'WINDOWS-1254', // Turkish, which sometimes includes Georgian
'WINDOWS-1256', // Arabic, which sometimes includes Georgian
'ISO-8859-10',
];
foreach ($data as $key => $value) {
// Only process strings
if (is_string($value)) {
// Detect the encoding of the string
$detectedEncoding = mb_detect_encoding($value, $encodings, true);
// If encoding is detected and it's not UTF-8, convert it to UTF-8
if ($detectedEncoding && $detectedEncoding !== 'UTF-8') {
$array[$key] = mb_convert_encoding($value, 'UTF-8', $detectedEncoding);
}
}
}
return $data;
}
/** /**
* Returns the best delimiter * Returns the best delimiter
* *

View File

@ -107,7 +107,7 @@ class ImportJsonController extends BaseController
return response()->json(array_merge(['message' => 'Processing','success' => true], $metadata), 200); return response()->json(array_merge(['message' => 'Processing','success' => true], $metadata), 200);
} }
private function handleChunkedUpload(ImportJsonRequest $request) private function handleChunkedUploadX(ImportJsonRequest $request)
{ {
$metadata = json_decode($request->metadata, true); $metadata = json_decode($request->metadata, true);
@ -251,4 +251,146 @@ class ImportJsonController extends BaseController
rmdir($dir); rmdir($dir);
} }
/**
 * Store one chunk of a chunked upload and, once every chunk is present,
 * assemble them into the final file.
 *
 * All metadata comes from the client, so it is validated before any of it is
 * used to build a storage path:
 *  - fileHash must be purely alphanumeric (anchored with \A...\z so a trailing
 *    newline cannot slip through),
 *  - fileName must be a bare file name (no directory part, not '', '.' or '..'),
 *  - totalChunks must be an integer in 1..1000,
 *  - currentChunk must be an integer in 0..totalChunks-1, because the chunks
 *    are later read back as chunk-0 .. chunk-(totalChunks-1).
 *
 * @param ImportJsonRequest $request Request carrying a JSON `metadata` field and the chunk as `file`.
 * @return array The decoded metadata; `uploaded_filepath` is added once the file has been assembled.
 * @throws \InvalidArgumentException When the metadata or the uploaded chunk fails validation.
 */
private function handleChunkedUpload(ImportJsonRequest $request)
{
    $metadata = json_decode($request->metadata, true);

    // Validate metadata structure
    if (!is_array($metadata) || !isset($metadata['fileHash'], $metadata['fileName'], $metadata['totalChunks'], $metadata['currentChunk'])) {
        throw new \InvalidArgumentException('Invalid metadata structure');
    }

    // Sanitize and validate file hash (should be alphanumeric)
    $fileHash = $metadata['fileHash'];
    if ((!is_string($fileHash) && !is_int($fileHash)) || preg_match('/\A[a-zA-Z0-9]+\z/', (string) $fileHash) !== 1) {
        throw new \InvalidArgumentException('Invalid file hash format');
    }
    $fileHash = (string) $fileHash;

    // Sanitize and validate filename
    $safeFileName = $metadata['fileName'];
    if (
        !is_string($safeFileName)
        || $safeFileName === ''
        || $safeFileName === '.'
        || $safeFileName === '..'
        || basename($safeFileName) !== $safeFileName
    ) {
        throw new \InvalidArgumentException('Invalid filename');
    }

    // Validate chunk number format (whole, non-negative number)
    $currentChunk = filter_var($metadata['currentChunk'], FILTER_VALIDATE_INT, ['options' => ['min_range' => 0]]);
    if ($currentChunk === false) {
        throw new \InvalidArgumentException('Invalid chunk number');
    }

    // Validate total chunks
    $totalChunks = filter_var($metadata['totalChunks'], FILTER_VALIDATE_INT, ['options' => ['min_range' => 1, 'max_range' => 1000]]);
    if ($totalChunks === false) {
        throw new \InvalidArgumentException('Invalid total chunks');
    }

    // A chunk index outside 0..totalChunks-1 could never be assembled.
    if ($currentChunk >= $totalChunks) {
        throw new \InvalidArgumentException('Invalid chunk number');
    }

    // Validate file type
    $chunk = $request->file('file');
    if (!$chunk || !$chunk->isValid()) {
        throw new \InvalidArgumentException('Invalid file chunk');
    }

    // Validate file size before saving
    $maxChunkSize = 5 * 1024 * 1024; // 5MB
    if ($chunk->getSize() > $maxChunkSize) {
        throw new \InvalidArgumentException('Chunk size exceeds limit');
    }

    $disk = Ninja::isHosted() ? 'backup' : config('filesystems.default');
    $chunkPrefix = "tmp/chunks/{$fileHash}/";

    // Store the chunk under a path built only from validated values.
    Storage::disk($disk)->put(
        "{$chunkPrefix}chunk-{$currentChunk}",
        file_get_contents($chunk->getRealPath()),
        ['visibility' => 'private']
    );

    // Check whether all chunks are uploaded by listing the stored objects
    $uploadedChunks = collect(Storage::disk($disk)->files($chunkPrefix))
        ->filter(function ($file) {
            return str_contains(basename($file), 'chunk-');
        })
        ->count();

    if ($uploadedChunks >= $totalChunks) {
        try {
            $finalPath = "migrations/{$safeFileName}";
            $this->combineChunksFromS3($disk, $fileHash, $totalChunks, $finalPath);
            $this->cleanupS3Chunks($disk, $fileHash);
            $metadata['uploaded_filepath'] = $finalPath;

            return $metadata;
        } catch (\Throwable $e) {
            // Clean up on any failure (errors included), then let it propagate.
            $this->cleanupS3Chunks($disk, $fileHash);

            throw $e;
        }
    }

    return $metadata;
}
/**
 * Concatenate chunk-0 .. chunk-(totalChunks-1) of an upload into one file on
 * the given disk.
 *
 * The chunks are appended to a local temporary file, which is then streamed
 * to $finalPath so the combined file is not held in memory as one string.
 * Open handles and the temporary file are released on every exit path.
 *
 * @param string $disk        Name of the storage disk holding the chunks.
 * @param string $fileHash    Upload identifier used in the chunk path.
 * @param int    $totalChunks Number of chunks expected.
 * @param string $finalPath   Destination path on the same disk.
 * @throws \RuntimeException When the temporary file cannot be used, a chunk is
 *                           missing or unreadable, or the final write fails.
 */
private function combineChunksFromS3(string $disk, string $fileHash, int $totalChunks, string $finalPath): void
{
    $tempFile = tempnam(sys_get_temp_dir(), 'chunk_combine_');
    if ($tempFile === false) {
        throw new \RuntimeException('Failed to create temporary file');
    }

    $storage = Storage::disk($disk);
    $writer = null;
    $reader = null;

    try {
        $writer = fopen($tempFile, 'wb');
        if ($writer === false) {
            throw new \RuntimeException('Failed to create temporary file');
        }

        // Download and combine chunks in order
        for ($i = 0; $i < $totalChunks; $i++) {
            $chunkKey = "tmp/chunks/{$fileHash}/chunk-{$i}";

            if (!$storage->exists($chunkKey)) {
                throw new \RuntimeException("Missing chunk: {$i}");
            }

            $chunkContent = $storage->get($chunkKey);
            if ($chunkContent === null) {
                throw new \RuntimeException("Failed to read chunk: {$i}");
            }

            if (fwrite($writer, $chunkContent) === false) {
                throw new \RuntimeException("Failed to write chunk: {$i}");
            }
        }

        fclose($writer);
        $writer = null;

        // Upload the combined file as a stream rather than one large string.
        $reader = fopen($tempFile, 'rb');
        if ($reader === false) {
            throw new \RuntimeException('Failed to reopen temporary file');
        }

        if ($storage->put($finalPath, $reader, ['visibility' => 'private']) === false) {
            throw new \RuntimeException("Failed to store combined file: {$finalPath}");
        }
    } finally {
        // The storage driver may already have closed the stream it was given.
        foreach ([$writer, $reader] as $stream) {
            if (is_resource($stream)) {
                fclose($stream);
            }
        }

        if (file_exists($tempFile)) {
            unlink($tempFile);
        }
    }
}
/**
 * Delete every stored file under the chunk directory of one upload.
 *
 * @param string $disk     Name of the storage disk holding the chunks.
 * @param string $fileHash Upload identifier used in the chunk path.
 */
private function cleanupS3Chunks(string $disk, string $fileHash): void
{
    $storedChunks = Storage::disk($disk)->files("tmp/chunks/{$fileHash}/");

    // Nothing listed means nothing to remove.
    if (empty($storedChunks)) {
        return;
    }

    Storage::disk($disk)->delete($storedChunks);
}
} }

View File

@ -108,7 +108,7 @@ class BaseImport
nlog("found {$entity_type}"); nlog("found {$entity_type}");
$csv = base64_decode($base64_encoded_csv); $csv = base64_decode($base64_encoded_csv);
$csv = mb_convert_encoding($csv, 'UTF-8', 'UTF-8'); // $csv = mb_convert_encoding($csv, 'UTF-8', 'UTF-8');
$csv = Reader::createFromString($csv); $csv = Reader::createFromString($csv);
$csvdelimiter = self::detectDelimiter($csv); $csvdelimiter = self::detectDelimiter($csv);

View File

@ -38,7 +38,7 @@ class ZipInvoices implements ShouldQueue
public $tries = 1; public $tries = 1;
public $timeout = 3600; public $timeout = 10800;
/** /**
* @param $invoices * @param $invoices

View File

@ -0,0 +1,480 @@
<?php
namespace Tests\Unit;
use Tests\TestCase;
use App\Http\Controllers\ImportController;
use Illuminate\Http\UploadedFile;
use Illuminate\Support\Facades\Storage;
use ReflectionClass;
use ReflectionMethod;
class ImportEncodingTest extends TestCase
{
private ImportController $controller;
private ReflectionMethod $readFileMethod;
private ReflectionMethod $containsWindows1252Method;
private ReflectionMethod $fixCorruptedMethod;
private ReflectionMethod $isValidConversionMethod;
/**
 * Build the controller under test and open up its private encoding helpers
 * through reflection.
 */
protected function setUp(): void
{
    parent::setUp();

    $this->controller = new ImportController();

    $controllerClass = new ReflectionClass($this->controller);

    // Each helper is private on the controller, so it is made invocable first.
    $expose = static function (string $methodName) use ($controllerClass): ReflectionMethod {
        $method = $controllerClass->getMethod($methodName);
        $method->setAccessible(true);

        return $method;
    };

    $this->readFileMethod = $expose('readFileWithProperEncoding');
    $this->containsWindows1252Method = $expose('containsWindows1252Bytes');
    $this->fixCorruptedMethod = $expose('fixCorruptedWindows1252');
    $this->isValidConversionMethod = $expose('isValidConversion');
}
/**
 * ASCII-only sample strings, keyed by scenario name.
 *
 * Symbols and accents are deliberately spelled without special characters so
 * the same bytes are produced in every encoding used by the basic tests.
 */
private function getTestData(): array
{
    $samples = [];

    $samples['basic'] = "Company's text with quotes";
    $samples['apostrophes'] = "Sya's Ian Le Led";
    $samples['quotes'] = '"Smart quotes" and \'single quotes\'';
    $samples['currency'] = "Price: 50.00, 25.99";
    $samples['symbols'] = "Trademark and copyright symbols";
    $samples['accents'] = "Cafe resume naive facade";

    return $samples;
}
/**
 * Sample strings containing non-ASCII punctuation, currency signs, symbols
 * and accented letters, keyed by scenario name.
 */
private function getComplexTestData(): array
{
    $samples = [];

    $samples['complex'] = "Company's «quoted» text—dash…ellipsis";
    $samples['currency'] = "Price: €50.00, £25.99";
    $samples['symbols'] = "Trademark™ and copyright© symbols";
    $samples['accents'] = "Café résumé naïve piñata façade";

    return $samples;
}
/**
 * Windows-1252 special characters (0x80-0x9F range).
 *
 * Maps each byte to the UTF-8 character it decodes to. Every value is written
 * as a \u{...} escape so the expectation does not depend on how this source
 * file itself is stored or transferred. Four entries (0x82, 0x8B, 0x96, 0x9B)
 * previously held empty strings, which made the per-byte test expect the
 * character to vanish.
 *
 * @return array<int, string>
 */
private function getWindows1252SpecialChars(): array
{
    return [
        0x80 => "\u{20AC}", // Euro sign
        0x82 => "\u{201A}", // Single low-9 quotation mark
        0x83 => "\u{0192}", // Latin small letter f with hook
        0x84 => "\u{201E}", // Double low-9 quotation mark
        0x85 => "\u{2026}", // Horizontal ellipsis
        0x86 => "\u{2020}", // Dagger
        0x87 => "\u{2021}", // Double dagger
        0x88 => "\u{02C6}", // Modifier letter circumflex accent
        0x89 => "\u{2030}", // Per mille sign
        0x8A => "\u{0160}", // Latin capital letter S with caron
        0x8B => "\u{2039}", // Single left-pointing angle quotation mark
        0x8C => "\u{0152}", // Latin capital ligature OE
        0x8E => "\u{017D}", // Latin capital letter Z with caron
        0x91 => "\u{2018}", // Left single quotation mark (smart quote)
        0x92 => "\u{2019}", // Right single quotation mark (smart quote)
        0x93 => "\u{201C}", // Left double quotation mark
        0x94 => "\u{201D}", // Right double quotation mark
        0x95 => "\u{2022}", // Bullet
        0x96 => "\u{2013}", // En dash
        0x97 => "\u{2014}", // Em dash
        0x98 => "\u{02DC}", // Small tilde
        0x99 => "\u{2122}", // Trade mark sign
        0x9A => "\u{0161}", // Latin small letter s with caron
        0x9B => "\u{203A}", // Single right-pointing angle quotation mark
        0x9C => "\u{0153}", // Latin small ligature oe
        0x9E => "\u{017E}", // Latin small letter z with caron
        0x9F => "\u{0178}", // Latin capital letter Y with diaeresis
    ];
}
/**
 * Write $content to a fresh temporary file in the requested encoding and
 * return the file's path.
 *
 * Pseudo-encodings: 'UTF-8-BOM' prepends the UTF-8 byte order mark,
 * 'UTF-8-CORRUPTED' swaps every straight apostrophe for the bytes of U+FFFD,
 * 'UTF-8' writes the content untouched. Any other name is handed to
 * mb_convert_encoding() as the target encoding.
 */
private function createTestFile(string $content, string $encoding): string
{
    $path = tempnam(sys_get_temp_dir(), 'encoding_test_');

    $bytes = match ($encoding) {
        'UTF-8-BOM' => "\xEF\xBB\xBF" . $content,
        'UTF-8-CORRUPTED' => str_replace("'", "\xEF\xBF\xBD", $content),
        'UTF-8' => $content,
        default => mb_convert_encoding($content, $encoding, 'UTF-8'),
    };

    file_put_contents($path, $bytes);

    return $path;
}
/**
 * Test 1: a clean UTF-8 file must come back byte-for-byte unchanged.
 */
public function testCleanUtf8Files()
{
    $samples = $this->getTestData();

    foreach (array_keys($samples) as $name) {
        $expected = $samples[$name];
        $path = $this->createTestFile($expected, 'UTF-8');

        $actual = $this->readFileMethod->invoke($this->controller, $path);
        $this->assertEquals($expected, $actual, "Clean UTF-8 test failed for: {$name}");

        $accepted = $this->isValidConversionMethod->invoke($this->controller, $actual);
        $this->assertTrue($accepted, "Validation failed for clean UTF-8: {$name}");

        unlink($path);
    }
}
/**
 * Test 2: a UTF-8 file with a BOM must come back without the BOM.
 */
public function testUtf8WithBom()
{
    $samples = $this->getTestData();

    foreach (array_keys($samples) as $name) {
        $expected = $samples[$name];
        $path = $this->createTestFile($expected, 'UTF-8-BOM');

        // The reader is expected to drop the three BOM bytes.
        $actual = $this->readFileMethod->invoke($this->controller, $path);
        $this->assertEquals($expected, $actual, "UTF-8 BOM test failed for: {$name}");

        $accepted = $this->isValidConversionMethod->invoke($this->controller, $actual);
        $this->assertTrue($accepted, "Validation failed for UTF-8 BOM: {$name}");

        unlink($path);
    }
}
/**
 * Test 3: Windows-1252 files holding non-ASCII characters must decode to the
 * original UTF-8 text.
 */
public function testWindows1252Files()
{
    $samples = $this->getComplexTestData();

    foreach (array_keys($samples) as $name) {
        $expected = $samples[$name];
        $path = $this->createTestFile($expected, 'WINDOWS-1252');

        $actual = $this->readFileMethod->invoke($this->controller, $path);
        $this->assertEquals($expected, $actual, "Windows-1252 test failed for: {$name}");

        $accepted = $this->isValidConversionMethod->invoke($this->controller, $actual);
        $this->assertTrue($accepted, "Validation failed for Windows-1252: {$name}");

        unlink($path);
    }
}
/**
 * Test 3.5: UTF-8 files holding non-ASCII characters must pass through
 * unchanged.
 */
public function testComplexUtf8Files()
{
    $samples = $this->getComplexTestData();

    foreach (array_keys($samples) as $name) {
        $expected = $samples[$name];
        $path = $this->createTestFile($expected, 'UTF-8');

        $actual = $this->readFileMethod->invoke($this->controller, $path);
        $this->assertEquals($expected, $actual, "Complex UTF-8 test failed for: {$name}");

        $accepted = $this->isValidConversionMethod->invoke($this->controller, $actual);
        $this->assertTrue($accepted, "Validation failed for complex UTF-8: {$name}");

        unlink($path);
    }
}
/**
 * Test 4: ISO-8859-1 files must decode to the original UTF-8 text.
 */
public function testIso88591Files()
{
    // Restricted to characters that ISO-8859-1 can represent.
    $samples = [];
    $samples['basic'] = "Company's text";
    $samples['accents'] = "Café résumé naïve façade";

    foreach (array_keys($samples) as $name) {
        $expected = $samples[$name];
        $path = $this->createTestFile($expected, 'ISO-8859-1');

        $actual = $this->readFileMethod->invoke($this->controller, $path);
        $this->assertEquals($expected, $actual, "ISO-8859-1 test failed for: {$name}");

        $accepted = $this->isValidConversionMethod->invoke($this->controller, $actual);
        $this->assertTrue($accepted, "Validation failed for ISO-8859-1: {$name}");

        unlink($path);
    }
}
/**
 * Test 5: UTF-8 files whose apostrophes were turned into replacement
 * characters must come back with smart apostrophes in their place.
 */
public function testCorruptedUtf8Files()
{
    $samples = $this->getTestData();

    foreach (array_keys($samples) as $name) {
        $original = $samples[$name];
        $path = $this->createTestFile($original, 'UTF-8-CORRUPTED');

        $actual = $this->readFileMethod->invoke($this->controller, $path);

        // Every straight apostrophe is expected to reappear as U+2019.
        $expected = str_replace("'", "\u{2019}", $original);
        $this->assertEquals($expected, $actual, "Corrupted UTF-8 test failed for: {$name}");

        $accepted = $this->isValidConversionMethod->invoke($this->controller, $actual);
        $this->assertTrue($accepted, "Validation failed for corrupted UTF-8: {$name}");

        unlink($path);
    }
}
/**
 * Test 6: each Windows-1252 special byte, written raw into a file, must
 * decode to its expected character.
 */
public function testAllWindows1252SpecialCharacters()
{
    foreach ($this->getWindows1252SpecialChars() as $byte => $expectedChar) {
        $path = tempnam(sys_get_temp_dir(), 'char_test_');

        // The byte is written raw, surrounded by plain ASCII.
        file_put_contents($path, "Test " . chr($byte) . " character");

        $actual = $this->readFileMethod->invoke($this->controller, $path);
        $failureMessage = "Windows-1252 character test failed for byte 0x" . dechex($byte) . " ({$expectedChar})";

        $this->assertEquals("Test {$expectedChar} character", $actual, $failureMessage);

        unlink($path);
    }
}
/**
 * Test 7: containsWindows1252Bytes() flags a raw 0x92 byte and nothing in
 * plain ASCII or in a UTF-8 replacement character.
 */
public function testContainsWindows1252Bytes()
{
    $detect = fn (string $bytes) => $this->containsWindows1252Method->invoke($this->controller, $bytes);

    // A raw Windows-1252 right single quote.
    $this->assertTrue(
        $detect("Test " . chr(0x92) . " content"),
        "Should detect Windows-1252 bytes"
    );

    // Plain ASCII.
    $this->assertFalse(
        $detect("Test clean content"),
        "Should not detect Windows-1252 bytes in clean data"
    );

    // The three bytes of U+FFFD.
    $this->assertFalse(
        $detect("Test \xEF\xBF\xBD content"),
        "Should not detect Windows-1252 bytes in corrupted UTF-8"
    );
}
/**
 * Test 8: fixCorruptedWindows1252() turns U+FFFD into a smart apostrophe.
 */
public function testFixCorruptedWindows1252()
{
    $repaired = $this->fixCorruptedMethod->invoke($this->controller, "Sya\xEF\xBF\xBDs In Le");

    $this->assertEquals(
        "Sya\u{2019}s In Le",
        $repaired,
        "Failed to fix corrupted Windows-1252 data"
    );
}
/**
 * Test 9: isValidConversion() accepts clean UTF-8 and rejects replacement
 * characters and malformed byte sequences.
 */
public function testIsValidConversion()
{
    $validate = fn (string $text) => $this->isValidConversionMethod->invoke($this->controller, $text);

    // Well-formed UTF-8 with nothing suspicious in it.
    $this->assertTrue(
        $validate("Clean UTF-8 content with apostrophe's"),
        "Should validate clean UTF-8 content"
    );

    // Replacement character given as escaped bytes.
    $this->assertFalse(
        $validate("Content with \xEF\xBF\xBD replacement"),
        "Should reject content with UTF-8 replacement bytes"
    );

    // Replacement character given as a literal in this source file.
    $this->assertFalse(
        $validate("Content with � replacement"),
        "Should reject content with double-encoded replacement"
    );

    // 0xFF can never occur in UTF-8.
    $this->assertFalse(
        $validate("Invalid \xFF UTF-8"),
        "Should reject invalid UTF-8"
    );
}
/**
 * Test 10: whatever the source encoding, the reader must return valid UTF-8
 * that is free of replacement characters.
 *
 * The replacement-character needle is spelled as escaped bytes; the previous
 * literal ('<27>') was a garbled placeholder that could never match, so the
 * second assertion could not fail.
 */
public function testMultipleEncodingTypes()
{
    $encodings = [
        'UTF-8',
        'WINDOWS-1252',
        'ISO-8859-1',
        'ISO-8859-15',
        'ASCII',
    ];

    $testContent = "Company's «test» data—with symbols";

    foreach ($encodings as $encoding) {
        // ASCII can't represent the special characters, so it gets simpler content.
        $content = $encoding === 'ASCII' ? "Company data test" : $testContent;

        $tempFile = $this->createTestFile($content, $encoding);
        $result = $this->readFileMethod->invoke($this->controller, $tempFile);

        // Result should always be valid UTF-8
        $this->assertTrue(
            mb_check_encoding($result, 'UTF-8'),
            "Result should be valid UTF-8 for encoding: {$encoding}"
        );

        // Should not contain U+FFFD (EF BF BD in UTF-8)
        $this->assertFalse(
            str_contains($result, "\xEF\xBF\xBD"),
            "Result should not contain replacement characters for encoding: {$encoding}"
        );

        unlink($tempFile);
    }
}
/**
 * Test 11: ordinary UTF-8 CSV content must be returned untouched, straight
 * apostrophes included.
 */
public function testBackwardCompatibility()
{
    $rows = [
        "Name,Amount,Date",
        "\"John's Company\",100.50,2024-01-01",
        "\"Mary's Store\",250.75,2024-01-02",
    ];
    $csv = implode("\n", $rows);

    $path = $this->createTestFile($csv, 'UTF-8');
    $actual = $this->readFileMethod->invoke($this->controller, $path);

    $this->assertEquals($csv, $actual, "Backward compatibility test failed for CSV content");

    // The quoted names must still carry their original apostrophes.
    $this->assertStringContainsString("John's Company", $actual, "CSV should contain original apostrophes");
    $this->assertStringContainsString("Mary's Store", $actual, "CSV should contain original apostrophes");

    unlink($path);
}
/**
 * Test 12: empty file, missing file, and a large Windows-1252 file.
 */
public function testEdgeCases()
{
    $read = fn (string $path) => $this->readFileMethod->invoke($this->controller, $path);

    // An empty file yields an empty string.
    $emptyPath = tempnam(sys_get_temp_dir(), 'empty_test_');
    file_put_contents($emptyPath, '');
    $this->assertEquals('', $read($emptyPath), "Empty file should return empty string");
    unlink($emptyPath);

    // A path that does not exist also yields an empty string.
    $this->assertEquals('', $read('/non/existent/file.csv'), "Non-existent file should return empty string");

    // A repeated string with an apostrophe and an em dash, stored as Windows-1252.
    $largePath = $this->createTestFile(str_repeat("Test's data with special chars—", 1000), 'WINDOWS-1252');
    $largeResult = $read($largePath);
    $this->assertTrue(
        $this->isValidConversionMethod->invoke($this->controller, $largeResult),
        "Large file conversion should be valid"
    );
    unlink($largePath);
}
/**
 * Test 13: reading 10,000 lines must finish in under one second and still
 * produce a valid result.
 */
public function testPerformance()
{
    $path = $this->createTestFile(
        str_repeat("Company's data with special characters test\n", 10000),
        'WINDOWS-1252'
    );

    $startedAt = microtime(true);
    $converted = $this->readFileMethod->invoke($this->controller, $path);
    $elapsedSeconds = microtime(true) - $startedAt;

    // Wall-clock budget for the single read above.
    $this->assertLessThan(1.0, $elapsedSeconds, "Processing should be reasonably fast");

    $this->assertTrue(
        $this->isValidConversionMethod->invoke($this->controller, $converted),
        "Performance test result should be valid"
    );

    unlink($path);
}
}

View File

@ -0,0 +1,511 @@
<?php
namespace Tests\Unit;
use Tests\TestCase;
use App\Http\Controllers\ImportController;
use ReflectionClass;
use ReflectionMethod;
class ImportUnicodeEncodingTest extends TestCase
{
private ImportController $controller;
private ReflectionMethod $readFileMethod;
private ReflectionMethod $isValidConversionMethod;
private ReflectionMethod $removeBOMMethod;
/**
 * Build the controller under test and open up the private helpers this
 * suite calls through reflection.
 */
protected function setUp(): void
{
    parent::setUp();

    $this->controller = new ImportController();

    $controllerClass = new ReflectionClass($this->controller);

    // Each helper is private on the controller, so it is made invocable first.
    $expose = static function (string $methodName) use ($controllerClass): ReflectionMethod {
        $method = $controllerClass->getMethod($methodName);
        $method->setAccessible(true);

        return $method;
    };

    $this->readFileMethod = $expose('readFileWithProperEncoding');
    $this->isValidConversionMethod = $expose('isValidConversion');
    $this->removeBOMMethod = $expose('removeBOM');
}
/**
 * Test data with various Unicode blocks and international content.
 *
 * Returns sample strings keyed by a short scenario name. The literals are the
 * fixture itself: their exact characters (including the escaped zero-width
 * and combining code points) are what the tests write to disk, so they must
 * not be normalised or retyped.
 *
 * @return array<string, string>
 */
private function getUnicodeTestData(): array
{
return [
// Basic Latin and Latin Extended
'latin_basic' => "Hello World! Company's data",
'latin_extended' => "Café résumé naïve piñata façade",
// Greek
'greek' => "Καλημέρα κόσμε! Ελληνικά γράμματα",
// Cyrillic
'cyrillic' => "Привет мир! Русский текст",
// Arabic (RTL)
'arabic' => "مرحبا بالعالم! النص العربي",
// Hebrew (RTL)
'hebrew' => "שלום עולם! טקסט עברי",
// Chinese Simplified
'chinese_simplified' => "你好世界!简体中文",
// Chinese Traditional
'chinese_traditional' => "你好世界!繁體中文",
// Japanese (Hiragana, Katakana, Kanji)
'japanese' => "こんにちは世界!ひらがな・カタカナ・漢字",
// Korean
'korean' => "안녕하세요 세계! 한국어 텍스트",
// Mathematical symbols
'mathematical' => "∑∫∞±≤≥≠√∂∇∆",
// Currency symbols
'currency' => "€£¥₹₽₨₩₪₦₡₸",
// Emoji and symbols (code points outside the Basic Multilingual Plane)
'emoji' => "😀🌍🚀💻📊✨🎉🔥💡⭐",
// Mixed scripts
'mixed_scripts' => "Hello мир 世界 🌍 café résumé",
// Special Unicode cases: zero-width space/joiners and U+FEFF inside the text
'zero_width' => "Text\u{200B}with\u{FEFF}zero\u{200C}width\u{200D}chars",
// Base letters followed by combining accents (decomposed form)
'combining' => "e\u{0301}a\u{0300}i\u{0302}o\u{0303}u\u{0308}", // é à î õ ü
// Quotation marks and dashes
// NOTE(review): the string ends with an em dash and a space; confirm no trailing characters were lost.
'punctuation' => "«quotes» \u{201C}smart\u{201D} \u{2018}quotes\u{2019} — ",
];
}
/**
 * Encoding names exercised by the comprehensive tests, grouped by family and
 * returned as one flat list in the order Unicode, ISO-8859, Windows, other.
 */
private function getExtendedEncodings(): array
{
    $unicodeVariants = ['UTF-8', 'UTF-8-BOM', 'UTF-16BE', 'UTF-16LE', 'UTF-32BE', 'UTF-32LE'];

    $isoLatinVariants = [
        'ISO-8859-1',  // Western European
        'ISO-8859-2',  // Central European
        'ISO-8859-5',  // Cyrillic
        'ISO-8859-7',  // Greek
        'ISO-8859-9',  // Turkish
        'ISO-8859-15', // Western European (with Euro)
    ];

    $windowsCodePages = [
        'Windows-1251', // Cyrillic
        'Windows-1252', // Western European
    ];

    $otherNames = [
        'CP1252', // Windows Western
    ];

    return array_merge($unicodeVariants, $isoLatinVariants, $windowsCodePages, $otherNames);
}
/**
 * Write $content (given as UTF-8) to a new temporary file in the requested encoding.
 *
 * 'UTF-8' is written verbatim and 'UTF-8-BOM' gets the UTF-8 BOM prepended.
 * UTF-16/UTF-32 variants are transcoded and prefixed with their BOM. Any other
 * name is transcoded only when it appears verbatim in mb_list_encodings();
 * otherwise, or when the conversion throws, the UTF-8 bytes are written unchanged.
 *
 * @return string path of the temporary file; the caller is responsible for removing it
 */
private function createTestFile(string $content, string $encoding): string
{
    $path = tempnam(sys_get_temp_dir(), 'unicode_test_');

    // Multi-byte Unicode encodings and the BOM that introduces each of them.
    $wideBoms = [
        'UTF-16BE' => "\xFE\xFF",
        'UTF-16LE' => "\xFF\xFE",
        'UTF-32BE' => "\x00\x00\xFE\xFF",
        'UTF-32LE' => "\xFF\xFE\x00\x00",
    ];

    if (isset($wideBoms[$encoding])) {
        $payload = $wideBoms[$encoding] . mb_convert_encoding($content, $encoding, 'UTF-8');
        file_put_contents($path, $payload);

        return $path;
    }

    if ($encoding === 'UTF-8-BOM') {
        file_put_contents($path, "\xEF\xBB\xBF" . $content);

        return $path;
    }

    if ($encoding === 'UTF-8') {
        file_put_contents($path, $content);

        return $path;
    }

    try {
        // Names mbstring does not list verbatim are written as plain UTF-8.
        $payload = in_array($encoding, mb_list_encodings())
            ? mb_convert_encoding($content, $encoding, 'UTF-8')
            : $content;
        file_put_contents($path, $payload);
    } catch (Exception | ValueError $e) {
        // Conversion failed: fall back to the untouched UTF-8 bytes.
        file_put_contents($path, $content);
    }

    return $path;
}
/**
 * Test 1: Unicode content preservation across different UTF encodings
 *
 * Every sample from getUnicodeTestData() is written in each UTF variant and
 * must be read back byte-for-byte identical to the UTF-8 original.
 */
public function testUnicodeContentPreservation(): void
{
    $unicodeEncodings = ['UTF-8', 'UTF-8-BOM', 'UTF-16BE', 'UTF-16LE', 'UTF-32BE', 'UTF-32LE'];

    foreach ($this->getUnicodeTestData() as $name => $content) {
        foreach ($unicodeEncodings as $encoding) {
            $tempFile = $this->createTestFile($content, $encoding);

            try {
                $result = $this->readFileMethod->invoke($this->controller, $tempFile);

                // assertSame: the round trip must be exact, not merely loosely equal.
                $this->assertSame(
                    $content,
                    $result,
                    "Unicode preservation failed for {$name} with {$encoding} encoding"
                );
                $this->assertTrue(
                    $this->isValidConversionMethod->invoke($this->controller, $result),
                    "Validation failed for {$name} with {$encoding} encoding"
                );
            } finally {
                // Remove the fixture even when an assertion above fails.
                unlink($tempFile);
            }
        }
    }
}
/**
 * Test 2: BOM handling for different UTF variants
 *
 * Each fixture is verified to really start with the expected BOM before it is
 * handed to the reader, so a BOM-less fixture cannot make the test pass vacuously.
 */
public function testBOMHandlingForAllUTF(): void
{
    $testContent = "Hello 世界! Тест العالم";

    // Keys are the labels createTestFile() understands; note that plain 'UTF-8'
    // is written WITHOUT a BOM there, so the BOM variant must be requested explicitly.
    $bomTests = [
        'UTF-8-BOM' => "\xEF\xBB\xBF",
        'UTF-16BE' => "\xFE\xFF",
        'UTF-16LE' => "\xFF\xFE",
        'UTF-32BE' => "\x00\x00\xFE\xFF",
        'UTF-32LE' => "\xFF\xFE\x00\x00",
    ];

    foreach ($bomTests as $encoding => $bom) {
        $tempFile = $this->createTestFile($testContent, $encoding);

        try {
            // Sanity-check the fixture: the raw bytes must begin with the BOM (compared as hex
            // so a failure message stays readable).
            $rawPrefix = substr((string) file_get_contents($tempFile), 0, strlen($bom));
            $this->assertSame(
                bin2hex($bom),
                bin2hex($rawPrefix),
                "Fixture for {$encoding} does not start with the expected BOM"
            );

            // Test file processing with BOM
            $fileResult = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertSame(
                $testContent,
                $fileResult,
                "File processing with BOM failed for {$encoding}"
            );
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $fileResult),
                "BOM file validation failed for {$encoding}"
            );
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }

    // Test UTF-8 BOM removal specifically (since that's what the method is designed for)
    $utf8DataWithBOM = "\xEF\xBB\xBF" . $testContent;
    $result = $this->removeBOMMethod->invoke($this->controller, $utf8DataWithBOM);

    $this->assertSame(
        $testContent,
        $result,
        "UTF-8 BOM removal failed"
    );
}
/**
 * Test 3: Extended encoding compatibility
 *
 * Writes an ASCII-only payload in every encoding from getExtendedEncodings()
 * and checks that reading it back yields valid UTF-8 with no U+FFFD
 * replacement characters.
 */
public function testExtendedEncodingCompatibility(): void
{
    // Both payloads are plain ASCII so that every encoding in the list can represent them.
    $basicContent = "Company data with special chars";
    $accentContent = "Cafe resume naive facade"; // Without actual accents for broader compatibility

    foreach ($this->getExtendedEncodings() as $encoding) {
        // ASCII-named encodings get the basic payload, everything else the second one.
        $content = $this->isAsciiCompatibleEncoding($encoding) ? $basicContent : $accentContent;
        $tempFile = $this->createTestFile($content, $encoding);

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            // Result should always be valid UTF-8
            $this->assertTrue(
                mb_check_encoding($result, 'UTF-8'),
                "Result should be valid UTF-8 for encoding: {$encoding}"
            );

            // Should not contain the Unicode replacement character (U+FFFD), which
            // would indicate bytes that were lost during conversion.
            $this->assertStringNotContainsString(
                "\u{FFFD}",
                $result,
                "Result should not contain replacement characters for encoding: {$encoding}"
            );

            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Validation failed for encoding: {$encoding}"
            );
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
/**
 * Test 4: Right-to-left (RTL) text handling
 *
 * Arabic, Hebrew and mixed-direction samples stored as UTF-8 must be read
 * back unchanged.
 */
public function testRightToLeftTextHandling(): void
{
    $rtlContent = [
        'arabic' => "مرحبا بالعالم! شركة البيانات",
        'hebrew' => "שלום עולם! חברת הנתונים",
        'mixed_rtl' => "Hello مرحبا World עולם!",
    ];

    foreach ($rtlContent as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertSame($content, $result, "RTL test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "RTL validation failed for: {$name}"
            );
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
/**
 * Test 5: Asian character sets (CJK)
 *
 * Chinese, Japanese and Korean samples stored as UTF-8 must be read back unchanged.
 */
public function testAsianCharacterSets(): void
{
    $cjkContent = [
        'chinese_simplified' => "公司数据处理系统",
        'chinese_traditional' => "公司資料處理系統",
        'japanese_hiragana' => "かいしゃのでーたしすてむ",
        'japanese_katakana' => "カイシャノデータシステム",
        'japanese_kanji' => "会社のデータシステム",
        'korean' => "회사 데이터 시스템",
        'mixed_cjk' => "Company 公司 会社 회사 Data",
    ];

    foreach ($cjkContent as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertSame($content, $result, "CJK test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "CJK validation failed for: {$name}"
            );
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
/**
 * Test 6: Emoji and symbol handling
 *
 * Emoji, mathematical, currency, technical and arrow symbols stored as UTF-8
 * must be read back unchanged.
 */
public function testEmojiAndSymbolHandling(): void
{
    $symbolContent = [
        'basic_emoji' => "Data 📊 Reports 📈 Analysis 🔍",
        'complex_emoji' => "👨‍💻👩‍💼🏢💼📋📊📈📉",
        'mathematical' => "∑(x²) ∫f(x)dx ∞ ≠ ≤ ≥ ± √",
        'currency_symbols' => "Price: €100 £80 ¥1000 $75",
        'technical_symbols' => "® © ™ § ¶ † ‡ • ‰ ‱",
        'arrows_symbols' => "← → ↑ ↓ ↔ ↕ ⇐ ⇒ ⇔",
    ];

    foreach ($symbolContent as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertSame($content, $result, "Symbol test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Symbol validation failed for: {$name}"
            );
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
/**
 * Test 7: Combining characters and normalization
 *
 * Only validity is asserted here, not byte equality: the reader is allowed to
 * normalise composed/decomposed sequences as long as the result is valid UTF-8.
 */
public function testCombiningCharacters(): void
{
    $combiningContent = [
        'accents_composed' => "café résumé naïve",
        'accents_decomposed' => "cafe\u{0301} re\u{0301}sume\u{0301} nai\u{0308}ve",
        'mixed_normalization' => "café cafe\u{0301} résumé re\u{0301}sume\u{0301}",
    ];

    foreach ($combiningContent as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            // Content should be preserved (normalization might occur but content should be valid)
            $this->assertTrue(
                mb_check_encoding($result, 'UTF-8'),
                "Combining character result should be valid UTF-8 for: {$name}"
            );
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Combining character validation failed for: {$name}"
            );
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
/**
 * Test 8: Large Unicode content performance
 *
 * Reads a file made of 1000 repetitions of a multi-script pattern and checks
 * that it is processed within two seconds and returned unchanged.
 */
public function testLargeUnicodeContentPerformance(): void
{
    $unicodePattern = "🌍 Hello 世界 مرحبا Здравствуй שלום こんにちは 안녕하세요 ";
    // The pattern is 91 bytes in UTF-8, so the payload is roughly 90 KB.
    $largeContent = str_repeat($unicodePattern, 1000);

    $tempFile = $this->createTestFile($largeContent, 'UTF-8');

    try {
        // hrtime() is monotonic, so the measurement is immune to wall-clock adjustments.
        $startedAt = hrtime(true);
        $result = $this->readFileMethod->invoke($this->controller, $tempFile);
        $processingTime = (hrtime(true) - $startedAt) / 1e9; // nanoseconds -> seconds

        $this->assertLessThan(2.0, $processingTime, "Large Unicode content processing should be fast");
        $this->assertSame($largeContent, $result, "Large Unicode content should be preserved");
        $this->assertTrue(
            $this->isValidConversionMethod->invoke($this->controller, $result),
            "Large Unicode content validation failed"
        );
    } finally {
        // Remove the fixture even when an assertion above fails.
        unlink($tempFile);
    }
}
/**
 * Test 9: Mixed encoding scenarios
 *
 * Realistic business strings are written in several encodings; the reader must
 * always hand back valid UTF-8 that passes the controller's own validation.
 */
public function testMixedEncodingScenarios(): void
{
    // Simulate files that might have mixed encoding issues
    $scenarios = [
        'mostly_ascii_with_unicode' => "Regular text with émojis 😀 and symbols ™",
        'csv_with_international' => "Name,Company,Location\n\"José García\",\"Café España\",\"São Paulo\"",
        'business_names' => "McDonald's, L'Oréal, Nestlé, Björk & Co, Müller GmbH",
    ];

    // NOTE(review): createTestFile() only transcodes names that appear verbatim in
    // mb_list_encodings(); the upper-case 'WINDOWS-1252' may not match and would then be
    // written as plain UTF-8 — confirm this fixture really exercises Windows-1252.
    $encodings = ['UTF-8', 'UTF-8-BOM', 'WINDOWS-1252', 'ISO-8859-1'];

    foreach ($scenarios as $name => $content) {
        foreach ($encodings as $encoding) {
            $tempFile = $this->createTestFile($content, $encoding);

            try {
                $result = $this->readFileMethod->invoke($this->controller, $tempFile);

                $this->assertTrue(
                    mb_check_encoding($result, 'UTF-8'),
                    "Mixed encoding result should be valid UTF-8 for {$name} with {$encoding}"
                );
                $this->assertTrue(
                    $this->isValidConversionMethod->invoke($this->controller, $result),
                    "Mixed encoding validation failed for {$name} with {$encoding}"
                );
            } finally {
                // Remove the fixture even when an assertion above fails.
                unlink($tempFile);
            }
        }
    }
}
/**
 * Report whether $encoding is one of the plain ASCII labels.
 *
 * Despite the name, only the exact labels 'ASCII' and 'US-ASCII' are
 * recognised; every other encoding name yields false.
 */
private function isAsciiCompatibleEncoding(string $encoding): bool
{
    return $encoding === 'ASCII' || $encoding === 'US-ASCII';
}
/**
 * Test 10: CSV data with international content
 *
 * A multi-row CSV mixing Latin, Chinese, Cyrillic and Arabic text must come
 * back as valid UTF-8 with the international names intact.
 */
public function testCSVWithInternationalContent(): void
{
    $csvContent = "Name,Company,City,Country,Notes\n" .
        "\"José García\",\"Café España\",\"São Paulo\",\"Brasil\",\"Açaí supplier\"\n" .
        "\"李小明\",\"北京科技公司\",\"北京\",\"中国\",\"Technology partner\"\n" .
        "\"Müller\",\"Bäckerei München\",\"München\",\"Deutschland\",\"Café & Bäckerei\"\n" .
        "\"Иванов\",\"Москва ООО\",\"Москва\",\"Россия\",\"Software development\"\n" .
        "\"محمد أحمد\",\"شركة الرياض\",\"الرياض\",\"السعودية\",\"Trading company\"";

    // NOTE(review): createTestFile() only transcodes names that appear verbatim in
    // mb_list_encodings(); the upper-case 'WINDOWS-1252' may not match and would then be
    // written as plain UTF-8. The Chinese-name assertion below presumably relies on that
    // fallback, since Windows-1252 cannot represent those characters — verify.
    $encodings = ['UTF-8', 'UTF-8-BOM', 'WINDOWS-1252'];

    foreach ($encodings as $encoding) {
        $tempFile = $this->createTestFile($csvContent, $encoding);

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertTrue(
                mb_check_encoding($result, 'UTF-8'),
                "CSV result should be valid UTF-8 for encoding: {$encoding}"
            );

            // Check that it contains expected international content
            $this->assertStringContainsString("José García", $result, "Should contain Spanish names");
            $this->assertStringContainsString("李小明", $result, "Should contain Chinese names");
            $this->assertStringContainsString("Müller", $result, "Should contain German names");
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
}