Improve csv file encoding support for imports
This commit is contained in:
parent
f4533421f1
commit
daf4391a30
|
|
@ -88,9 +88,9 @@ class BaseController extends Controller
|
|||
|
||||
/* Grouped permissions when we want to hide columns for particular permission groups*/
|
||||
|
||||
private array $client_exclusion_fields = ['balance', 'paid_to_date', 'credit_balance', 'client_hash'];
|
||||
private array $client_excludable_permissions = ['view_client'];
|
||||
private array $client_excludable_overrides = ['edit_client', 'edit_all', 'view_invoice', 'view_all', 'edit_invoice'];
|
||||
protected array $client_exclusion_fields = ['balance', 'paid_to_date', 'credit_balance', 'client_hash'];
|
||||
protected array $client_excludable_permissions = ['view_client'];
|
||||
protected array $client_excludable_overrides = ['edit_client', 'edit_all', 'view_invoice', 'view_all', 'edit_invoice'];
|
||||
|
||||
/* Grouped permissions when we want to hide columns for particular permission groups*/
|
||||
|
||||
|
|
|
|||
|
|
@ -112,6 +112,12 @@ class ClientController extends BaseController
|
|||
*/
|
||||
public function show(ShowClientRequest $request, Client $client)
{
    nlog("show");

    // Ask the authenticated user whether the client-field exclusion applies,
    // passing the permission group and the overrides that lift it.
    $shouldHideFields = auth()->user()->hasExcludedPermissions(
        $this->client_excludable_permissions,
        $this->client_excludable_overrides
    );

    if ($shouldHideFields) {
        nlog('hiding fields');

        // Drop the excluded attributes from the model before it is serialised.
        $client->makeHidden($this->client_exclusion_fields);
    }

    return $this->itemResponse($client);
}
|
||||
|
||||
|
|
@ -125,6 +131,12 @@ class ClientController extends BaseController
|
|||
*/
|
||||
public function edit(EditClientRequest $request, Client $client)
{
    nlog("Edit");

    // Ask the authenticated user whether the client-field exclusion applies,
    // passing the permission group and the overrides that lift it.
    $shouldHideFields = auth()->user()->hasExcludedPermissions(
        $this->client_excludable_permissions,
        $this->client_excludable_overrides
    );

    if ($shouldHideFields) {
        nlog('hiding fields');

        // Drop the excluded attributes from the model before it is serialised.
        $client->makeHidden($this->client_exclusion_fields);
    }

    return $this->itemResponse($client);
}
|
||||
|
||||
|
|
|
|||
|
|
@ -83,8 +83,8 @@ class ImportController extends Controller
|
|||
];
|
||||
/** @var UploadedFile $file */
|
||||
foreach ($request->files->get('files') as $entityType => $file) {
|
||||
$contents = file_get_contents($file->getPathname());
|
||||
// Store the csv in cache with an expiry of 10 minutes
|
||||
$contents = $this->readFileWithProperEncoding($file->getPathname());
|
||||
|
||||
Cache::put($hash.'-'.$entityType, base64_encode($contents), 1200);
|
||||
|
||||
// Parse CSV
|
||||
|
|
@ -104,6 +104,224 @@ class ImportController extends Controller
|
|||
return response()->json($data);
|
||||
}
|
||||
|
||||
/**
 * Read a file and return its contents as UTF-8.
 *
 * Order of attempts:
 *  1. UTF-16 / UTF-32 detection (BOM or NUL-byte heuristic) via
 *     detectAndHandleUTFEncoding().
 *  2. A leading BOM is stripped.
 *  3. Bytes that already are valid UTF-8 are returned as-is; if they contain
 *     U+FFFD replacement characters, fixCorruptedWindows1252() is applied.
 *  4. Bytes that are not valid UTF-8 are decoded as a single-byte Western
 *     encoding (Windows-1252 first).
 *
 * @param string $filePath path of the file to read
 * @return string the UTF-8 contents, or '' when the file cannot be read
 */
private function readFileWithProperEncoding(string $filePath): string
{
    // Deliberately best-effort: an unreadable or missing file yields ''.
    $raw = @file_get_contents($filePath);
    if ($raw === false) {
        return '';
    }

    // UTF-16 / UTF-32 input is converted in one step and returned directly.
    $fromWideEncoding = $this->detectAndHandleUTFEncoding($raw);
    if ($fromWideEncoding !== null) {
        return $fromWideEncoding;
    }

    $contents = $this->removeBOM($raw);

    if (mb_check_encoding($contents, 'UTF-8')) {
        if ($this->isValidConversion($contents)) {
            return $contents;
        }

        // The bytes are well-formed UTF-8 but carry replacement characters.
        // They must NOT be re-decoded as Windows-1252: that would turn every
        // U+FFFD (EF BF BD) into the three characters "ï¿½". Repair instead.
        $repaired = $this->fixCorruptedWindows1252($contents);

        return $this->isValidConversion($repaired) ? $repaired : $contents;
    }

    // Not UTF-8: treat as a single-byte encoding. Windows-1252 comes first
    // because it also covers the printable range of ISO-8859-1.
    foreach (['WINDOWS-1252', 'ISO-8859-1', 'ISO-8859-15'] as $encoding) {
        $converted = mb_convert_encoding($contents, 'UTF-8', $encoding);

        if (is_string($converted) && $this->isValidConversion($converted)) {
            return $converted;
        }
    }

    // Fallback: return the BOM-stripped original bytes.
    return $contents;
}
|
||||
|
||||
/**
 * Detect UTF-16 / UTF-32 input and convert it to UTF-8.
 *
 * Detection uses the byte order mark when present; otherwise the first 100
 * bytes are sampled for the NUL-byte pattern that ASCII / Latin-1 text
 * produces in those encodings.
 *
 * @param string $data raw file bytes
 * @return string|null the UTF-8 text, or null when the data does not look
 *                     like UTF-16 / UTF-32
 */
private function detectAndHandleUTFEncoding(string $data): ?string
{
    // The 4-byte UTF-32 marks must be tested before the 2-byte UTF-16 marks,
    // because the UTF-32LE mark begins with the UTF-16LE mark.
    $boms = [
        "\x00\x00\xFE\xFF" => 'UTF-32BE',
        "\xFF\xFE\x00\x00" => 'UTF-32LE',
        "\xFE\xFF" => 'UTF-16BE',
        "\xFF\xFE" => 'UTF-16LE',
    ];

    foreach ($boms as $bom => $encoding) {
        if (str_starts_with($data, $bom)) {
            return mb_convert_encoding(substr($data, strlen($bom)), 'UTF-8', $encoding);
        }
    }

    // No BOM: heuristic detection on a sample of the leading bytes.
    $length = strlen($data);
    $sampleEnd = min(100, $length);

    // UTF-32: code points below U+0100 have three NUL bytes per 4-byte unit,
    // leading for big-endian (00 00 00 41), trailing for little-endian (41 00 00 00).
    if ($length >= 8 && $length % 4 === 0) {
        $bigEndianUnits = 0;
        $littleEndianUnits = 0;

        for ($i = 0; $i < $sampleEnd; $i += 4) {
            $middleIsNull = $data[$i + 1] === "\x00" && $data[$i + 2] === "\x00";

            if ($middleIsNull && $data[$i] === "\x00") {
                $bigEndianUnits++;
            }
            if ($middleIsNull && $data[$i + 3] === "\x00") {
                $littleEndianUnits++;
            }
        }

        if ($bigEndianUnits > 5) {
            return mb_convert_encoding($data, 'UTF-8', 'UTF-32BE');
        }
        if ($littleEndianUnits > 5) {
            return mb_convert_encoding($data, 'UTF-8', 'UTF-32LE');
        }
    }

    // UTF-16: code points below U+0100 have one NUL byte per 2-byte unit,
    // second for little-endian (41 00), first for big-endian (00 41).
    if ($length >= 4 && $length % 2 === 0) {
        $littleEndianUnits = 0;
        $bigEndianUnits = 0;

        for ($i = 0; $i < $sampleEnd; $i += 2) {
            if ($data[$i + 1] === "\x00") {
                $littleEndianUnits++;
            }
            if ($data[$i] === "\x00") {
                $bigEndianUnits++;
            }
        }

        if ($littleEndianUnits > 10) {
            return mb_convert_encoding($data, 'UTF-8', 'UTF-16LE');
        }
        if ($bigEndianUnits > 10) {
            return mb_convert_encoding($data, 'UTF-8', 'UTF-16BE');
        }
    }

    return null;
}
|
||||
|
||||
/**
 * Remove a BOM (Byte Order Mark) from the beginning of a string.
 *
 * Recognises the UTF-8, UTF-16 (BE/LE) and UTF-32 (BE/LE) marks. At most one
 * mark is removed.
 *
 * @param string $data raw bytes
 * @return string $data without its leading BOM, or unchanged if none is present
 */
private function removeBOM(string $data): string
{
    // Longest marks first: the UTF-32LE mark (FF FE 00 00) begins with the
    // UTF-16LE mark (FF FE), so testing the short one first would strip only
    // half of it.
    $boms = [
        "\x00\x00\xFE\xFF", // UTF-32 BE
        "\xFF\xFE\x00\x00", // UTF-32 LE
        "\xEF\xBB\xBF",     // UTF-8
        "\xFE\xFF",         // UTF-16 BE
        "\xFF\xFE",         // UTF-16 LE
    ];

    foreach ($boms as $bom) {
        if (str_starts_with($data, $bom)) {
            return substr($data, strlen($bom));
        }
    }

    return $data;
}
|
||||
|
||||
/**
 * Tell whether $data contains any byte that Windows-1252 assigns a
 * printable character to in the 0x80-0x9F range.
 *
 * The check is byte-wise; it does not decode $data.
 */
private function containsWindows1252Bytes(string $data): bool
{
    // 0x80, 0x82-0x8C, 0x8E, 0x91-0x9C, 0x9E and 0x9F.
    $candidateBytes = "\x80\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8E"
        . "\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9E\x9F";

    // strpbrk() returns false when none of the listed bytes occurs.
    return strpbrk($data, $candidateBytes) !== false;
}
|
||||
|
||||
/**
 * Replace every UTF-8 replacement character (U+FFFD) in $data with a right
 * single quotation mark (U+2019).
 *
 * The original byte behind a replacement character is unknown; this
 * substitutes the right single quote on the stated assumption that it is the
 * most common case.
 */
private function fixCorruptedWindows1252(string $data): string
{
    $replacementCharacter = "\xEF\xBF\xBD";
    $rightSingleQuote = "\u{2019}";

    return str_replace($replacementCharacter, $rightSingleQuote, $data);
}
|
||||
|
||||
/**
 * Decide whether $data looks like a clean UTF-8 conversion result.
 *
 * The string is accepted only if all of the following hold:
 *  1. it is well-formed UTF-8;
 *  2. it contains no replacement character U+FFFD;
 *  3. it contains no double-encoded replacement character, i.e. the UTF-8
 *     bytes of U+FFFD read as Windows-1252 / Latin-1 and encoded again
 *     (the visible sequence "ï¿½").
 */
private function isValidConversion(string $data): bool
{
    // Byte escapes rather than literal characters, so the check does not
    // depend on how this source file itself is encoded.
    $replacementCharacter = "\xEF\xBF\xBD";
    $doubleEncodedReplacement = "\xC3\xAF\xC2\xBF\xC2\xBD";

    return mb_check_encoding($data, 'UTF-8')
        && !str_contains($data, $replacementCharacter)
        && !str_contains($data, $doubleEncodedReplacement);
}
|
||||
|
||||
private function setImportHints($entity_type, $available_keys, $headers): array
|
||||
{
|
||||
$hints = [];
|
||||
|
|
@ -203,7 +421,9 @@ class ImportController extends Controller
|
|||
|
||||
/** @var UploadedFile $file */
|
||||
foreach ($request->files->get('files') as $entityType => $file) {
|
||||
$contents = file_get_contents($file->getPathname());
|
||||
// $contents = file_get_contents($file->getPathname());
|
||||
$contents = $this->readFileWithProperEncoding($file->getPathname());
|
||||
|
||||
// Store the csv in cache with an expiry of 10 minutes
|
||||
Cache::put($hash.'-'.$entityType, base64_encode($contents), 600);
|
||||
nlog($hash.'-'.$entityType);
|
||||
|
|
@ -248,54 +468,9 @@ class ImportController extends Controller
|
|||
}
|
||||
}
|
||||
|
||||
return $this->convertData($data);
|
||||
return $data; // Remove the convertData call since we fixed encoding upfront
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Convert every top-level string value of $data to UTF-8.
 *
 * Non-string values are left untouched, as are strings detected as UTF-8 and
 * strings whose encoding cannot be detected.
 *
 * NOTE(review): mb_detect_encoding() rejects encoding names that the installed
 * mbstring does not know - confirm every entry of the list below is supported.
 *
 * @param array $data
 * @return array $data with its string values converted
 */
private function convertData(array $data): array
{
    // List of encodings to check against
    $encodings = [
        'UTF-8',
        'ISO-8859-1', // Latin-1
        'ISO-8859-2', // Latin-2
        'WINDOWS-1252', // CP1252
        'SHIFT-JIS',
        'EUC-JP',
        'GB2312',
        'GBK',
        'BIG5',
        'ISO-2022-JP',
        'KOI8-R',
        'KOI8-U',
        'WINDOWS-1251', // CP1251
        'UTF-16',
        'UTF-32',
        'ASCII',
        'WINDOWS-1254', // Turkish, which sometimes includes Georgian
        'WINDOWS-1256', // Arabic, which sometimes includes Georgian
        'ISO-8859-10',
    ];

    foreach ($data as $key => $value) {
        // Only process strings
        if (!is_string($value)) {
            continue;
        }

        $detectedEncoding = mb_detect_encoding($value, $encodings, true);

        // Write the converted value back into $data (the array that is
        // returned); writing to any other variable would discard it.
        if ($detectedEncoding && $detectedEncoding !== 'UTF-8') {
            $data[$key] = mb_convert_encoding($value, 'UTF-8', $detectedEncoding);
        }
    }

    return $data;
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the best delimiter
|
||||
*
|
||||
|
|
|
|||
|
|
@ -107,7 +107,7 @@ class ImportJsonController extends BaseController
|
|||
return response()->json(array_merge(['message' => 'Processing','success' => true], $metadata), 200);
|
||||
}
|
||||
|
||||
private function handleChunkedUpload(ImportJsonRequest $request)
|
||||
private function handleChunkedUploadX(ImportJsonRequest $request)
|
||||
{
|
||||
$metadata = json_decode($request->metadata, true);
|
||||
|
||||
|
|
@ -251,4 +251,146 @@ class ImportJsonController extends BaseController
|
|||
|
||||
rmdir($dir);
|
||||
}
|
||||
|
||||
/**
 * Store one chunk of a chunked upload and, once every chunk is present,
 * combine the chunks into the final file.
 *
 * $request->metadata is expected to be a JSON object with the keys fileHash,
 * fileName, totalChunks and currentChunk; the chunk itself is the uploaded
 * file named "file". Chunk numbers are zero-based, matching the 0..n-1 loop
 * in combineChunksFromS3().
 *
 * @return array the decoded metadata; 'uploaded_filepath' is added once the
 *               chunks were combined
 * @throws \InvalidArgumentException when the metadata or the chunk is invalid
 */
private function handleChunkedUpload(ImportJsonRequest $request)
{
    $metadata = json_decode($request->metadata, true);

    // Validate metadata structure (also catches JSON that failed to decode)
    if (!is_array($metadata) || !isset($metadata['fileHash'], $metadata['fileName'], $metadata['totalChunks'], $metadata['currentChunk'])) {
        throw new \InvalidArgumentException('Invalid metadata structure');
    }

    // The hash becomes part of a storage path: alphanumeric only. \A...\z is
    // used because "$" would also accept a trailing newline.
    $fileHash = $metadata['fileHash'];
    if (!is_string($fileHash) || preg_match('/\A[a-zA-Z0-9]+\z/', $fileHash) !== 1) {
        throw new \InvalidArgumentException('Invalid file hash format');
    }

    // The file name must be a bare name: no directory part, not empty and
    // not a dot entry.
    $fileName = $metadata['fileName'];
    $safeFileName = is_string($fileName) ? basename($fileName) : '';
    if ($safeFileName !== $fileName || in_array($safeFileName, ['', '.', '..'], true)) {
        throw new \InvalidArgumentException('Invalid filename');
    }

    // Chunk numbers must be whole numbers; is_numeric() would also accept
    // values such as "1.5" or "1e3", which produce unusable chunk keys.
    $currentChunk = filter_var($metadata['currentChunk'], FILTER_VALIDATE_INT, ['options' => ['min_range' => 0]]);
    if ($currentChunk === false) {
        throw new \InvalidArgumentException('Invalid chunk number');
    }

    $totalChunks = filter_var($metadata['totalChunks'], FILTER_VALIDATE_INT, ['options' => ['min_range' => 1, 'max_range' => 1000]]);
    if ($totalChunks === false) {
        throw new \InvalidArgumentException('Invalid total chunks');
    }

    // A chunk outside 0..totalChunks-1 can never be part of the combined file.
    if ($currentChunk >= $totalChunks) {
        throw new \InvalidArgumentException('Invalid chunk number');
    }

    // Validate the uploaded chunk
    $chunk = $request->file('file');
    if (!$chunk || !$chunk->isValid()) {
        throw new \InvalidArgumentException('Invalid file chunk');
    }

    // Validate file size before saving
    $maxChunkSize = 5 * 1024 * 1024; // 5MB
    if ($chunk->getSize() > $maxChunkSize) {
        throw new \InvalidArgumentException('Chunk size exceeds limit');
    }

    $disk = Ninja::isHosted() ? 'backup' : config('filesystems.default');
    $storage = Storage::disk($disk);

    // Store the chunk under a per-upload prefix
    $chunkPrefix = "tmp/chunks/{$fileHash}/";

    $storage->put(
        "{$chunkPrefix}chunk-{$currentChunk}",
        file_get_contents($chunk->getRealPath()),
        ['visibility' => 'private']
    );

    // Check whether all chunks are uploaded by listing the stored objects
    $uploadedChunks = collect($storage->files($chunkPrefix))
        ->filter(function ($file) {
            return str_contains(basename($file), 'chunk-');
        })
        ->count();

    if ($uploadedChunks < $totalChunks) {
        return $metadata;
    }

    try {
        $finalPath = "migrations/{$safeFileName}";
        $this->combineChunksFromS3($disk, $fileHash, $totalChunks, $finalPath);

        // Clean up
        $this->cleanupS3Chunks($disk, $fileHash);

        $metadata['uploaded_filepath'] = $finalPath;

        return $metadata;
    } catch (\Exception $e) {
        // Clean up on error, then let the caller see the failure
        $this->cleanupS3Chunks($disk, $fileHash);

        throw $e;
    }
}
|
||||
|
||||
/**
 * Concatenate chunks 0..$totalChunks-1 of an upload into a single file and
 * store it at $finalPath on the same disk.
 *
 * The chunks are appended to a local temporary file, which is then uploaded
 * as a stream so the combined file is never held in memory as a whole.
 *
 * @param string $disk        name of the storage disk holding the chunks
 * @param string $fileHash    identifier of the upload (part of the chunk path)
 * @param int    $totalChunks number of chunks to combine
 * @param string $finalPath   destination path on $disk
 * @throws \RuntimeException when a chunk is missing or unreadable, or a local
 *                           or remote write fails
 */
private function combineChunksFromS3(string $disk, string $fileHash, int $totalChunks, string $finalPath): void
{
    $storage = Storage::disk($disk);

    // Create a temporary local file to combine chunks
    $tempFile = tempnam(sys_get_temp_dir(), 'chunk_combine_');
    if ($tempFile === false) {
        throw new \RuntimeException('Failed to create temporary file');
    }

    $handle = null;

    try {
        $handle = fopen($tempFile, 'wb');
        if ($handle === false) {
            throw new \RuntimeException('Failed to create temporary file');
        }

        // Download and append the chunks in order
        for ($i = 0; $i < $totalChunks; $i++) {
            $chunkKey = "tmp/chunks/{$fileHash}/chunk-{$i}";

            if (!$storage->exists($chunkKey)) {
                throw new \RuntimeException("Missing chunk: {$i}");
            }

            $chunkContent = $storage->get($chunkKey);
            if ($chunkContent === null) {
                throw new \RuntimeException("Failed to read chunk: {$i}");
            }

            if (fwrite($handle, $chunkContent) === false) {
                throw new \RuntimeException("Failed to write chunk: {$i}");
            }
        }

        fclose($handle);

        // Re-open for reading and upload the combined file as a stream
        $handle = fopen($tempFile, 'rb');
        if ($handle === false) {
            throw new \RuntimeException('Failed to read combined file');
        }

        $stored = $storage->put($finalPath, $handle, ['visibility' => 'private']);
        if ($stored === false) {
            throw new \RuntimeException('Failed to store combined file');
        }
    } finally {
        // Close whichever handle is still open (also on the error paths),
        // then remove the temporary file.
        if (is_resource($handle)) {
            fclose($handle);
        }

        if (file_exists($tempFile)) {
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Delete every stored chunk file belonging to the given upload.
 *
 * @param string $disk     name of the storage disk holding the chunks
 * @param string $fileHash identifier of the upload (part of the chunk path)
 */
private function cleanupS3Chunks(string $disk, string $fileHash): void
{
    // List everything stored under this upload's chunk prefix
    $storedChunks = Storage::disk($disk)->files("tmp/chunks/{$fileHash}/");

    // Nothing stored, nothing to delete
    if (empty($storedChunks)) {
        return;
    }

    Storage::disk($disk)->delete($storedChunks);
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -108,7 +108,7 @@ class BaseImport
|
|||
nlog("found {$entity_type}");
|
||||
|
||||
$csv = base64_decode($base64_encoded_csv);
|
||||
$csv = mb_convert_encoding($csv, 'UTF-8', 'UTF-8');
|
||||
// $csv = mb_convert_encoding($csv, 'UTF-8', 'UTF-8');
|
||||
|
||||
$csv = Reader::createFromString($csv);
|
||||
$csvdelimiter = self::detectDelimiter($csv);
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ class ZipInvoices implements ShouldQueue
|
|||
|
||||
public $tries = 1;
|
||||
|
||||
public $timeout = 3600;
|
||||
public $timeout = 10800;
|
||||
|
||||
/**
|
||||
* @param $invoices
|
||||
|
|
|
|||
|
|
@ -0,0 +1,480 @@
|
|||
<?php
|
||||
|
||||
namespace Tests\Unit;
|
||||
|
||||
use Tests\TestCase;
|
||||
use App\Http\Controllers\ImportController;
|
||||
use Illuminate\Http\UploadedFile;
|
||||
use Illuminate\Support\Facades\Storage;
|
||||
use ReflectionClass;
|
||||
use ReflectionMethod;
|
||||
|
||||
class ImportEncodingTest extends TestCase
|
||||
{
|
||||
private ImportController $controller;
|
||||
private ReflectionMethod $readFileMethod;
|
||||
private ReflectionMethod $containsWindows1252Method;
|
||||
private ReflectionMethod $fixCorruptedMethod;
|
||||
private ReflectionMethod $isValidConversionMethod;
|
||||
|
||||
protected function setUp(): void
{
    parent::setUp();

    $this->controller = new ImportController();

    // The methods under test are private; expose each one through
    // reflection and keep the handle in the matching test property.
    $controllerClass = new ReflectionClass($this->controller);

    $exposedMethods = [
        'readFileMethod' => 'readFileWithProperEncoding',
        'containsWindows1252Method' => 'containsWindows1252Bytes',
        'fixCorruptedMethod' => 'fixCorruptedWindows1252',
        'isValidConversionMethod' => 'isValidConversion',
    ];

    foreach ($exposedMethods as $property => $methodName) {
        $method = $controllerClass->getMethod($methodName);
        $method->setAccessible(true);

        $this->{$property} = $method;
    }
}
|
||||
|
||||
/**
 * ASCII-only sample strings, keyed by scenario name, used by the tests that
 * must work identically in every encoding.
 */
private function getTestData(): array
{
    $samples = [];

    $samples['basic'] = "Company's text with quotes";
    $samples['apostrophes'] = "Sya's Ian Le Led";
    $samples['quotes'] = '"Smart quotes" and \'single quotes\'';
    // Currency, symbol and accent samples are kept free of non-ASCII characters here
    $samples['currency'] = "Price: 50.00, 25.99";
    $samples['symbols'] = "Trademark and copyright symbols";
    $samples['accents'] = "Cafe resume naive facade";

    return $samples;
}
|
||||
|
||||
/**
 * Sample strings containing non-ASCII characters, keyed by scenario name,
 * for the encoding-specific tests.
 */
private function getComplexTestData(): array
{
    $samples = [];

    $samples['complex'] = "Company's «quoted» text—dash…ellipsis";
    $samples['currency'] = "Price: €50.00, £25.99";
    $samples['symbols'] = "Trademark™ and copyright© symbols";
    $samples['accents'] = "Café résumé naïve piñata façade";

    return $samples;
}
|
||||
|
||||
/**
 * Map of Windows-1252 byte values in the 0x80-0x9F range to the character
 * each one is expected to decode to.
 */
private function getWindows1252SpecialChars(): array
{
    $byteToCharacter = [
        // 0x80 - 0x8E
        0x80 => '€', // Euro sign
        0x82 => '‚', // Single low-9 quotation mark
        0x83 => 'ƒ', // Latin small letter f with hook
        0x84 => '„', // Double low-9 quotation mark
        0x85 => '…', // Horizontal ellipsis
        0x86 => '†', // Dagger
        0x87 => '‡', // Double dagger
        0x88 => 'ˆ', // Modifier letter circumflex accent
        0x89 => '‰', // Per mille sign
        0x8A => 'Š', // Latin capital letter S with caron
        0x8B => '‹', // Single left-pointing angle quotation mark
        0x8C => 'Œ', // Latin capital ligature OE
        0x8E => 'Ž', // Latin capital letter Z with caron

        // 0x91 - 0x9F
        0x91 => "\u{2018}", // Left single quotation mark (smart quote)
        0x92 => "\u{2019}", // Right single quotation mark (smart quote)
        0x93 => "\u{201C}", // Left double quotation mark
        0x94 => "\u{201D}", // Right double quotation mark
        0x95 => '•', // Bullet
        0x96 => '–', // En dash
        0x97 => '—', // Em dash
        0x98 => '˜', // Small tilde
        0x99 => '™', // Trade mark sign
        0x9A => 'š', // Latin small letter s with caron
        0x9B => '›', // Single right-pointing angle quotation mark
        0x9C => 'œ', // Latin small ligature oe
        0x9E => 'ž', // Latin small letter z with caron
        0x9F => 'Ÿ', // Latin capital letter Y with diaeresis
    ];

    return $byteToCharacter;
}
|
||||
|
||||
/**
 * Write $content to a new temporary file in the requested encoding and
 * return the file's path.
 *
 * $content is taken to be UTF-8. Besides real encoding names, three pseudo
 * encodings are understood: 'UTF-8' (written as-is), 'UTF-8-BOM' (UTF-8 BOM
 * prepended) and 'UTF-8-CORRUPTED' (every apostrophe replaced by the U+FFFD
 * bytes).
 */
private function createTestFile(string $content, string $encoding): string
{
    $path = tempnam(sys_get_temp_dir(), 'encoding_test_');

    $bytes = match ($encoding) {
        'UTF-8-BOM' => "\xEF\xBB\xBF" . $content,
        // Simulate corrupted UTF-8 with replacement characters
        'UTF-8-CORRUPTED' => str_replace("'", "\xEF\xBF\xBD", $content),
        'UTF-8' => $content,
        // Anything else is a real target encoding
        default => mb_convert_encoding($content, $encoding, 'UTF-8'),
    };

    file_put_contents($path, $bytes);

    return $path;
}
|
||||
|
||||
/**
 * Test 1: UTF-8 clean files (should pass through unchanged)
 */
public function testCleanUtf8Files()
{
    foreach ($this->getTestData() as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertEquals($content, $result, "Clean UTF-8 test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Validation failed for clean UTF-8: {$name}"
            );
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Test 2: UTF-8 with BOM (the BOM must be removed, the content kept)
 */
public function testUtf8WithBom()
{
    foreach ($this->getTestData() as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8-BOM');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            // Should remove BOM and return clean content
            $this->assertEquals($content, $result, "UTF-8 BOM test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Validation failed for UTF-8 BOM: {$name}"
            );
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Test 3: Windows-1252 files (must come back as the original UTF-8 text)
 */
public function testWindows1252Files()
{
    // Test with complex Unicode characters for Windows-1252
    foreach ($this->getComplexTestData() as $name => $content) {
        $tempFile = $this->createTestFile($content, 'WINDOWS-1252');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertEquals($content, $result, "Windows-1252 test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Validation failed for Windows-1252: {$name}"
            );
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Test 3.5: Complex UTF-8 files with Unicode characters (must pass through unchanged)
 */
public function testComplexUtf8Files()
{
    foreach ($this->getComplexTestData() as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertEquals($content, $result, "Complex UTF-8 test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Validation failed for complex UTF-8: {$name}"
            );
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Test 4: ISO-8859-1 files (must come back as the original UTF-8 text)
 */
public function testIso88591Files()
{
    // Use only characters that exist in ISO-8859-1
    $testData = [
        'basic' => "Company's text",
        'accents' => "Café résumé naïve façade",
    ];

    foreach ($testData as $name => $content) {
        $tempFile = $this->createTestFile($content, 'ISO-8859-1');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertEquals($content, $result, "ISO-8859-1 test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Validation failed for ISO-8859-1: {$name}"
            );
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Test 5: Corrupted UTF-8 with replacement characters
 *
 * The fixture replaces every apostrophe with U+FFFD; the reader is expected
 * to return the text with U+2019 in those positions.
 */
public function testCorruptedUtf8Files()
{
    foreach ($this->getTestData() as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8-CORRUPTED');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            // Expected result should have smart quotes instead of straight apostrophes
            $expectedContent = str_replace("'", "\u{2019}", $content);

            $this->assertEquals($expectedContent, $result, "Corrupted UTF-8 test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Validation failed for corrupted UTF-8: {$name}"
            );
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Test 6: All Windows-1252 special characters
 *
 * Writes each byte from getWindows1252SpecialChars() raw into a file and
 * expects the reader to return the mapped character.
 */
public function testAllWindows1252SpecialCharacters()
{
    foreach ($this->getWindows1252SpecialChars() as $byte => $expectedChar) {
        $tempFile = tempnam(sys_get_temp_dir(), 'char_test_');

        try {
            // Write raw bytes including the Windows-1252 character
            file_put_contents($tempFile, "Test " . chr($byte) . " character");

            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertEquals(
                "Test {$expectedChar} character",
                $result,
                "Windows-1252 character test failed for byte 0x" . dechex($byte) . " ({$expectedChar})"
            );
        } finally {
            // Remove the fixture even when the assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Test 7: containsWindows1252Bytes method
 */
public function testContainsWindows1252Bytes()
{
    // Each case: input bytes, whether detection is expected, failure message
    $cases = [
        // Test with Windows-1252 bytes
        ["Test " . chr(0x92) . " content", true, "Should detect Windows-1252 bytes"],
        // Test without Windows-1252 bytes
        ["Test clean content", false, "Should not detect Windows-1252 bytes in clean data"],
        // Test with UTF-8 replacement characters
        ["Test \xEF\xBF\xBD content", false, "Should not detect Windows-1252 bytes in corrupted UTF-8"],
    ];

    foreach ($cases as [$input, $shouldDetect, $message]) {
        $detected = $this->containsWindows1252Method->invoke($this->controller, $input);

        if ($shouldDetect) {
            $this->assertTrue($detected, $message);
        } else {
            $this->assertFalse($detected, $message);
        }
    }
}
|
||||
|
||||
/**
 * Test 8: fixCorruptedWindows1252 method
 */
public function testFixCorruptedWindows1252()
{
    // A U+FFFD replacement character must come back as U+2019
    $repaired = $this->fixCorruptedMethod->invoke($this->controller, "Sya\xEF\xBF\xBDs In Le");

    $this->assertEquals("Sya\u{2019}s In Le", $repaired, "Failed to fix corrupted Windows-1252 data");
}
|
||||
|
||||
/**
 * Test 9: isValidConversion method
 */
public function testIsValidConversion()
{
    // Each case: input, whether it must be accepted, failure message
    $cases = [
        // Valid UTF-8 without replacement characters
        ["Clean UTF-8 content with apostrophe's", true, "Should validate clean UTF-8 content"],
        // Invalid - contains replacement character bytes
        ["Content with \xEF\xBF\xBD replacement", false, "Should reject content with UTF-8 replacement bytes"],
        // Invalid - contains double-encoded replacement
        ["Content with � replacement", false, "Should reject content with double-encoded replacement"],
        // Invalid UTF-8
        ["Invalid \xFF UTF-8", false, "Should reject invalid UTF-8"],
    ];

    foreach ($cases as [$input, $shouldBeValid, $message]) {
        $verdict = $this->isValidConversionMethod->invoke($this->controller, $input);

        if ($shouldBeValid) {
            $this->assertTrue($verdict, $message);
        } else {
            $this->assertFalse($verdict, $message);
        }
    }
}
|
||||
|
||||
/**
 * Test 10: Multiple encoding types comprehensive test
 *
 * Whatever the source encoding, the reader must return well-formed UTF-8
 * without replacement characters.
 */
public function testMultipleEncodingTypes()
{
    $encodings = [
        'UTF-8',
        'WINDOWS-1252',
        'ISO-8859-1',
        'ISO-8859-15',
        'ASCII',
    ];

    $testContent = "Company's «test» data—with symbols";

    foreach ($encodings as $encoding) {
        // ASCII can't handle special characters, use simpler content
        $content = $encoding === 'ASCII' ? "Company data test" : $testContent;

        $tempFile = $this->createTestFile($content, $encoding);

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            // Result should always be valid UTF-8
            $this->assertTrue(
                mb_check_encoding($result, 'UTF-8'),
                "Result should be valid UTF-8 for encoding: {$encoding}"
            );

            // Should not contain replacement characters (U+FFFD, written as
            // an escape so it survives any re-encoding of this file)
            $this->assertFalse(
                str_contains($result, "\u{FFFD}"),
                "Result should not contain replacement characters for encoding: {$encoding}"
            );
        } finally {
            // Remove the fixture even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Test 11: Backward compatibility - existing functionality should not break.
 *
 * A plain UTF-8 CSV written by createTestFile() must come back from
 * $this->readFileMethod unchanged, apostrophes included.
 */
public function testBackwardCompatibility()
{
    // Test that normal CSV content still works
    $csvContent = "Name,Amount,Date\n\"John's Company\",100.50,2024-01-01\n\"Mary's Store\",250.75,2024-01-02";

    $tempFile = $this->createTestFile($csvContent, 'UTF-8');

    try {
        $result = $this->readFileMethod->invoke($this->controller, $tempFile);

        $this->assertEquals($csvContent, $result, "Backward compatibility test failed for CSV content");

        // Test that it contains expected structure
        $this->assertStringContainsString("John's Company", $result, "CSV should contain original apostrophes");
        $this->assertStringContainsString("Mary's Store", $result, "CSV should contain original apostrophes");
    } finally {
        // Remove the temp file even when an assertion above fails.
        unlink($tempFile);
    }
}
|
||||
|
||||
/**
 * Test 12: Edge cases and error handling.
 *
 * Covers three inputs to $this->readFileMethod: an empty file, a path that
 * does not exist, and a large repeated string written with the
 * 'WINDOWS-1252' label.
 */
public function testEdgeCases()
{
    // Empty file
    $tempFile = tempnam(sys_get_temp_dir(), 'empty_test_');

    try {
        file_put_contents($tempFile, '');

        $result = $this->readFileMethod->invoke($this->controller, $tempFile);
        $this->assertEquals('', $result, "Empty file should return empty string");
    } finally {
        // Remove the temp file even when the assertion above fails.
        unlink($tempFile);
    }

    // Non-existent file
    $result = $this->readFileMethod->invoke($this->controller, '/non/existent/file.csv');
    $this->assertEquals('', $result, "Non-existent file should return empty string");

    // Very large content with mixed characters
    $largeContent = str_repeat("Test's data with special chars—", 1000);
    $tempFile = $this->createTestFile($largeContent, 'WINDOWS-1252');

    try {
        $result = $this->readFileMethod->invoke($this->controller, $tempFile);
        $this->assertTrue(
            $this->isValidConversionMethod->invoke($this->controller, $result),
            "Large file conversion should be valid"
        );
    } finally {
        unlink($tempFile);
    }
}
|
||||
|
||||
/**
 * Test 13: Performance test to ensure no significant regression.
 *
 * Reads a 10,000-line file through $this->readFileMethod and requires the
 * call to finish in under one second and to yield a valid conversion.
 */
public function testPerformance()
{
    $content = str_repeat("Company's data with special characters test\n", 10000);
    $tempFile = $this->createTestFile($content, 'WINDOWS-1252');

    try {
        // hrtime() is monotonic, so the measurement is not skewed by
        // wall-clock adjustments while the call runs.
        $startedAt = hrtime(true);
        $result = $this->readFileMethod->invoke($this->controller, $tempFile);
        $processingTime = (hrtime(true) - $startedAt) / 1e9; // nanoseconds -> seconds

        // Should process reasonably fast (less than 1 second for 10k lines)
        $this->assertLessThan(1.0, $processingTime, "Processing should be reasonably fast");
        $this->assertTrue(
            $this->isValidConversionMethod->invoke($this->controller, $result),
            "Performance test result should be valid"
        );
    } finally {
        // Remove the temp file even when an assertion above fails.
        unlink($tempFile);
    }
}
|
||||
}
|
||||
|
|
@ -0,0 +1,511 @@
|
|||
<?php
|
||||
|
||||
namespace Tests\Unit;
|
||||
|
||||
use Tests\TestCase;
|
||||
use App\Http\Controllers\ImportController;
|
||||
use ReflectionClass;
|
||||
use ReflectionMethod;
|
||||
|
||||
class ImportUnicodeEncodingTest extends TestCase
|
||||
{
|
||||
private ImportController $controller;
|
||||
private ReflectionMethod $readFileMethod;
|
||||
private ReflectionMethod $isValidConversionMethod;
|
||||
private ReflectionMethod $removeBOMMethod;
|
||||
|
||||
/**
 * Build a fresh ImportController and expose the three non-public helpers
 * under test as ReflectionMethod handles.
 */
protected function setUp(): void
{
    parent::setUp();

    $this->controller = new ImportController();

    $controllerClass = new ReflectionClass($this->controller);

    // Look up a method by name and make it invokable from the test.
    $expose = static function (string $methodName) use ($controllerClass): ReflectionMethod {
        $handle = $controllerClass->getMethod($methodName);
        $handle->setAccessible(true);

        return $handle;
    };

    $this->readFileMethod = $expose('readFileWithProperEncoding');
    $this->isValidConversionMethod = $expose('isValidConversion');
    $this->removeBOMMethod = $expose('removeBOM');
}
|
||||
|
||||
/**
 * Sample strings keyed by a short label, covering several scripts, symbol
 * sets and special Unicode cases.
 *
 * @return array<string, string>
 */
private function getUnicodeTestData(): array
{
    // Latin, Greek and Cyrillic alphabets
    $alphabets = [
        'latin_basic' => "Hello World! Company's data",
        'latin_extended' => "Café résumé naïve piñata façade",
        'greek' => "Καλημέρα κόσμε! Ελληνικά γράμματα",
        'cyrillic' => "Привет мир! Русский текст",
    ];

    // Right-to-left scripts
    $rightToLeft = [
        'arabic' => "مرحبا بالعالم! النص العربي",
        'hebrew' => "שלום עולם! טקסט עברי",
    ];

    // Chinese (simplified and traditional), Japanese, Korean
    $eastAsian = [
        'chinese_simplified' => "你好世界!简体中文",
        'chinese_traditional' => "你好世界!繁體中文",
        'japanese' => "こんにちは世界!ひらがな・カタカナ・漢字",
        'korean' => "안녕하세요 세계! 한국어 텍스트",
    ];

    // Mathematical, currency and emoji symbols
    $symbols = [
        'mathematical' => "∑∫∞±≤≥≠√∂∇∆",
        'currency' => "€£¥₹₽₨₩₪₦₡₸",
        'emoji' => "😀🌍🚀💻📊✨🎉🔥💡⭐",
    ];

    // Mixed scripts, zero-width characters, combining marks, typographic punctuation
    $special = [
        'mixed_scripts' => "Hello мир 世界 🌍 café résumé",
        'zero_width' => "Text\u{200B}with\u{FEFF}zero\u{200C}width\u{200D}chars",
        'combining' => "e\u{0301}a\u{0300}i\u{0302}o\u{0303}u\u{0308}", // é à î õ ü
        'punctuation' => "«quotes» \u{201C}smart\u{201D} \u{2018}quotes\u{2019} — – … ‚ „",
    ];

    // All keys are distinct strings, so array_merge keeps every entry in order.
    return array_merge($alphabets, $rightToLeft, $eastAsian, $symbols, $special);
}
|
||||
|
||||
/**
 * Encoding labels exercised by the extended compatibility test.
 *
 * @return list<string>
 */
private function getExtendedEncodings(): array
{
    // Unicode variants
    $unicode = ['UTF-8', 'UTF-8-BOM', 'UTF-16BE', 'UTF-16LE', 'UTF-32BE', 'UTF-32LE'];

    // ISO Latin variants (commonly supported)
    $iso = [
        'ISO-8859-1',  // Western European
        'ISO-8859-2',  // Central European
        'ISO-8859-5',  // Cyrillic
        'ISO-8859-7',  // Greek
        'ISO-8859-9',  // Turkish
        'ISO-8859-15', // Western European (with Euro)
    ];

    // Windows code pages (commonly supported)
    $windows = [
        'Windows-1251', // Cyrillic
        'Windows-1252', // Western European
    ];

    // Other commonly supported encodings
    $other = [
        'CP1252', // Windows Western
    ];

    return array_merge($unicode, $iso, $windows, $other);
}
|
||||
|
||||
/**
 * Create a temporary file holding $content written in the given encoding.
 *
 * - 'UTF-8-BOM' writes the UTF-8 BOM followed by $content unchanged.
 * - 'UTF-16BE', 'UTF-16LE', 'UTF-32BE', 'UTF-32LE' write the matching BOM
 *   followed by $content converted from UTF-8.
 * - 'UTF-8' writes $content unchanged.
 * - Any other label is converted with mb_convert_encoding() when it appears
 *   in mb_list_encodings(); otherwise $content is written unchanged.
 *
 * The caller is responsible for deleting the returned file.
 *
 * @param string $content  UTF-8 text to write
 * @param string $encoding Encoding label (see above)
 *
 * @return string Path of the temporary file
 */
private function createTestFile(string $content, string $encoding): string
{
    $tempFile = tempnam(sys_get_temp_dir(), 'unicode_test_');

    // label => [BOM bytes, mbstring target encoding or null for "no conversion"]
    $bomVariants = [
        'UTF-8-BOM' => ["\xEF\xBB\xBF", null],
        'UTF-16BE' => ["\xFE\xFF", 'UTF-16BE'],
        'UTF-16LE' => ["\xFF\xFE", 'UTF-16LE'],
        'UTF-32BE' => ["\x00\x00\xFE\xFF", 'UTF-32BE'],
        'UTF-32LE' => ["\xFF\xFE\x00\x00", 'UTF-32LE'],
    ];

    if (isset($bomVariants[$encoding])) {
        [$bom, $target] = $bomVariants[$encoding];
        $bytes = $bom . ($target === null ? $content : mb_convert_encoding($content, $target, 'UTF-8'));
    } elseif ($encoding === 'UTF-8') {
        $bytes = $content;
    } elseif (!in_array($encoding, mb_list_encodings(), true)) {
        // If encoding not supported, use UTF-8 fallback.
        // NOTE(review): this comparison is case-sensitive and does not consult
        // aliases, so labels such as 'WINDOWS-1252' or 'CP1252' presumably
        // take this fallback and are written as UTF-8 — confirm that is
        // intended by the callers.
        $bytes = $content;
    } else {
        try {
            $bytes = mb_convert_encoding($content, $encoding, 'UTF-8');
        } catch (\Exception | \ValueError $e) {
            // Fully qualified on purpose: unqualified names would resolve
            // inside the Tests\Unit namespace and never match.
            // If conversion fails, use UTF-8 fallback.
            $bytes = $content;
        }
    }

    file_put_contents($tempFile, $bytes);

    return $tempFile;
}
|
||||
|
||||
/**
 * Test 1: Unicode content preservation across different UTF encodings.
 *
 * Every sample from getUnicodeTestData() is written in each UTF variant and
 * must come back from readFileWithProperEncoding() identical to the original
 * and accepted by isValidConversion().
 */
public function testUnicodeContentPreservation()
{
    $unicodeEncodings = ['UTF-8', 'UTF-8-BOM', 'UTF-16BE', 'UTF-16LE', 'UTF-32BE', 'UTF-32LE'];

    foreach ($this->getUnicodeTestData() as $name => $content) {
        foreach ($unicodeEncodings as $encoding) {
            $tempFile = $this->createTestFile($content, $encoding);

            try {
                $result = $this->readFileMethod->invoke($this->controller, $tempFile);

                $this->assertEquals(
                    $content,
                    $result,
                    "Unicode preservation failed for {$name} with {$encoding} encoding"
                );

                $this->assertTrue(
                    $this->isValidConversionMethod->invoke($this->controller, $result),
                    "Validation failed for {$name} with {$encoding} encoding"
                );
            } finally {
                // Remove the temp file even when an assertion above fails.
                unlink($tempFile);
            }
        }
    }
}
|
||||
|
||||
/**
 * Test 2: BOM handling for different UTF variants.
 *
 * For each UTF variant, writes a file that starts with that variant's BOM
 * and checks that readFileWithProperEncoding() returns the original text
 * without it. Then checks removeBOM() directly on a UTF-8 string.
 */
public function testBOMHandlingForAllUTF()
{
    $testContent = "Hello 世界! Тест العالم";

    $bomTests = [
        'UTF-8' => "\xEF\xBB\xBF",
        'UTF-16BE' => "\xFE\xFF",
        'UTF-16LE' => "\xFF\xFE",
        'UTF-32BE' => "\x00\x00\xFE\xFF",
        'UTF-32LE' => "\xFF\xFE\x00\x00",
    ];

    foreach ($bomTests as $encoding => $bom) {
        // Write the BOM explicitly so every variant, UTF-8 included, really
        // starts with its byte order mark.
        $tempFile = tempnam(sys_get_temp_dir(), 'unicode_test_');

        try {
            file_put_contents($tempFile, $bom . mb_convert_encoding($testContent, $encoding, 'UTF-8'));

            // Test file processing with BOM
            $fileResult = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertEquals(
                $testContent,
                $fileResult,
                "File processing with BOM failed for {$encoding}"
            );

            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $fileResult),
                "BOM file validation failed for {$encoding}"
            );
        } finally {
            // Remove the temp file even when an assertion above fails.
            unlink($tempFile);
        }
    }

    // Test UTF-8 BOM removal specifically (since that's what the method is designed for)
    $utf8DataWithBOM = "\xEF\xBB\xBF" . $testContent;
    $result = $this->removeBOMMethod->invoke($this->controller, $utf8DataWithBOM);

    $this->assertEquals(
        $testContent,
        $result,
        "UTF-8 BOM removal failed"
    );
}
|
||||
|
||||
/**
 * Test 3: Extended encoding compatibility.
 *
 * Writes plain ASCII-range text under every label from
 * getExtendedEncodings() and checks that the value read back is valid UTF-8,
 * contains no U+FFFD replacement character and passes isValidConversion().
 */
public function testExtendedEncodingCompatibility()
{
    // Use content that's compatible with most encodings
    $basicContent = "Company data with special chars";
    $accentContent = "Cafe resume naive facade"; // Without actual accents for broader compatibility

    foreach ($this->getExtendedEncodings() as $encoding) {
        // ASCII-only labels get $basicContent, everything else $accentContent
        // (both strings contain only ASCII characters).
        $content = $this->isAsciiCompatibleEncoding($encoding) ? $basicContent : $accentContent;

        $tempFile = $this->createTestFile($content, $encoding);

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            // Result should always be valid UTF-8
            $this->assertTrue(
                mb_check_encoding($result, 'UTF-8'),
                "Result should be valid UTF-8 for encoding: {$encoding}"
            );

            // Should not contain replacement characters. U+FFFD is written as
            // an escape so the needle cannot be corrupted by the source
            // file's own encoding.
            $this->assertFalse(
                str_contains($result, "\u{FFFD}"),
                "Result should not contain replacement characters for encoding: {$encoding}"
            );

            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Validation failed for encoding: {$encoding}"
            );
        } finally {
            // Remove the temp file even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Test 4: Right-to-left (RTL) text handling.
 *
 * Arabic, Hebrew and mixed-direction samples written as UTF-8 must be
 * returned unchanged and pass isValidConversion().
 */
public function testRightToLeftTextHandling()
{
    $rtlContent = [
        'arabic' => "مرحبا بالعالم! شركة البيانات",
        'hebrew' => "שלום עולם! חברת הנתונים",
        'mixed_rtl' => "Hello مرحبا World עולם!",
    ];

    foreach ($rtlContent as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertEquals($content, $result, "RTL test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "RTL validation failed for: {$name}"
            );
        } finally {
            // Remove the temp file even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Test 5: Asian character sets (CJK).
 *
 * Chinese, Japanese and Korean samples written as UTF-8 must be returned
 * unchanged and pass isValidConversion().
 */
public function testAsianCharacterSets()
{
    $cjkContent = [
        'chinese_simplified' => "公司数据处理系统",
        'chinese_traditional' => "公司資料處理系統",
        'japanese_hiragana' => "かいしゃのでーたしすてむ",
        'japanese_katakana' => "カイシャノデータシステム",
        'japanese_kanji' => "会社のデータシステム",
        'korean' => "회사 데이터 시스템",
        'mixed_cjk' => "Company 公司 会社 회사 Data",
    ];

    foreach ($cjkContent as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertEquals($content, $result, "CJK test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "CJK validation failed for: {$name}"
            );
        } finally {
            // Remove the temp file even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Test 6: Emoji and symbol handling.
 *
 * Emoji, mathematical, currency, technical and arrow symbols written as
 * UTF-8 must be returned unchanged and pass isValidConversion().
 */
public function testEmojiAndSymbolHandling()
{
    $symbolContent = [
        'basic_emoji' => "Data 📊 Reports 📈 Analysis 🔍",
        'complex_emoji' => "👨💻👩💼🏢💼📋📊📈📉",
        'mathematical' => "∑(x²) ∫f(x)dx ∞ ≠ ≤ ≥ ± √",
        'currency_symbols' => "Price: €100 £80 ¥1000 $75",
        'technical_symbols' => "® © ™ § ¶ † ‡ • ‰ ‱",
        'arrows_symbols' => "← → ↑ ↓ ↔ ↕ ⇐ ⇒ ⇔",
    ];

    foreach ($symbolContent as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertEquals($content, $result, "Symbol test failed for: {$name}");
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Symbol validation failed for: {$name}"
            );
        } finally {
            // Remove the temp file even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Test 7: Combining characters and normalization.
 *
 * Precomposed and decomposed accent sequences written as UTF-8 must come
 * back as valid UTF-8 and pass isValidConversion(). Exact equality is not
 * asserted here.
 */
public function testCombiningCharacters()
{
    $combiningContent = [
        'accents_composed' => "café résumé naïve",
        'accents_decomposed' => "cafe\u{0301} re\u{0301}sume\u{0301} nai\u{0308}ve",
        'mixed_normalization' => "café cafe\u{0301} résumé re\u{0301}sume\u{0301}",
    ];

    foreach ($combiningContent as $name => $content) {
        $tempFile = $this->createTestFile($content, 'UTF-8');

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            // Content should be preserved (normalization might occur but content should be valid)
            $this->assertTrue(
                mb_check_encoding($result, 'UTF-8'),
                "Combining character result should be valid UTF-8 for: {$name}"
            );
            $this->assertTrue(
                $this->isValidConversionMethod->invoke($this->controller, $result),
                "Combining character validation failed for: {$name}"
            );
        } finally {
            // Remove the temp file even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
|
||||
/**
 * Test 8: Large Unicode content performance.
 *
 * Reads a multi-script pattern repeated 1000 times and requires the call to
 * finish in under two seconds, return the content unchanged and pass
 * isValidConversion().
 */
public function testLargeUnicodeContentPerformance()
{
    $unicodePattern = "🌍 Hello 世界 مرحبا Здравствуй שלום こんにちは 안녕하세요 ";
    $largeContent = str_repeat($unicodePattern, 1000); // roughly 90 KB (91 bytes per repetition)

    $tempFile = $this->createTestFile($largeContent, 'UTF-8');

    try {
        // hrtime() is monotonic, so the measurement is not skewed by
        // wall-clock adjustments while the call runs.
        $startedAt = hrtime(true);
        $result = $this->readFileMethod->invoke($this->controller, $tempFile);
        $processingTime = (hrtime(true) - $startedAt) / 1e9; // nanoseconds -> seconds

        $this->assertLessThan(2.0, $processingTime, "Large Unicode content processing should be fast");
        $this->assertEquals($largeContent, $result, "Large Unicode content should be preserved");
        $this->assertTrue(
            $this->isValidConversionMethod->invoke($this->controller, $result),
            "Large Unicode content validation failed"
        );
    } finally {
        // Remove the temp file even when an assertion above fails.
        unlink($tempFile);
    }
}
|
||||
|
||||
/**
 * Test 9: Mixed encoding scenarios.
 *
 * Each business-style sample is written under several encoding labels; the
 * value read back must be valid UTF-8 and pass isValidConversion().
 */
public function testMixedEncodingScenarios()
{
    // Simulate files that might have mixed encoding issues
    $scenarios = [
        'mostly_ascii_with_unicode' => "Regular text with émojis 😀 and symbols ™",
        'csv_with_international' => "Name,Company,Location\n\"José García\",\"Café España\",\"São Paulo\"",
        'business_names' => "McDonald's, L'Oréal, Nestlé, Björk & Co, Müller GmbH",
    ];

    // Test with multiple encodings (the list is the same for every scenario)
    $encodings = ['UTF-8', 'UTF-8-BOM', 'WINDOWS-1252', 'ISO-8859-1'];

    foreach ($scenarios as $name => $content) {
        foreach ($encodings as $encoding) {
            $tempFile = $this->createTestFile($content, $encoding);

            try {
                $result = $this->readFileMethod->invoke($this->controller, $tempFile);

                $this->assertTrue(
                    mb_check_encoding($result, 'UTF-8'),
                    "Mixed encoding result should be valid UTF-8 for {$name} with {$encoding}"
                );
                $this->assertTrue(
                    $this->isValidConversionMethod->invoke($this->controller, $result),
                    "Mixed encoding validation failed for {$name} with {$encoding}"
                );
            } finally {
                // Remove the temp file even when an assertion above fails.
                unlink($tempFile);
            }
        }
    }
}
|
||||
|
||||
/**
 * Helper method: report whether a label names an ASCII-only encoding.
 *
 * Despite the method name, this does not test for ASCII-compatible supersets
 * such as UTF-8 or ISO-8859-1; only 'ASCII' and 'US-ASCII' return true. The
 * comparison ignores letter case.
 *
 * @param string $encoding Encoding label to check
 *
 * @return bool True for 'ASCII' / 'US-ASCII' in any letter case
 */
private function isAsciiCompatibleEncoding(string $encoding): bool
{
    $asciiOnlyEncodings = ['ASCII', 'US-ASCII'];

    return in_array(strtoupper($encoding), $asciiOnlyEncodings, true);
}
|
||||
|
||||
/**
 * Test 10: CSV data with international content.
 *
 * A CSV with Spanish, Chinese, German, Russian and Arabic rows is written
 * under several encoding labels; the value read back must be valid UTF-8 and
 * still contain the Spanish, Chinese and German names.
 */
public function testCSVWithInternationalContent()
{
    $csvContent = "Name,Company,City,Country,Notes\n" .
        "\"José García\",\"Café España\",\"São Paulo\",\"Brasil\",\"Açaí supplier\"\n" .
        "\"李小明\",\"北京科技公司\",\"北京\",\"中国\",\"Technology partner\"\n" .
        "\"Müller\",\"Bäckerei München\",\"München\",\"Deutschland\",\"Café & Bäckerei\"\n" .
        "\"Иванов\",\"Москва ООО\",\"Москва\",\"Россия\",\"Software development\"\n" .
        "\"محمد أحمد\",\"شركة الرياض\",\"الرياض\",\"السعودية\",\"Trading company\"";

    // NOTE(review): the Chinese, Cyrillic and Arabic rows cannot be
    // represented in Windows-1252, so the 'WINDOWS-1252' case presumably
    // depends on how createTestFile() treats that label — confirm.
    $encodings = ['UTF-8', 'UTF-8-BOM', 'WINDOWS-1252'];

    foreach ($encodings as $encoding) {
        $tempFile = $this->createTestFile($csvContent, $encoding);

        try {
            $result = $this->readFileMethod->invoke($this->controller, $tempFile);

            $this->assertTrue(
                mb_check_encoding($result, 'UTF-8'),
                "CSV result should be valid UTF-8 for encoding: {$encoding}"
            );

            // Check that it contains expected international content
            $this->assertStringContainsString("José García", $result, "Should contain Spanish names");
            $this->assertStringContainsString("李小明", $result, "Should contain Chinese names");
            $this->assertStringContainsString("Müller", $result, "Should contain German names");
        } finally {
            // Remove the temp file even when an assertion above fails.
            unlink($tempFile);
        }
    }
}
|
||||
}
|
||||
Loading…
Reference in New Issue