user()->company()->getLocale()); // Create a reference $hash = Str::random(32); $data = [ 'hash' => $hash, 'mappings' => [], ]; /** @var UploadedFile $file */ foreach ($request->files->get('files') as $entityType => $file) { $contents = $this->readFileWithProperEncoding($file->getPathname()); Cache::put($hash.'-'.$entityType, base64_encode($contents), 1200); // Parse CSV $csv_array = $this->getCsvData($contents); $class_map = $this->getEntityMap($entityType); $hints = $this->setImportHints($entityType, $class_map::importable(), $csv_array[0]); $data['mappings'][$entityType] = [ 'available' => $class_map::importable(), 'headers' => array_slice($csv_array, 0, 2), 'hints' => $hints, ]; } return response()->json($data); } private function readFileWithProperEncoding(string $filePath): string { // First, read the file and check if it's already clean UTF-8 $contents = @file_get_contents($filePath); if ($contents === false) { return ''; } // Check for different UTF BOMs and handle accordingly $bomResult = $this->detectAndHandleUTFEncoding($contents); if ($bomResult !== null) { return $bomResult; } // Remove BOM if present (for UTF-8 BOM) $contents = $this->removeBOM($contents); // Check if it's clean UTF-8 first (no conversion needed) if (mb_check_encoding($contents, 'UTF-8') && $this->isValidConversion($contents)) { return $contents; } // Method 1: Try reading with explicit Windows-1252 context $context = stream_context_create([ 'file' => [ 'encoding' => 'WINDOWS-1252' ] ]); $contextContents = @file_get_contents($filePath, false, $context); if ($contextContents !== false) { $contextContents = $this->removeBOM($contextContents); $converted = mb_convert_encoding($contextContents, 'UTF-8', 'WINDOWS-1252'); if ($this->isValidConversion($converted)) { return $converted; } } // Method 2: Binary read with forced Windows-1252 conversion $handle = @fopen($filePath, 'rb'); if ($handle) { $binaryContents = fread($handle, filesize($filePath)); fclose($handle); $binaryContents = $this->removeBOM($binaryContents); // Check if this looks like Windows-1252 by looking for problem bytes if ($this->containsWindows1252Bytes($binaryContents)) { $converted = mb_convert_encoding($binaryContents, 'UTF-8', 'WINDOWS-1252'); if ($this->isValidConversion($converted)) { return $converted; } } } // Method 3: Fix corrupted UTF-8 replacement characters if ($contents !== false) { $fixed = $this->fixCorruptedWindows1252($contents); if ($this->isValidConversion($fixed)) { return $fixed; } } // Method 4: Try different encoding auto-detection with broader list if ($contents !== false) { $encodings = ['WINDOWS-1252', 'ISO-8859-1', 'ISO-8859-15', 'CP1252']; foreach ($encodings as $encoding) { $converted = mb_convert_encoding($contents, 'UTF-8', $encoding); if ($this->isValidConversion($converted)) { return $converted; } } } // Fallback: return original contents return $contents ?: ''; } /** * Detect and handle UTF-16 and UTF-32 encodings based on BOM */ private function detectAndHandleUTFEncoding(string $data): ?string { // UTF-32 BE BOM if (substr($data, 0, 4) === "\x00\x00\xFE\xFF") { $withoutBOM = substr($data, 4); return mb_convert_encoding($withoutBOM, 'UTF-8', 'UTF-32BE'); } // UTF-32 LE BOM if (substr($data, 0, 4) === "\xFF\xFE\x00\x00") { $withoutBOM = substr($data, 4); return mb_convert_encoding($withoutBOM, 'UTF-8', 'UTF-32LE'); } // UTF-16 BE BOM if (substr($data, 0, 2) === "\xFE\xFF") { $withoutBOM = substr($data, 2); return mb_convert_encoding($withoutBOM, 'UTF-8', 'UTF-16BE'); } // UTF-16 LE BOM if (substr($data, 0, 2) === "\xFF\xFE") { $withoutBOM = substr($data, 2); return mb_convert_encoding($withoutBOM, 'UTF-8', 'UTF-16LE'); } // Try to detect UTF-16/32 without BOM (heuristic approach) $length = strlen($data); // UTF-32 detection (every 4th byte pattern) if ($length >= 8 && $length % 4 === 0) { $nullCount = 0; for ($i = 0; $i < min(100, $length); $i += 4) { if ($data[$i] === "\x00" && $data[$i + 1] === "\x00" && $data[$i + 2] === "\x00") { $nullCount++; } } if ($nullCount > 5) { // Likely UTF-32LE return mb_convert_encoding($data, 'UTF-8', 'UTF-32LE'); } } // UTF-16 detection (every 2nd byte pattern) if ($length >= 4 && $length % 2 === 0) { $nullCount = 0; for ($i = 0; $i < min(100, $length); $i += 2) { if ($data[$i + 1] === "\x00") { $nullCount++; } } if ($nullCount > 10) { // Likely UTF-16LE return mb_convert_encoding($data, 'UTF-8', 'UTF-16LE'); } // Check for UTF-16BE $nullCount = 0; for ($i = 0; $i < min(100, $length); $i += 2) { if ($data[$i] === "\x00") { $nullCount++; } } if ($nullCount > 10) { // Likely UTF-16BE return mb_convert_encoding($data, 'UTF-8', 'UTF-16BE'); } } return null; } /** * Remove BOM (Byte Order Mark) from the beginning of a string */ private function removeBOM(string $data): string { // UTF-8 BOM if (substr($data, 0, 3) === "\xEF\xBB\xBF") { return substr($data, 3); } // UTF-16 BE BOM if (substr($data, 0, 2) === "\xFE\xFF") { return substr($data, 2); } // UTF-16 LE BOM if (substr($data, 0, 2) === "\xFF\xFE") { return substr($data, 2); } // UTF-32 BE BOM if (substr($data, 0, 4) === "\x00\x00\xFE\xFF") { return substr($data, 4); } // UTF-32 LE BOM if (substr($data, 0, 4) === "\xFF\xFE\x00\x00") { return substr($data, 4); } return $data; } private function containsWindows1252Bytes(string $data): bool { // Check for Windows-1252 specific bytes in 0x80-0x9F range $windows1252Bytes = [0x80, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8E, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9E, 0x9F]; foreach ($windows1252Bytes as $byte) { if (strpos($data, chr($byte)) !== false) { return true; } } return false; } private function fixCorruptedWindows1252(string $data): string { // Map of UTF-8 replacement sequences back to proper characters $replacements = [ "\xEF\xBF\xBD" => "\u{2019}", // Most common: right single quote (0x92) - use smart quote // Add more mappings as needed based on your data ]; return str_replace(array_keys($replacements), array_values($replacements), $data); } private function isValidConversion(string $data): bool { // Check if conversion was successful: // 1. Must be valid UTF-8 // 2. Must NOT contain replacement characters (indicating corruption) // 3. Additional check for double-encoded replacement return mb_check_encoding($data, 'UTF-8') && !str_contains($data, "\xEF\xBF\xBD") && // UTF-8 replacement character bytes !str_contains($data, '�'); // Double-encoded replacement character } private function setImportHints($entity_type, $available_keys, $headers): array { $hints = []; $translated_keys = collect($available_keys)->map(function ($value, $key) { $parts = explode(".", $value); $index = $parts[0]; $label = $parts[1] ?? $parts[0]; return ['key' => $key, 'index' => ctrans("texts.{$index}"), 'label' => ctrans("texts.{$label}")]; })->toArray(); //Exact string match foreach ($headers as $key => $value) { foreach ($translated_keys as $tkey => $tvalue) { $concat_needle = str_ireplace(" ", "", $tvalue['index'].$tvalue['label']); $concat_value = str_ireplace(" ", "", $value); if ($this->testMatch($concat_value, $concat_needle)) { $hit = $tvalue['key']; $hints[$key] = $hit; unset($translated_keys[$tkey]); break; } else { $hints[$key] = null; } } } //Label Match foreach ($headers as $key => $value) { if (isset($hints[$key])) { continue; } foreach ($translated_keys as $tkey => $tvalue) { if ($this->testMatch($value, $tvalue['label'])) { $hit = $tvalue['key']; $hints[$key] = $hit; unset($translated_keys[$tkey]); break; } else { $hints[$key] = null; } } } //Index matching pass using the index of the translation here foreach ($headers as $key => $value) { if (isset($hints[$key])) { continue; } foreach ($translated_keys as $tkey => $tvalue) { if ($this->testMatch($value, $tvalue['index'])) { $hit = $tvalue['key']; $hints[$key] = $hit; unset($translated_keys[$tkey]); break; } else { $hints[$key] = null; } } } return $hints; } private function testMatch($haystack, $needle): bool { return stripos($haystack, $needle) !== false; } public function import(ImportRequest $request) { /** @var \App\Models\User $user */ $user = auth()->user(); $data = $request->all(); if (empty($data['hash'])) { // Create a reference $data['hash'] = $hash = Str::random(32); /** @var UploadedFile $file */ foreach ($request->files->get('files') as $entityType => $file) { // $contents = file_get_contents($file->getPathname()); $contents = $this->readFileWithProperEncoding($file->getPathname()); // Store the csv in cache with an expiry of 10 minutes Cache::put($hash.'-'.$entityType, base64_encode($contents), 600); nlog($hash.'-'.$entityType); } } unset($data['files']); CSVIngest::dispatch($data, $user->company()); return response()->json(['message' => ctrans('texts.import_started')], 200); } private function getEntityMap($entity_type) { return sprintf('App\\Import\\Definitions\%sMap', ucfirst(Str::camel($entity_type))); } private function getCsvData($csvfile) { if (! ini_get('auto_detect_line_endings')) { ini_set('auto_detect_line_endings', '1'); } $csv = Reader::createFromString($csvfile); $csvdelimiter = self::detectDelimiter($csvfile); $csv->setDelimiter($csvdelimiter); $stmt = new Statement(); $data = iterator_to_array($stmt->process($csv)); if (count($data) > 0) { $headers = $data[0]; // Remove Invoice Ninja headers if (count($headers) && count($data) > 4) { $firstCell = $headers[0]; if (strstr($firstCell, (string) config('ninja.app_name'))) { array_shift($data); // Invoice Ninja... array_shift($data); // array_shift($data); // Entity Type Header } } } return $data; // Remove the convertData call since we fixed encoding upfront } /** * Returns the best delimiter * * @param string $csvfile * @return string */ public function detectDelimiter($csvfile): string { $delimiters = [',', '.', ';', '|']; $bestDelimiter = ','; $count = 0; // 10-01-2024 - A better way to resolve the csv file delimiter. $csvfile = substr($csvfile, 0, strpos($csvfile, "\n")); foreach ($delimiters as $delimiter) { if (substr_count(strstr($csvfile, "\n", true), $delimiter) >= $count) { $count = substr_count($csvfile, $delimiter); $bestDelimiter = $delimiter; } } /** @phpstan-ignore-next-line **/ return $bestDelimiter ?? ','; } }