From 324bd3a4309565dc2f0fc885dd8e3f390c2d4376 Mon Sep 17 00:00:00 2001 From: Henry Date: Mon, 15 May 2023 20:06:22 +0100 Subject: [PATCH] add fixes to multiple documents --- .../nodes/documentloaders/Csv/Csv.ts | 32 +++++++---- .../nodes/documentloaders/Docx/Docx.ts | 32 +++++++---- .../nodes/documentloaders/Json/Json.ts | 32 +++++++---- .../nodes/documentloaders/Pdf/Pdf.ts | 54 ++++++++++++------- .../nodes/documentloaders/Text/Text.ts | 32 +++++++---- packages/components/src/utils.ts | 24 --------- 6 files changed, 128 insertions(+), 78 deletions(-) diff --git a/packages/components/nodes/documentloaders/Csv/Csv.ts b/packages/components/nodes/documentloaders/Csv/Csv.ts index bcaec79f3..f4b36ad03 100644 --- a/packages/components/nodes/documentloaders/Csv/Csv.ts +++ b/packages/components/nodes/documentloaders/Csv/Csv.ts @@ -1,7 +1,6 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { CSVLoader } from 'langchain/document_loaders/fs/csv' -import { getBlob } from '../../../src/utils' class Csv_DocumentLoaders implements INode { label: string @@ -58,20 +57,35 @@ class Csv_DocumentLoaders implements INode { const columnName = nodeData.inputs?.columnName as string const metadata = nodeData.inputs?.metadata - const blob = new Blob(getBlob(csvFileBase64)) - const loader = new CSVLoader(blob, columnName.trim().length === 0 ? undefined : columnName.trim()) - let docs = [] + let alldocs = [] + let files: string[] = [] - if (textSplitter) { - docs = await loader.loadAndSplit(textSplitter) + if (csvFileBase64.startsWith('[') && csvFileBase64.endsWith(']')) { + files = JSON.parse(csvFileBase64) } else { - docs = await loader.load() + files = [csvFileBase64] + } + + for (const file of files) { + const splitDataURI = file.split(',') + splitDataURI.pop() + const bf = Buffer.from(splitDataURI.pop() || '', 'base64') + const blob = new Blob([bf]) + const loader = new CSVLoader(blob, columnName.trim().length === 0 ? undefined : columnName.trim()) + + if (textSplitter) { + const docs = await loader.loadAndSplit(textSplitter) + alldocs.push(...docs) + } else { + const docs = await loader.load() + alldocs.push(...docs) + } } if (metadata) { const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata) let finaldocs = [] - for (const doc of docs) { + for (const doc of alldocs) { const newdoc = { ...doc, metadata: { @@ -84,7 +98,7 @@ class Csv_DocumentLoaders implements INode { return finaldocs } - return docs + return alldocs } } diff --git a/packages/components/nodes/documentloaders/Docx/Docx.ts b/packages/components/nodes/documentloaders/Docx/Docx.ts index 36dd04651..e27991a51 100644 --- a/packages/components/nodes/documentloaders/Docx/Docx.ts +++ b/packages/components/nodes/documentloaders/Docx/Docx.ts @@ -1,7 +1,6 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { DocxLoader } from 'langchain/document_loaders/fs/docx' -import { getBlob } from '../../../src/utils' class Docx_DocumentLoaders implements INode { label: string @@ -49,20 +48,35 @@ class Docx_DocumentLoaders implements INode { const docxFileBase64 = nodeData.inputs?.docxFile as string const metadata = nodeData.inputs?.metadata - const blob = new Blob(getBlob(docxFileBase64)) - const loader = new DocxLoader(blob) - let docs = [] + let alldocs = [] + let files: string[] = [] - if (textSplitter) { - docs = await loader.loadAndSplit(textSplitter) + if (docxFileBase64.startsWith('[') && docxFileBase64.endsWith(']')) { + files = JSON.parse(docxFileBase64) } else { - docs = await loader.load() + files = [docxFileBase64] + } + + for (const file of files) { + const splitDataURI = file.split(',') + splitDataURI.pop() + const bf = Buffer.from(splitDataURI.pop() || '', 'base64') + const blob = new Blob([bf]) + const loader = new DocxLoader(blob) + + if (textSplitter) { + const docs = await loader.loadAndSplit(textSplitter) + alldocs.push(...docs) + } else { + const docs = await loader.load() + alldocs.push(...docs) + } } if (metadata) { const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata) let finaldocs = [] - for (const doc of docs) { + for (const doc of alldocs) { const newdoc = { ...doc, metadata: { @@ -75,7 +89,7 @@ class Docx_DocumentLoaders implements INode { return finaldocs } - return docs + return alldocs } } diff --git a/packages/components/nodes/documentloaders/Json/Json.ts b/packages/components/nodes/documentloaders/Json/Json.ts index 46f7704d6..9177df5cb 100644 --- a/packages/components/nodes/documentloaders/Json/Json.ts +++ b/packages/components/nodes/documentloaders/Json/Json.ts @@ -1,7 +1,6 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { JSONLoader } from 'langchain/document_loaders/fs/json' -import { getBlob } from '../../../src/utils' class Json_DocumentLoaders implements INode { label: string @@ -64,20 +63,35 @@ class Json_DocumentLoaders implements INode { pointers = outputString.split(',').map((pointer) => '/' + pointer.trim()) } - const blob = new Blob(getBlob(jsonFileBase64)) - const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined) - let docs = [] + let alldocs = [] + let files: string[] = [] - if (textSplitter) { - docs = await loader.loadAndSplit(textSplitter) + if (jsonFileBase64.startsWith('[') && jsonFileBase64.endsWith(']')) { + files = JSON.parse(jsonFileBase64) } else { - docs = await loader.load() + files = [jsonFileBase64] + } + + for (const file of files) { + const splitDataURI = file.split(',') + splitDataURI.pop() + const bf = Buffer.from(splitDataURI.pop() || '', 'base64') + const blob = new Blob([bf]) + const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined) + + if (textSplitter) { + const docs = await loader.loadAndSplit(textSplitter) + alldocs.push(...docs) + } else { + const docs = await loader.load() + alldocs.push(...docs) + } } if (metadata) { const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata) let finaldocs = [] - for (const doc of docs) { + for (const doc of alldocs) { const newdoc = { ...doc, metadata: { @@ -90,7 +104,7 @@ class Json_DocumentLoaders implements INode { return finaldocs } - return docs + return alldocs } } diff --git a/packages/components/nodes/documentloaders/Pdf/Pdf.ts b/packages/components/nodes/documentloaders/Pdf/Pdf.ts index c27f78091..bc36f8cb5 100644 --- a/packages/components/nodes/documentloaders/Pdf/Pdf.ts +++ b/packages/components/nodes/documentloaders/Pdf/Pdf.ts @@ -1,7 +1,6 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { PDFLoader } from 'langchain/document_loaders/fs/pdf' -import { getBlob } from '../../../src/utils' class Pdf_DocumentLoaders implements INode { label: string @@ -66,30 +65,49 @@ class Pdf_DocumentLoaders implements INode { const usage = nodeData.inputs?.usage as string const metadata = nodeData.inputs?.metadata - const blob = new Blob(getBlob(pdfFileBase64)) - let docs = [] - if (usage === 'perFile') { - // @ts-ignore - const loader = new PDFLoader(blob, { splitPages: false, pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }) - if (textSplitter) { - docs = await loader.loadAndSplit(textSplitter) - } else { - docs = await loader.load() - } + let alldocs = [] + let files: string[] = [] + + if (pdfFileBase64.startsWith('[') && pdfFileBase64.endsWith(']')) { + files = JSON.parse(pdfFileBase64) } else { - // @ts-ignore - const loader = new PDFLoader(blob, { pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }) - if (textSplitter) { - docs = await loader.loadAndSplit(textSplitter) + files = [pdfFileBase64] + } + + for (const file of files) { + const splitDataURI = file.split(',') + splitDataURI.pop() + const bf = Buffer.from(splitDataURI.pop() || '', 'base64') + if (usage === 'perFile') { + const loader = new PDFLoader(new Blob([bf]), { + splitPages: false, + // @ts-ignore + pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') + }) + if (textSplitter) { + const docs = await loader.loadAndSplit(textSplitter) + alldocs.push(...docs) + } else { + const docs = await loader.load() + alldocs.push(...docs) + } } else { - docs = await loader.load() + // @ts-ignore + const loader = new PDFLoader(new Blob([bf]), { pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }) + if (textSplitter) { + const docs = await loader.loadAndSplit(textSplitter) + alldocs.push(...docs) + } else { + const docs = await loader.load() + alldocs.push(...docs) + } } } if (metadata) { const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata) let finaldocs = [] - for (const doc of docs) { + for (const doc of alldocs) { const newdoc = { ...doc, metadata: { @@ -102,7 +120,7 @@ class Pdf_DocumentLoaders implements INode { return finaldocs } - return docs + return alldocs } } diff --git a/packages/components/nodes/documentloaders/Text/Text.ts b/packages/components/nodes/documentloaders/Text/Text.ts index 466c45200..63e7e0e26 100644 --- a/packages/components/nodes/documentloaders/Text/Text.ts +++ b/packages/components/nodes/documentloaders/Text/Text.ts @@ -1,7 +1,6 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { TextLoader } from 'langchain/document_loaders/fs/text' -import { getBlob } from '../../../src/utils' class Text_DocumentLoaders implements INode { label: string @@ -49,20 +48,35 @@ class Text_DocumentLoaders implements INode { const txtFileBase64 = nodeData.inputs?.txtFile as string const metadata = nodeData.inputs?.metadata - const blob = new Blob(getBlob(txtFileBase64)) - const loader = new TextLoader(blob) - let docs = [] + let alldocs = [] + let files: string[] = [] - if (textSplitter) { - docs = await loader.loadAndSplit(textSplitter) + if (txtFileBase64.startsWith('[') && txtFileBase64.endsWith(']')) { + files = JSON.parse(txtFileBase64) } else { - docs = await loader.load() + files = [txtFileBase64] + } + + for (const file of files) { + const splitDataURI = file.split(',') + splitDataURI.pop() + const bf = Buffer.from(splitDataURI.pop() || '', 'base64') + const blob = new Blob([bf]) + const loader = new TextLoader(blob) + + if (textSplitter) { + const docs = await loader.loadAndSplit(textSplitter) + alldocs.push(...docs) + } else { + const docs = await loader.load() + alldocs.push(...docs) + } } if (metadata) { const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata) let finaldocs = [] - for (const doc of docs) { + for (const doc of alldocs) { const newdoc = { ...doc, metadata: { @@ -74,7 +88,7 @@ class Text_DocumentLoaders implements INode { } return finaldocs } - return docs + return alldocs } } diff --git a/packages/components/src/utils.ts b/packages/components/src/utils.ts index a2a415254..7a2a4d25d 100644 --- a/packages/components/src/utils.ts +++ b/packages/components/src/utils.ts @@ -149,27 +149,3 @@ export const getInputVariables = (paramValue: string): string[] => { } return inputVariables } - -/** - * Get blob - * @param {string} fileBase64Str - * @returns {Buffer[]} - */ -export const getBlob = (fileBase64Str: string) => { - let bufferArray: Buffer[] = [] - let files: string[] = [] - - if (fileBase64Str.startsWith('[') && fileBase64Str.endsWith(']')) { - files = JSON.parse(fileBase64Str) - } else { - files = [fileBase64Str] - } - - for (const file of files) { - const splitDataURI = file.split(',') - splitDataURI.pop() - const bf = Buffer.from(splitDataURI.pop() || '', 'base64') - bufferArray.push(bf) - } - return bufferArray -}