From 8f4b5ba3f62463a0fa1c8a23f98b6b2242d2665f Mon Sep 17 00:00:00 2001 From: Henry Date: Wed, 14 Jun 2023 12:34:25 +0100 Subject: [PATCH] pdf loader add legacy option --- .../components/nodes/documentloaders/Pdf/Pdf.ts | 16 ++++++++++++++-- packages/components/package.json | 5 +++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/packages/components/nodes/documentloaders/Pdf/Pdf.ts b/packages/components/nodes/documentloaders/Pdf/Pdf.ts index bc36f8cb5..5bee0e65a 100644 --- a/packages/components/nodes/documentloaders/Pdf/Pdf.ts +++ b/packages/components/nodes/documentloaders/Pdf/Pdf.ts @@ -49,6 +49,13 @@ class Pdf_DocumentLoaders implements INode { ], default: 'perPage' }, + { + label: 'Use Legacy Build', + name: 'legacyBuild', + type: 'boolean', + optional: true, + additionalParams: true + }, { label: 'Metadata', name: 'metadata', @@ -64,6 +71,7 @@ class Pdf_DocumentLoaders implements INode { const pdfFileBase64 = nodeData.inputs?.pdfFile as string const usage = nodeData.inputs?.usage as string const metadata = nodeData.inputs?.metadata + const legacyBuild = nodeData.inputs?.legacyBuild as boolean let alldocs = [] let files: string[] = [] @@ -82,7 +90,8 @@ class Pdf_DocumentLoaders implements INode { const loader = new PDFLoader(new Blob([bf]), { splitPages: false, // @ts-ignore - pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') + pdfjs: () => + legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }) if (textSplitter) { const docs = await loader.loadAndSplit(textSplitter) @@ -93,7 +102,10 @@ class Pdf_DocumentLoaders implements INode { } } else { // @ts-ignore - const loader = new PDFLoader(new Blob([bf]), { pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }) + const loader = new PDFLoader(new Blob([bf]), { + pdfjs: () => + legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') + }) if (textSplitter) { const docs = await loader.loadAndSplit(textSplitter) alldocs.push(...docs) diff --git a/packages/components/package.json b/packages/components/package.json index 207d3e897..07275b08e 100644 --- a/packages/components/package.json +++ b/packages/components/package.json @@ -32,15 +32,16 @@ "faiss-node": "^0.2.1", "form-data": "^4.0.0", "graphql": "^16.6.0", + "html-to-text": "^9.0.5", "langchain": "^0.0.94", "linkifyjs": "^4.1.1", "mammoth": "^1.5.1", "moment": "^2.29.3", "node-fetch": "^2.6.11", "pdf-parse": "^1.1.1", + "pdfjs-dist": "^3.7.107", "weaviate-ts-client": "^1.1.0", - "ws": "^8.9.0", - "html-to-text": "^9.0.5" + "ws": "^8.9.0" }, "devDependencies": { "@types/gulp": "4.0.9",