PDF loader: add "Use Legacy Build" option

This commit is contained in:
Henry 2023-06-14 12:34:25 +01:00
parent 92cd760b16
commit 8f4b5ba3f6
2 changed files with 17 additions and 4 deletions

View File

@ -49,6 +49,13 @@ class Pdf_DocumentLoaders implements INode {
],
default: 'perPage'
},
{
label: 'Use Legacy Build',
name: 'legacyBuild',
type: 'boolean',
optional: true,
additionalParams: true
},
{
label: 'Metadata',
name: 'metadata',
@ -64,6 +71,7 @@ class Pdf_DocumentLoaders implements INode {
const pdfFileBase64 = nodeData.inputs?.pdfFile as string
const usage = nodeData.inputs?.usage as string
const metadata = nodeData.inputs?.metadata
const legacyBuild = nodeData.inputs?.legacyBuild as boolean
let alldocs = []
let files: string[] = []
@ -82,7 +90,8 @@ class Pdf_DocumentLoaders implements INode {
const loader = new PDFLoader(new Blob([bf]), {
splitPages: false,
// @ts-ignore
pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js')
pdfjs: () =>
legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js')
})
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
@ -93,7 +102,10 @@ class Pdf_DocumentLoaders implements INode {
}
} else {
// @ts-ignore
const loader = new PDFLoader(new Blob([bf]), { pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') })
const loader = new PDFLoader(new Blob([bf]), {
pdfjs: () =>
legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js')
})
if (textSplitter) {
const docs = await loader.loadAndSplit(textSplitter)
alldocs.push(...docs)

View File

@ -32,15 +32,16 @@
"faiss-node": "^0.2.1",
"form-data": "^4.0.0",
"graphql": "^16.6.0",
"html-to-text": "^9.0.5",
"langchain": "^0.0.94",
"linkifyjs": "^4.1.1",
"mammoth": "^1.5.1",
"moment": "^2.29.3",
"node-fetch": "^2.6.11",
"pdf-parse": "^1.1.1",
"pdfjs-dist": "^3.7.107",
"weaviate-ts-client": "^1.1.0",
"ws": "^8.9.0",
"html-to-text": "^9.0.5"
"ws": "^8.9.0"
},
"devDependencies": {
"@types/gulp": "4.0.9",