diff --git a/packages/components/nodes/documentloaders/Docx/Docx.png b/packages/components/nodes/documentloaders/Docx/Docx.png
new file mode 100644
index 000000000..6d527bd2d
Binary files /dev/null and b/packages/components/nodes/documentloaders/Docx/Docx.png differ
diff --git a/packages/components/nodes/documentloaders/Docx/Docx.ts b/packages/components/nodes/documentloaders/Docx/Docx.ts
new file mode 100644
index 000000000..bfc859b96
--- /dev/null
+++ b/packages/components/nodes/documentloaders/Docx/Docx.ts
@@ -0,0 +1,72 @@
+import { INode, INodeData, INodeParams } from '../../../src/Interface'
+import { TextSplitter } from 'langchain/text_splitter'
+import { DocxLoader } from 'langchain/document_loaders/fs/docx'
+
+/**
+ * Document-loader node: reads an uploaded DOCX file with langchain's
+ * `DocxLoader` and, when a text splitter is connected, splits the result.
+ */
+class Docx_DocumentLoaders implements INode {
+    label: string
+    name: string
+    description: string
+    type: string
+    icon: string
+    category: string
+    baseClasses: string[]
+    inputs: INodeParams[]
+
+    /** Sets the node's static metadata and declares its two inputs. */
+    constructor() {
+        this.label = 'Docx File'
+        this.name = 'docxFile'
+        this.type = 'Document'
+        this.icon = 'Docx.png'
+        this.category = 'Document Loaders'
+        this.description = `Load data from DOCX files`
+        this.baseClasses = [this.type]
+        this.inputs = [
+            {
+                label: 'Docx File',
+                name: 'docxFile',
+                type: 'file',
+                fileType: '.docx'
+            },
+            {
+                label: 'Text Splitter',
+                name: 'textSplitter',
+                type: 'TextSplitter',
+                optional: true
+            }
+        ]
+    }
+
+    /**
+     * Decodes the uploaded file and loads it through `DocxLoader`.
+     *
+     * @param nodeData - Node data; `inputs.docxFile` and `inputs.textSplitter` are read.
+     * @returns The loader's output, split by `textSplitter` when one is supplied.
+     */
+    async init(nodeData: INodeData): Promise<any> {
+        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
+        const docxFileBase64 = nodeData.inputs?.docxFile as string
+
+        // The value is split on ','; the last segment is dropped and the one
+        // before it is base64-decoded (an absent segment decodes to an empty buffer).
+        // NOTE(review): presumably `<prefix>,<base64>,<trailing segment>` - confirm
+        // the exact upload format against the caller.
+        const splitDataURI = docxFileBase64.split(',')
+        splitDataURI.pop()
+        const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
+
+        const blob = new Blob([bf])
+        const loader = new DocxLoader(blob)
+
+        if (textSplitter) {
+            return loader.loadAndSplit(textSplitter)
+        }
+        return loader.load()
+    }
+}
+
+module.exports = { nodeClass: Docx_DocumentLoaders }
diff --git 
a/packages/components/package.json b/packages/components/package.json index 793356063..1d8af269e 100644 --- a/packages/components/package.json +++ b/packages/components/package.json @@ -28,6 +28,7 @@ "express": "^4.17.3", "form-data": "^4.0.0", "langchain": "^0.0.63", + "mammoth": "^1.5.1", "moment": "^2.29.3", "node-fetch": "2", "pdf-parse": "^1.1.1",