diff --git a/packages/components/nodes/documentloaders/Unstructured/Unstructured.ts b/packages/components/nodes/documentloaders/Unstructured/Unstructured.ts
new file mode 100644
index 000000000..ba081ab24
--- /dev/null
+++ b/packages/components/nodes/documentloaders/Unstructured/Unstructured.ts
@@ -0,0 +1,208 @@
+import {
+    HiResModelName,
+    SkipInferTableTypes,
+    UnstructuredLoaderOptions,
+    UnstructuredLoaderStrategy
+} from 'langchain/document_loaders/fs/unstructured'
+import { BaseDocumentLoader } from 'langchain/document_loaders/base'
+import { StringWithAutocomplete } from 'langchain/dist/util/types'
+import { Document } from '@langchain/core/documents'
+
+/**
+ * Set the chunking_strategy to chunk text into larger or smaller elements. Defaults to None with optional arg of by_title
+ */
+type ChunkingStrategy = 'None' | 'by_title'
+
+/**
+ * Represents an element returned by the Unstructured API. It has
+ * properties for the element type, text content, and metadata.
+ */
+type Element = {
+    type: string
+    text: string
+    // this is purposefully loosely typed
+    metadata: {
+        [key: string]: unknown
+    }
+}
+
+/**
+ * Document loader that sends an in-memory file buffer to the Unstructured
+ * partitioning API and converts the returned elements into documents.
+ * Only buffer input is supported: use `loadAndSplitBuffer()`; `load()` always rejects.
+ */
+export class UnstructuredLoader extends BaseDocumentLoader {
+    // Not assigned anywhere in this class; do not rely on it holding a path.
+    public filePath: string
+
+    private apiUrl = 'https://api.unstructured.io/general/v0/general'
+
+    private apiKey?: string
+
+    private strategy: StringWithAutocomplete<UnstructuredLoaderStrategy> = 'hi_res'
+
+    private encoding?: string
+
+    private ocrLanguages: Array<string> = []
+
+    private coordinates?: boolean
+
+    private pdfInferTableStructure?: boolean
+
+    private xmlKeepTags?: boolean
+
+    private skipInferTableTypes?: Array<StringWithAutocomplete<SkipInferTableTypes>>
+
+    private hiResModelName?: StringWithAutocomplete<HiResModelName>
+
+    private includePageBreaks?: boolean
+
+    private chunkingStrategy?: StringWithAutocomplete<ChunkingStrategy>
+
+    private multiPageSections?: boolean
+
+    private combineUnderNChars?: number
+
+    private newAfterNChars?: number
+
+    private maxCharacters?: number
+
+    /**
+     * @param optionsOrLegacyFilePath Loader options. Unset `apiUrl`, `strategy` and
+     * `ocrLanguages` fall back to the defaults declared on the class.
+     */
+    constructor(optionsOrLegacyFilePath: UnstructuredLoaderOptions) {
+        super()
+
+        const options = optionsOrLegacyFilePath
+        this.apiKey = options.apiKey
+        this.apiUrl = options.apiUrl ?? this.apiUrl
+        this.strategy = options.strategy ?? this.strategy
+        this.encoding = options.encoding
+        this.ocrLanguages = options.ocrLanguages ?? this.ocrLanguages
+        this.coordinates = options.coordinates
+        this.pdfInferTableStructure = options.pdfInferTableStructure
+        this.xmlKeepTags = options.xmlKeepTags
+        this.skipInferTableTypes = options.skipInferTableTypes
+        this.hiResModelName = options.hiResModelName
+        this.includePageBreaks = options.includePageBreaks
+        this.chunkingStrategy = options.chunkingStrategy
+        this.multiPageSections = options.multiPageSections
+        this.combineUnderNChars = options.combineUnderNChars
+        this.newAfterNChars = options.newAfterNChars
+        this.maxCharacters = options.maxCharacters
+    }
+
+    /**
+     * Posts the buffer as multipart form data to the configured API URL.
+     *
+     * @param buffer Raw file contents to upload.
+     * @param fileName Name sent with the upload and used in error messages.
+     * @returns The response elements whose `text` is a string.
+     * @throws Error if the response status is not OK or the body is not an array.
+     */
+    async _partition(buffer: Buffer, fileName: string): Promise<Element[]> {
+        const formData = new FormData()
+        formData.append('files', new Blob([buffer]), fileName)
+        formData.append('strategy', this.strategy)
+        for (const language of this.ocrLanguages) {
+            formData.append('ocr_languages', language)
+        }
+        if (this.encoding) {
+            formData.append('encoding', this.encoding)
+        }
+        if (this.coordinates === true) {
+            formData.append('coordinates', 'true')
+        }
+        if (this.pdfInferTableStructure === true) {
+            formData.append('pdf_infer_table_structure', 'true')
+        }
+        if (this.xmlKeepTags === true) {
+            formData.append('xml_keep_tags', 'true')
+        }
+        if (this.skipInferTableTypes) {
+            formData.append('skip_infer_table_types', JSON.stringify(this.skipInferTableTypes))
+        }
+        if (this.hiResModelName) {
+            formData.append('hi_res_model_name', this.hiResModelName)
+        }
+        if (this.includePageBreaks) {
+            formData.append('include_page_breaks', 'true')
+        }
+        if (this.chunkingStrategy) {
+            formData.append('chunking_strategy', this.chunkingStrategy)
+        }
+        if (this.multiPageSections !== undefined) {
+            formData.append('multipage_sections', this.multiPageSections ? 'true' : 'false')
+        }
+        if (this.combineUnderNChars !== undefined) {
+            formData.append('combine_under_n_chars', String(this.combineUnderNChars))
+        }
+        if (this.newAfterNChars !== undefined) {
+            formData.append('new_after_n_chars', String(this.newAfterNChars))
+        }
+        if (this.maxCharacters !== undefined) {
+            formData.append('max_characters', String(this.maxCharacters))
+        }
+
+        const headers = {
+            'UNSTRUCTURED-API-KEY': this.apiKey ?? ''
+        }
+
+        const response = await fetch(this.apiUrl, {
+            method: 'POST',
+            body: formData,
+            headers
+        })
+
+        if (!response.ok) {
+            // Report the uploaded file's name: `this.filePath` is never set on this loader.
+            throw new Error(`Failed to partition file ${fileName} with error ${response.status} and message ${await response.text()}`)
+        }
+
+        const elements = await response.json()
+        if (!Array.isArray(elements)) {
+            // Serialize the payload; interpolating an object directly prints "[object Object]".
+            throw new Error(`Expected partitioning request to return an array, but got ${JSON.stringify(elements)}`)
+        }
+        return elements.filter((el) => typeof el.text === 'string') as Element[]
+    }
+
+    /**
+     * Partitions the buffer and wraps each returned element in a `Document`.
+     *
+     * @param buffer Raw file contents.
+     * @param fileName Name of the file the buffer came from.
+     * @returns One document per element; the element `type` is stored as `metadata.category`.
+     */
+    async loadAndSplitBuffer(buffer: Buffer, fileName: string): Promise<Document[]> {
+        const elements = await this._partition(buffer, fileName)
+
+        const documents: Document[] = []
+        for (const element of elements) {
+            const { metadata, text } = element
+            if (typeof text === 'string') {
+                documents.push(
+                    new Document({
+                        pageContent: text,
+                        metadata: {
+                            ...metadata,
+                            category: element.type
+                        }
+                    })
+                )
+            }
+        }
+
+        return documents
+    }
+
+    /**
+     * Not supported: this loader only accepts in-memory buffers.
+     *
+     * @throws Error always; call `loadAndSplitBuffer()` instead.
+     */
+    async load(): Promise<Document[]> {
+        throw new Error('load() is not supported for UnstructuredLoader. Use loadAndSplitBuffer() instead.')
+    }
+}
diff --git a/packages/components/nodes/documentloaders/Unstructured/UnstructuredFile.ts b/packages/components/nodes/documentloaders/Unstructured/UnstructuredFile.ts
index e935893bc..fa9b2b4a1 100644
--- a/packages/components/nodes/documentloaders/Unstructured/UnstructuredFile.ts
+++ b/packages/components/nodes/documentloaders/Unstructured/UnstructuredFile.ts
@@ -1,12 +1,14 @@
 import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
 import {
-    UnstructuredLoader,
     UnstructuredLoaderOptions,
     UnstructuredLoaderStrategy,
     SkipInferTableTypes,
-    HiResModelName
+    HiResModelName,
+    UnstructuredLoader as LCUnstructuredLoader
 } from 'langchain/document_loaders/fs/unstructured'
 import { getCredentialData, getCredentialParam } from '../../../src/utils'
+import { getFileFromStorage } from '../../../src'
+import { UnstructuredLoader } from './Unstructured'
 
 class UnstructuredFile_DocumentLoaders implements INode {
     label: string
@@ -23,7 +25,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
     constructor() {
         this.label = 'Unstructured File Loader'
         this.name = 'unstructuredFileLoader'
-        this.version = 2.0
+        this.version = 3.0
         this.type = 'Document'
         this.icon = 'unstructured-file.svg'
         this.category = 'Document Loaders'
@@ -41,7 +43,18 @@ class UnstructuredFile_DocumentLoaders implements INode {
                 label: 'File Path',
                 name: 'filePath',
                 type: 'string',
-                placeholder: ''
+                placeholder: '',
+                optional: true,
+                warning:
+                    'Use the File Upload instead of File path. If file is uploaded, this path is ignored. Path will be deprecated in future releases.'
+            },
+            {
+                label: 'Files Upload',
+                name: 'fileObject',
+                type: 'file',
+                description: 'Files to be processed. Multiple files can be uploaded.',
+                fileType:
+                    '.txt, .text, .pdf, .docx, .doc, .jpg, .jpeg, .eml, .html, .htm, .md, .pptx, .ppt, .msg, .rtf, .xlsx, .xls, .odt, .epub'
             },
             {
                 label: 'Unstructured API URL',
@@ -416,6 +429,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
         const combineUnderNChars = nodeData.inputs?.combineUnderNChars as number
         const newAfterNChars = nodeData.inputs?.newAfterNChars as number
         const maxCharacters = nodeData.inputs?.maxCharacters as number
+        const fileBase64 = nodeData.inputs?.fileObject as string
 
         const obj: UnstructuredLoaderOptions = {
             apiUrl: unstructuredAPIUrl,
@@ -438,8 +452,48 @@ class UnstructuredFile_DocumentLoaders implements INode {
         const unstructuredAPIKey = getCredentialParam('unstructuredAPIKey', credentialData, nodeData)
         if (unstructuredAPIKey) obj.apiKey = unstructuredAPIKey
 
-        const loader = new UnstructuredLoader(filePath, obj)
-        let docs = await loader.load()
+        let docs: any[] = []
+        let files: string[] = []
+
+        if (fileBase64) {
+            const loader = new UnstructuredLoader(obj)
+            //FILE-STORAGE::["CONTRIBUTING.md","LICENSE.md","README.md"]
+            if (fileBase64.startsWith('FILE-STORAGE::')) {
+                const fileName = fileBase64.replace('FILE-STORAGE::', '')
+                if (fileName.startsWith('[') && fileName.endsWith(']')) {
+                    files = JSON.parse(fileName)
+                } else {
+                    files = [fileName]
+                }
+                const chatflowid = options.chatflowid
+
+                for (const file of files) {
+                    const fileData = await getFileFromStorage(file, chatflowid)
+                    const loaderDocs = await loader.loadAndSplitBuffer(fileData, file)
+                    docs.push(...loaderDocs)
+                }
+            } else {
+                if (fileBase64.startsWith('[') && fileBase64.endsWith(']')) {
+                    files = JSON.parse(fileBase64)
+                } else {
+                    files = [fileBase64]
+                }
+
+                for (const file of files) {
+                    const splitDataURI = file.split(',')
+                    const filename = splitDataURI.pop()?.split(':')[1] ?? ''
+                    const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
+                    const loaderDocs = await loader.loadAndSplitBuffer(bf, filename)
+                    docs.push(...loaderDocs)
+                }
+            }
+        } else if (filePath) {
+            const loader = new LCUnstructuredLoader(filePath, obj)
+            const loaderDocs = await loader.load()
+            docs.push(...loaderDocs)
+        } else {
+            throw new Error('File path or File upload is required')
+        }
 
         if (metadata) {
             const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
diff --git a/packages/components/src/Interface.ts b/packages/components/src/Interface.ts
index ce7b752de..d7aa90003 100644
--- a/packages/components/src/Interface.ts
+++ b/packages/components/src/Interface.ts
@@ -104,6 +104,7 @@ export interface INodeProperties {
     description?: string
     filePath?: string
     badge?: string
+    deprecateMessage?: string
 }
 
 export interface INode extends INodeProperties {
diff --git a/packages/ui/src/views/canvas/CanvasNode.jsx b/packages/ui/src/views/canvas/CanvasNode.jsx
index c7ce2f59a..3ba524431 100644
--- a/packages/ui/src/views/canvas/CanvasNode.jsx
+++ b/packages/ui/src/views/canvas/CanvasNode.jsx
@@ -66,7 +66,10 @@ const CanvasNode = ({ data }) => {
             } else if (data.version && componentNode.version > data.version) {
                 setWarningMessage(nodeOutdatedMessage(data.version, componentNode.version))
             } else if (componentNode.badge === 'DEPRECATING') {
-                setWarningMessage('This node will be deprecated in the next release. Change to a new node tagged with NEW')
+                setWarningMessage(
+                    componentNode?.deprecateMessage ??
+                        'This node will be deprecated in the next release. Change to a new node tagged with NEW'
+                )
             }
         }
     }, [canvas.componentNodes, data.name, data.version])