FEATURE: Adding File Upload to Unstructured Loader (#2304)
* initial commit * updates to loader to support file upload * updates to loader to support file upload * update unstructured file --------- Co-authored-by: Henry <hzj94@hotmail.com>
This commit is contained in:
parent
e71266de87
commit
d5a97060e2
|
|
@ -0,0 +1,176 @@
|
||||||
|
import {
|
||||||
|
HiResModelName,
|
||||||
|
SkipInferTableTypes,
|
||||||
|
UnstructuredLoaderOptions,
|
||||||
|
UnstructuredLoaderStrategy
|
||||||
|
} from 'langchain/document_loaders/fs/unstructured'
|
||||||
|
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
|
||||||
|
import { StringWithAutocomplete } from 'langchain/dist/util/types'
|
||||||
|
import { Document } from '@langchain/core/documents'
|
||||||
|
|
||||||
|
/**
 * Allowed values for the chunking strategy, which controls whether text is
 * chunked into larger or smaller elements. Defaults to 'None'; the optional
 * alternative is 'by_title'.
 */
type ChunkingStrategy = 'None' | 'by_title'
|
||||||
|
|
||||||
|
/**
 * One element of an Unstructured API response: the element type, its text
 * content, and the metadata attached to it.
 */
type Element = {
    type: string
    text: string
    // Deliberately loose: metadata keys and value types are not constrained here
    metadata: Record<string, unknown>
}
|
||||||
|
|
||||||
|
/**
 * Document loader that POSTs an in-memory file to an Unstructured partitioning
 * endpoint and converts the returned elements into `Document` instances.
 *
 * Files are supplied as buffers, so `load()` is unsupported and always
 * rejects; use `loadAndSplitBuffer()` instead.
 */
export class UnstructuredLoader extends BaseDocumentLoader {
    // NOTE(review): nothing in this class assigns filePath; presumably it is kept
    // for parity with the path-based loader - confirm before relying on it.
    public filePath: string

    private apiUrl = 'https://api.unstructured.io/general/v0/general'

    private apiKey?: string

    private strategy: StringWithAutocomplete<UnstructuredLoaderStrategy> = 'hi_res'

    private encoding?: string

    private ocrLanguages: Array<string> = []

    private coordinates?: boolean

    private pdfInferTableStructure?: boolean

    private xmlKeepTags?: boolean

    private skipInferTableTypes?: Array<StringWithAutocomplete<SkipInferTableTypes>>

    private hiResModelName?: StringWithAutocomplete<HiResModelName>

    private includePageBreaks?: boolean

    private chunkingStrategy?: StringWithAutocomplete<ChunkingStrategy>

    private multiPageSections?: boolean

    private combineUnderNChars?: number

    private newAfterNChars?: number

    private maxCharacters?: number

    /**
     * Copies the supplied options onto the loader. `apiUrl`, `strategy` and
     * `ocrLanguages` keep their field defaults when the option is null or
     * undefined; every other option is stored as given.
     *
     * @param optionsOrLegacyFilePath Loader options (API location, credentials and partitioning settings).
     */
    constructor(optionsOrLegacyFilePath: UnstructuredLoaderOptions) {
        super()

        const options = optionsOrLegacyFilePath
        this.apiKey = options.apiKey
        this.apiUrl = options.apiUrl ?? this.apiUrl
        this.strategy = options.strategy ?? this.strategy
        this.encoding = options.encoding
        this.ocrLanguages = options.ocrLanguages ?? this.ocrLanguages
        this.coordinates = options.coordinates
        this.pdfInferTableStructure = options.pdfInferTableStructure
        this.xmlKeepTags = options.xmlKeepTags
        this.skipInferTableTypes = options.skipInferTableTypes
        this.hiResModelName = options.hiResModelName
        this.includePageBreaks = options.includePageBreaks
        this.chunkingStrategy = options.chunkingStrategy
        this.multiPageSections = options.multiPageSections
        this.combineUnderNChars = options.combineUnderNChars
        this.newAfterNChars = options.newAfterNChars
        this.maxCharacters = options.maxCharacters
    }

    /**
     * Builds the multipart form body for a partitioning request: the file
     * itself plus one field per configured option.
     */
    private buildFormData(buffer: Buffer, fileName: string): FormData {
        const formData = new FormData()
        formData.append('files', new Blob([buffer]), fileName)
        formData.append('strategy', this.strategy)

        // One 'ocr_languages' field is appended per configured language
        for (const language of this.ocrLanguages) {
            formData.append('ocr_languages', language)
        }

        if (this.encoding) {
            formData.append('encoding', this.encoding)
        }

        // These flags are only sent when explicitly enabled
        if (this.coordinates === true) {
            formData.append('coordinates', 'true')
        }
        if (this.pdfInferTableStructure === true) {
            formData.append('pdf_infer_table_structure', 'true')
        }
        if (this.xmlKeepTags === true) {
            formData.append('xml_keep_tags', 'true')
        }

        if (this.skipInferTableTypes) {
            formData.append('skip_infer_table_types', JSON.stringify(this.skipInferTableTypes))
        }
        if (this.hiResModelName) {
            formData.append('hi_res_model_name', this.hiResModelName)
        }
        if (this.includePageBreaks) {
            formData.append('include_page_breaks', 'true')
        }
        if (this.chunkingStrategy) {
            formData.append('chunking_strategy', this.chunkingStrategy)
        }

        // Checked against undefined (not truthiness) so that false and 0 are still sent
        if (this.multiPageSections !== undefined) {
            formData.append('multipage_sections', this.multiPageSections ? 'true' : 'false')
        }
        if (this.combineUnderNChars !== undefined) {
            formData.append('combine_under_n_chars', String(this.combineUnderNChars))
        }
        if (this.newAfterNChars !== undefined) {
            formData.append('new_after_n_chars', String(this.newAfterNChars))
        }
        if (this.maxCharacters !== undefined) {
            formData.append('max_characters', String(this.maxCharacters))
        }

        return formData
    }

    /**
     * Sends the file to the configured API URL and returns the elements of
     * the response that carry string text.
     *
     * @param buffer Raw file contents to upload.
     * @param fileName Name sent with the uploaded file and used in error messages.
     * @returns Response elements whose `text` property is a string.
     * @throws Error when the response status is not OK or the response body is not an array.
     */
    async _partition(buffer: Buffer, fileName: string): Promise<Element[]> {
        const response = await fetch(this.apiUrl, {
            method: 'POST',
            body: this.buildFormData(buffer, fileName),
            // An empty string is sent when no API key is configured
            headers: {
                'UNSTRUCTURED-API-KEY': this.apiKey ?? ''
            }
        })

        if (!response.ok) {
            // Report the uploaded file's name; filePath is never assigned by this class
            throw new Error(`Failed to partition file ${fileName} with error ${response.status} and message ${await response.text()}`)
        }

        const elements: unknown = await response.json()
        if (!Array.isArray(elements)) {
            // Serialise the payload so object responses do not print as "[object Object]"
            throw new Error(`Expected partitioning request to return an array, but got ${JSON.stringify(elements)}`)
        }

        // Optional chaining keeps null entries from throwing while filtering
        return elements.filter((el) => typeof el?.text === 'string') as Element[]
    }

    /**
     * Partitions the given file and wraps every text-bearing element in a
     * `Document`. Each document's metadata is the element metadata plus a
     * `category` key holding the element type.
     *
     * @param buffer Raw file contents to upload.
     * @param fileName Name sent with the uploaded file.
     * @returns One document per returned element that has string text.
     */
    async loadAndSplitBuffer(buffer: Buffer, fileName: string): Promise<Document[]> {
        const elements = await this._partition(buffer, fileName)

        const documents: Document[] = []
        for (const element of elements) {
            const { metadata, text } = element
            if (typeof text !== 'string') continue
            documents.push(
                new Document({
                    pageContent: text,
                    metadata: {
                        ...metadata,
                        category: element.type
                    }
                })
            )
        }

        return documents
    }

    /**
     * Not supported: this loader has no file path to read from.
     *
     * @throws Error always; callers should use `loadAndSplitBuffer()`.
     */
    async load(): Promise<Document[]> {
        throw new Error('load() is not supported for UnstructuredLoader. Use loadAndSplitBuffer() instead.')
    }
}
|
||||||
|
|
@ -1,12 +1,14 @@
|
||||||
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
|
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
|
||||||
import {
|
import {
|
||||||
UnstructuredLoader,
|
|
||||||
UnstructuredLoaderOptions,
|
UnstructuredLoaderOptions,
|
||||||
UnstructuredLoaderStrategy,
|
UnstructuredLoaderStrategy,
|
||||||
SkipInferTableTypes,
|
SkipInferTableTypes,
|
||||||
HiResModelName
|
HiResModelName,
|
||||||
|
UnstructuredLoader as LCUnstructuredLoader
|
||||||
} from 'langchain/document_loaders/fs/unstructured'
|
} from 'langchain/document_loaders/fs/unstructured'
|
||||||
import { getCredentialData, getCredentialParam } from '../../../src/utils'
|
import { getCredentialData, getCredentialParam } from '../../../src/utils'
|
||||||
|
import { getFileFromStorage } from '../../../src'
|
||||||
|
import { UnstructuredLoader } from './Unstructured'
|
||||||
|
|
||||||
class UnstructuredFile_DocumentLoaders implements INode {
|
class UnstructuredFile_DocumentLoaders implements INode {
|
||||||
label: string
|
label: string
|
||||||
|
|
@ -23,7 +25,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
|
||||||
constructor() {
|
constructor() {
|
||||||
this.label = 'Unstructured File Loader'
|
this.label = 'Unstructured File Loader'
|
||||||
this.name = 'unstructuredFileLoader'
|
this.name = 'unstructuredFileLoader'
|
||||||
this.version = 2.0
|
this.version = 3.0
|
||||||
this.type = 'Document'
|
this.type = 'Document'
|
||||||
this.icon = 'unstructured-file.svg'
|
this.icon = 'unstructured-file.svg'
|
||||||
this.category = 'Document Loaders'
|
this.category = 'Document Loaders'
|
||||||
|
|
@ -41,7 +43,18 @@ class UnstructuredFile_DocumentLoaders implements INode {
|
||||||
label: 'File Path',
|
label: 'File Path',
|
||||||
name: 'filePath',
|
name: 'filePath',
|
||||||
type: 'string',
|
type: 'string',
|
||||||
placeholder: ''
|
placeholder: '',
|
||||||
|
optional: true,
|
||||||
|
warning:
|
||||||
|
'Use the File Upload instead of File path. If file is uploaded, this path is ignored. Path will be deprecated in future releases.'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: 'Files Upload',
|
||||||
|
name: 'fileObject',
|
||||||
|
type: 'file',
|
||||||
|
description: 'Files to be processed. Multiple files can be uploaded.',
|
||||||
|
fileType:
|
||||||
|
'.txt, .text, .pdf, .docx, .doc, .jpg, .jpeg, .eml, .html, .htm, .md, .pptx, .ppt, .msg, .rtf, .xlsx, .xls, .odt, .epub'
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
label: 'Unstructured API URL',
|
label: 'Unstructured API URL',
|
||||||
|
|
@ -416,6 +429,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
|
||||||
const combineUnderNChars = nodeData.inputs?.combineUnderNChars as number
|
const combineUnderNChars = nodeData.inputs?.combineUnderNChars as number
|
||||||
const newAfterNChars = nodeData.inputs?.newAfterNChars as number
|
const newAfterNChars = nodeData.inputs?.newAfterNChars as number
|
||||||
const maxCharacters = nodeData.inputs?.maxCharacters as number
|
const maxCharacters = nodeData.inputs?.maxCharacters as number
|
||||||
|
const fileBase64 = nodeData.inputs?.fileObject as string
|
||||||
|
|
||||||
const obj: UnstructuredLoaderOptions = {
|
const obj: UnstructuredLoaderOptions = {
|
||||||
apiUrl: unstructuredAPIUrl,
|
apiUrl: unstructuredAPIUrl,
|
||||||
|
|
@ -438,8 +452,48 @@ class UnstructuredFile_DocumentLoaders implements INode {
|
||||||
const unstructuredAPIKey = getCredentialParam('unstructuredAPIKey', credentialData, nodeData)
|
const unstructuredAPIKey = getCredentialParam('unstructuredAPIKey', credentialData, nodeData)
|
||||||
if (unstructuredAPIKey) obj.apiKey = unstructuredAPIKey
|
if (unstructuredAPIKey) obj.apiKey = unstructuredAPIKey
|
||||||
|
|
||||||
const loader = new UnstructuredLoader(filePath, obj)
|
let docs: any[] = []
|
||||||
let docs = await loader.load()
|
let files: string[] = []
|
||||||
|
|
||||||
|
if (fileBase64) {
|
||||||
|
const loader = new UnstructuredLoader(obj)
|
||||||
|
//FILE-STORAGE::["CONTRIBUTING.md","LICENSE.md","README.md"]
|
||||||
|
if (fileBase64.startsWith('FILE-STORAGE::')) {
|
||||||
|
const fileName = fileBase64.replace('FILE-STORAGE::', '')
|
||||||
|
if (fileName.startsWith('[') && fileName.endsWith(']')) {
|
||||||
|
files = JSON.parse(fileName)
|
||||||
|
} else {
|
||||||
|
files = [fileName]
|
||||||
|
}
|
||||||
|
const chatflowid = options.chatflowid
|
||||||
|
|
||||||
|
for (const file of files) {
|
||||||
|
const fileData = await getFileFromStorage(file, chatflowid)
|
||||||
|
const loaderDocs = await loader.loadAndSplitBuffer(fileData, file)
|
||||||
|
docs.push(...loaderDocs)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (fileBase64.startsWith('[') && fileBase64.endsWith(']')) {
|
||||||
|
files = JSON.parse(fileBase64)
|
||||||
|
} else {
|
||||||
|
files = [fileBase64]
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const file of files) {
|
||||||
|
const splitDataURI = file.split(',')
|
||||||
|
const filename = splitDataURI.pop()?.split(':')[1] ?? ''
|
||||||
|
const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
|
||||||
|
const loaderDocs = await loader.loadAndSplitBuffer(bf, filename)
|
||||||
|
docs.push(...loaderDocs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (filePath) {
|
||||||
|
const loader = new LCUnstructuredLoader(filePath, obj)
|
||||||
|
const loaderDocs = await loader.load()
|
||||||
|
docs.push(...loaderDocs)
|
||||||
|
} else {
|
||||||
|
throw new Error('File path or File upload is required')
|
||||||
|
}
|
||||||
|
|
||||||
if (metadata) {
|
if (metadata) {
|
||||||
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
|
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
|
||||||
|
|
|
||||||
|
|
@ -104,6 +104,7 @@ export interface INodeProperties {
|
||||||
description?: string
|
description?: string
|
||||||
filePath?: string
|
filePath?: string
|
||||||
badge?: string
|
badge?: string
|
||||||
|
deprecateMessage?: string
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface INode extends INodeProperties {
|
export interface INode extends INodeProperties {
|
||||||
|
|
|
||||||
|
|
@ -66,7 +66,10 @@ const CanvasNode = ({ data }) => {
|
||||||
} else if (data.version && componentNode.version > data.version) {
|
} else if (data.version && componentNode.version > data.version) {
|
||||||
setWarningMessage(nodeOutdatedMessage(data.version, componentNode.version))
|
setWarningMessage(nodeOutdatedMessage(data.version, componentNode.version))
|
||||||
} else if (componentNode.badge === 'DEPRECATING') {
|
} else if (componentNode.badge === 'DEPRECATING') {
|
||||||
setWarningMessage('This node will be deprecated in the next release. Change to a new node tagged with NEW')
|
setWarningMessage(
|
||||||
|
componentNode?.deprecateMessage ??
|
||||||
|
'This node will be deprecated in the next release. Change to a new node tagged with NEW'
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}, [canvas.componentNodes, data.name, data.version])
|
}, [canvas.componentNodes, data.name, data.version])
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue