Chore/Remove Deprecated File Path Unstructured (#5478)

* Refactor UnstructuredFile and UnstructuredFolder loaders to remove deprecated file path handling and enhance folder path validation. Ensure folder paths are sanitized and validated against path traversal attacks.

* Update UnstructuredFolder.ts
This commit is contained in:
Henry Heng 2025-11-15 11:16:42 +00:00 committed by GitHub
parent ceb0512e2f
commit 4a642f02d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 2 additions and 47 deletions

View File

@@ -4,15 +4,11 @@ import {
UnstructuredLoaderOptions, UnstructuredLoaderOptions,
UnstructuredLoaderStrategy, UnstructuredLoaderStrategy,
SkipInferTableTypes, SkipInferTableTypes,
HiResModelName, HiResModelName
UnstructuredLoader as LCUnstructuredLoader
} from '@langchain/community/document_loaders/fs/unstructured' } from '@langchain/community/document_loaders/fs/unstructured'
import { getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils' import { getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils'
import { getFileFromStorage, INodeOutputsValue } from '../../../src' import { getFileFromStorage, INodeOutputsValue } from '../../../src'
import { UnstructuredLoader } from './Unstructured' import { UnstructuredLoader } from './Unstructured'
import { isPathTraversal, isUnsafeFilePath } from '../../../src/validator'
import sanitize from 'sanitize-filename'
import path from 'path'
class UnstructuredFile_DocumentLoaders implements INode { class UnstructuredFile_DocumentLoaders implements INode {
label: string label: string
@@ -44,17 +40,6 @@ class UnstructuredFile_DocumentLoaders implements INode {
optional: true optional: true
} }
this.inputs = [ this.inputs = [
/** Deprecated
{
label: 'File Path',
name: 'filePath',
type: 'string',
placeholder: '',
optional: true,
warning:
'Use the File Upload instead of File path. If file is uploaded, this path is ignored. Path will be deprecated in future releases.'
},
*/
{ {
label: 'Files Upload', label: 'Files Upload',
name: 'fileObject', name: 'fileObject',
@@ -455,7 +440,6 @@ class UnstructuredFile_DocumentLoaders implements INode {
} }
async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> { async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
const filePath = nodeData.inputs?.filePath as string
const unstructuredAPIUrl = nodeData.inputs?.unstructuredAPIUrl as string const unstructuredAPIUrl = nodeData.inputs?.unstructuredAPIUrl as string
const strategy = nodeData.inputs?.strategy as UnstructuredLoaderStrategy const strategy = nodeData.inputs?.strategy as UnstructuredLoaderStrategy
const encoding = nodeData.inputs?.encoding as string const encoding = nodeData.inputs?.encoding as string
@@ -560,37 +544,8 @@ class UnstructuredFile_DocumentLoaders implements INode {
docs.push(...loaderDocs) docs.push(...loaderDocs)
} }
} }
} else if (filePath) {
if (!filePath || typeof filePath !== 'string') {
throw new Error('Invalid file path format')
}
if (isPathTraversal(filePath) || isUnsafeFilePath(filePath)) {
throw new Error('Invalid path characters detected in filePath - path traversal not allowed')
}
const parsedPath = path.parse(filePath)
const sanitizedFilename = sanitize(parsedPath.base)
if (!sanitizedFilename || sanitizedFilename.trim() === '') {
throw new Error('Invalid filename after sanitization')
}
const sanitizedFilePath = path.join(parsedPath.dir, sanitizedFilename)
if (!path.isAbsolute(sanitizedFilePath)) {
throw new Error('File path must be absolute')
}
if (sanitizedFilePath.includes('..')) {
throw new Error('Invalid file path - directory traversal not allowed')
}
const loader = new LCUnstructuredLoader(sanitizedFilePath, obj)
const loaderDocs = await loader.load()
docs.push(...loaderDocs)
} else { } else {
throw new Error('File path or File upload is required') throw new Error('File upload is required')
} }
if (metadata) { if (metadata) {