Chore/JSON Array (#5467)

* add separate by JSON object

* add file check for Unstructured

* Enhance JSON DocumentLoader: Update label and description for 'Separate by JSON Object' option, and add type check for JSON objects in array processing.
This commit is contained in:
Henry Heng 2025-11-13 11:11:39 +00:00 committed by GitHub
parent 94cae3b66f
commit ceb0512e2f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 68 additions and 15 deletions

View File

@ -47,7 +47,7 @@ class Json_DocumentLoaders implements INode {
constructor() { constructor() {
this.label = 'Json File' this.label = 'Json File'
this.name = 'jsonFile' this.name = 'jsonFile'
this.version = 3.0 this.version = 3.1
this.type = 'Document' this.type = 'Document'
this.icon = 'json.svg' this.icon = 'json.svg'
this.category = 'Document Loaders' this.category = 'Document Loaders'
@ -66,6 +66,14 @@ class Json_DocumentLoaders implements INode {
type: 'TextSplitter', type: 'TextSplitter',
optional: true optional: true
}, },
{
label: 'Separate by JSON Object (JSON Array)',
name: 'separateByObject',
type: 'boolean',
description: 'If enabled and the file is a JSON Array, each JSON object will be extracted as a chunk',
optional: true,
additionalParams: true
},
{ {
label: 'Pointers Extraction (separated by commas)', label: 'Pointers Extraction (separated by commas)',
name: 'pointersName', name: 'pointersName',
@ -73,7 +81,10 @@ class Json_DocumentLoaders implements INode {
description: description:
'Ex: { "key": "value" }, Pointer Extraction = "key", "value" will be extracted as pageContent of the chunk. Use comma to separate multiple pointers', 'Ex: { "key": "value" }, Pointer Extraction = "key", "value" will be extracted as pageContent of the chunk. Use comma to separate multiple pointers',
placeholder: 'key1, key2', placeholder: 'key1, key2',
optional: true optional: true,
hide: {
separateByObject: true
}
}, },
{ {
label: 'Additional Metadata', label: 'Additional Metadata',
@ -122,6 +133,7 @@ class Json_DocumentLoaders implements INode {
const pointersName = nodeData.inputs?.pointersName as string const pointersName = nodeData.inputs?.pointersName as string
const metadata = nodeData.inputs?.metadata const metadata = nodeData.inputs?.metadata
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
const separateByObject = nodeData.inputs?.separateByObject as boolean
const output = nodeData.outputs?.output as string const output = nodeData.outputs?.output as string
let omitMetadataKeys: string[] = [] let omitMetadataKeys: string[] = []
@ -153,7 +165,7 @@ class Json_DocumentLoaders implements INode {
if (!file) continue if (!file) continue
const fileData = await getFileFromStorage(file, orgId, chatflowid) const fileData = await getFileFromStorage(file, orgId, chatflowid)
const blob = new Blob([fileData]) const blob = new Blob([fileData])
const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined, metadata) const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined, metadata, separateByObject)
if (textSplitter) { if (textSplitter) {
let splittedDocs = await loader.load() let splittedDocs = await loader.load()
@ -176,7 +188,7 @@ class Json_DocumentLoaders implements INode {
splitDataURI.pop() splitDataURI.pop()
const bf = Buffer.from(splitDataURI.pop() || '', 'base64') const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
const blob = new Blob([bf]) const blob = new Blob([bf])
const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined, metadata) const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined, metadata, separateByObject)
if (textSplitter) { if (textSplitter) {
let splittedDocs = await loader.load() let splittedDocs = await loader.load()
@ -306,13 +318,20 @@ class TextLoader extends BaseDocumentLoader {
class JSONLoader extends TextLoader { class JSONLoader extends TextLoader {
public pointers: string[] public pointers: string[]
private metadataMapping: Record<string, string> private metadataMapping: Record<string, string>
private separateByObject: boolean
constructor(filePathOrBlob: string | Blob, pointers: string | string[] = [], metadataMapping: Record<string, string> = {}) { constructor(
filePathOrBlob: string | Blob,
pointers: string | string[] = [],
metadataMapping: Record<string, string> = {},
separateByObject: boolean = false
) {
super(filePathOrBlob) super(filePathOrBlob)
this.pointers = Array.isArray(pointers) ? pointers : [pointers] this.pointers = Array.isArray(pointers) ? pointers : [pointers]
if (metadataMapping) { if (metadataMapping) {
this.metadataMapping = typeof metadataMapping === 'object' ? metadataMapping : JSON.parse(metadataMapping) this.metadataMapping = typeof metadataMapping === 'object' ? metadataMapping : JSON.parse(metadataMapping)
} }
this.separateByObject = separateByObject
} }
protected async parse(raw: string): Promise<Document[]> { protected async parse(raw: string): Promise<Document[]> {
@ -323,14 +342,24 @@ class JSONLoader extends TextLoader {
const jsonArray = Array.isArray(json) ? json : [json] const jsonArray = Array.isArray(json) ? json : [json]
for (const item of jsonArray) { for (const item of jsonArray) {
const content = this.extractContent(item) if (this.separateByObject) {
const metadata = this.extractMetadata(item) if (typeof item === 'object' && item !== null && !Array.isArray(item)) {
const metadata = this.extractMetadata(item)
for (const pageContent of content) { const pageContent = this.formatObjectAsKeyValue(item)
documents.push({ documents.push({
pageContent, pageContent,
metadata metadata
}) })
}
} else {
const content = this.extractContent(item)
const metadata = this.extractMetadata(item)
for (const pageContent of content) {
documents.push({
pageContent,
metadata
})
}
} }
} }
@ -370,6 +399,30 @@ class JSONLoader extends TextLoader {
return metadata return metadata
} }
/**
 * Renders a JSON object as human-readable `key: value` lines, one per leaf.
 * Nested plain objects are flattened with dot-notation key paths (e.g. `a.b: 1`);
 * arrays are kept inline as JSON, and null/undefined are interpolated as-is.
 *
 * @param obj    object whose entries are rendered (JSON-parsed data in practice)
 * @param prefix dot-joined key path accumulated during recursion; '' at the top level
 * @returns newline-joined `key: value` lines
 */
private formatObjectAsKeyValue(obj: any, prefix: string = ''): string {
    const rendered: string[] = []
    for (const [key, value] of Object.entries(obj)) {
        const pathKey = prefix ? `${prefix}.${key}` : key
        if (Array.isArray(value)) {
            // Arrays stay on one line as JSON rather than being flattened
            rendered.push(`${pathKey}: ${JSON.stringify(value)}`)
        } else if (value !== null && typeof value === 'object') {
            // Nested object: flatten recursively under the extended key path.
            // NOTE(review): an empty nested object contributes an empty line here — confirm acceptable
            rendered.push(this.formatObjectAsKeyValue(value, pathKey))
        } else {
            // Primitives plus null/undefined all stringify via template interpolation
            rendered.push(`${pathKey}: ${value}`)
        }
    }
    return rendered.join('\n')
}
/** /**
* If JSON pointers are specified, return all strings below any of them * If JSON pointers are specified, return all strings below any of them
* and exclude all other nodes except if they match a JSON pointer. * and exclude all other nodes except if they match a JSON pointer.

View File

@ -10,7 +10,7 @@ import {
import { getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils' import { getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils'
import { getFileFromStorage, INodeOutputsValue } from '../../../src' import { getFileFromStorage, INodeOutputsValue } from '../../../src'
import { UnstructuredLoader } from './Unstructured' import { UnstructuredLoader } from './Unstructured'
import { isPathTraversal } from '../../../src/validator' import { isPathTraversal, isUnsafeFilePath } from '../../../src/validator'
import sanitize from 'sanitize-filename' import sanitize from 'sanitize-filename'
import path from 'path' import path from 'path'
@ -565,7 +565,7 @@ class UnstructuredFile_DocumentLoaders implements INode {
throw new Error('Invalid file path format') throw new Error('Invalid file path format')
} }
if (isPathTraversal(filePath)) { if (isPathTraversal(filePath) || isUnsafeFilePath(filePath)) {
throw new Error('Invalid path characters detected in filePath - path traversal not allowed') throw new Error('Invalid path characters detected in filePath - path traversal not allowed')
} }