Feature/update json loader to have dynamic metadata (#3811)

update json loader to have dynamic metadata
2025-01-06 16:18:49 +00:00 · 2025-01-06 16:18:49 +00:00 · 7aead83d61
parent 4c400301f2
commit 7aead83d61
1 changed files with 249 additions and 8 deletions
--- a/packages/components/nodes/documentloaders/Json/Json.ts
+++ b/packages/components/nodes/documentloaders/Json/Json.ts
@ -1,8 +1,36 @@
 import { omit } from 'lodash'
 import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
 import { TextSplitter } from 'langchain/text_splitter'
-import { JSONLoader } from 'langchain/document_loaders/fs/json'
 import { getFileFromStorage, handleEscapeCharacters, INodeOutputsValue } from '../../../src'
+import { Document } from '@langchain/core/documents'
+import jsonpointer from 'jsonpointer'
+import type { readFile as ReadFileT } from 'node:fs/promises'
+import { BaseDocumentLoader } from 'langchain/document_loaders/base'
+
+const howToUseCode = `
+You can add metadata dynamically from the document:
+
+For example, if the JSON document is:
+\`\`\`json
+[
+    {
+        "url": "https://www.google.com",
+        "body": "This is body 1"
+    },
+    {
+        "url": "https://www.yahoo.com",
+        "body": "This is body 2"
+    }
+]
+
+\`\`\`
+
+You can have the "url" value as metadata by returning the following:
+\`\`\`json
+{
+    "url": "/url"
+}
+\`\`\``

 class Json_DocumentLoaders implements INode {
    label: string
@ -19,7 +47,7 @@ class Json_DocumentLoaders implements INode {
    constructor() {
        this.label = 'Json File'
        this.name = 'jsonFile'
-        this.version = 2.0
+        this.version = 3.0
        this.type = 'Document'
        this.icon = 'json.svg'
        this.category = 'Document Loaders'
@ -42,15 +70,21 @@ class Json_DocumentLoaders implements INode {
                label: 'Pointers Extraction (separated by commas)',
                name: 'pointersName',
                type: 'string',
-                description: 'Extracting multiple pointers',
-                placeholder: 'Enter pointers name',
+                description:
+                    'Ex: { "key": "value" }, Pointer Extraction = "key", "value" will be extracted as pageContent of the chunk. Use comma to separate multiple pointers',
+                placeholder: 'key1, key2',
                optional: true
            },
            {
                label: 'Additional Metadata',
                name: 'metadata',
                type: 'json',
-                description: 'Additional metadata to be added to the extracted documents',
+                description:
+                    'Additional metadata to be added to the extracted documents. You can add metadata dynamically from the document. Ex: { "key": "value", "source": "www.example.com" }. Metadata: { "page": "/source" } will extract the value of the key "source" from the document and add it to the metadata with the key "page"',
+                hint: {
+                    label: 'How to use',
+                    value: howToUseCode
+                },
                optional: true,
                additionalParams: true
            },
@ -118,7 +152,7 @@ class Json_DocumentLoaders implements INode {
                if (!file) continue
                const fileData = await getFileFromStorage(file, chatflowid)
                const blob = new Blob([fileData])
-                const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined)
+                const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined, metadata)

                if (textSplitter) {
                    let splittedDocs = await loader.load()
@ -141,7 +175,7 @@ class Json_DocumentLoaders implements INode {
                splitDataURI.pop()
                const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
                const blob = new Blob([bf])
-                const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined)
+                const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined, metadata)

                if (textSplitter) {
                    let splittedDocs = await loader.load()
@ -154,7 +188,8 @@ class Json_DocumentLoaders implements INode {
        }

        if (metadata) {
-            const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
+            let parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
+            parsedMetadata = removeValuesStartingWithSlash(parsedMetadata)
            docs = docs.map((doc) => ({
                ...doc,
                metadata:
@ -197,4 +232,210 @@ class Json_DocumentLoaders implements INode {
    }
 }

+const removeValuesStartingWithSlash = (obj: Record<string, any>): Record<string, any> => {
+    const result: Record<string, any> = {}
+
+    for (const key in obj) {
+        const value = obj[key]
+        if (typeof value === 'string' && value.startsWith('/')) {
+            continue
+        }
+        result[key] = value
+    }
+
+    return result
+}
+
+class TextLoader extends BaseDocumentLoader {
+    constructor(public filePathOrBlob: string | Blob) {
+        super()
+    }
+
+    protected async parse(raw: string): Promise<{ pageContent: string; metadata: ICommonObject }[]> {
+        return [{ pageContent: raw, metadata: {} }]
+    }
+
+    public async load(): Promise<Document[]> {
+        let text: string
+        let metadata: Record<string, string>
+        if (typeof this.filePathOrBlob === 'string') {
+            const { readFile } = await TextLoader.imports()
+            text = await readFile(this.filePathOrBlob, 'utf8')
+            metadata = { source: this.filePathOrBlob }
+        } else {
+            text = await this.filePathOrBlob.text()
+            metadata = { source: 'blob', blobType: this.filePathOrBlob.type }
+        }
+        const parsed = await this.parse(text)
+        parsed.forEach((parsedData, i) => {
+            const { pageContent } = parsedData
+            if (typeof pageContent !== 'string') {
+                throw new Error(`Expected string, at position ${i} got ${typeof pageContent}`)
+            }
+        })
+        return parsed.map((parsedData, i) => {
+            const { pageContent, metadata: additionalMetadata } = parsedData
+            return new Document({
+                pageContent,
+                metadata:
+                    parsed.length === 1
+                        ? { ...metadata, ...additionalMetadata }
+                        : {
+                              ...metadata,
+                              line: i + 1,
+                              ...additionalMetadata
+                          }
+            })
+        })
+    }
+
+    static async imports(): Promise<{
+        readFile: typeof ReadFileT
+    }> {
+        try {
+            const { readFile } = await import('node:fs/promises')
+            return { readFile }
+        } catch (e) {
+            console.error(e)
+            throw new Error(`Failed to load fs/promises. Make sure you are running in Node.js environment.`)
+        }
+    }
+}
+
+class JSONLoader extends TextLoader {
+    public pointers: string[]
+    private metadataMapping: Record<string, string>
+
+    constructor(filePathOrBlob: string | Blob, pointers: string | string[] = [], metadataMapping: Record<string, string> = {}) {
+        super(filePathOrBlob)
+        this.pointers = Array.isArray(pointers) ? pointers : [pointers]
+        if (metadataMapping) {
+            this.metadataMapping = typeof metadataMapping === 'object' ? metadataMapping : JSON.parse(metadataMapping)
+        }
+    }
+
+    protected async parse(raw: string): Promise<Document[]> {
+        const json = JSON.parse(raw.trim())
+        const documents: Document[] = []
+
+        // Handle both single object and array of objects
+        const jsonArray = Array.isArray(json) ? json : [json]
+
+        for (const item of jsonArray) {
+            const content = this.extractContent(item)
+            const metadata = this.extractMetadata(item)
+
+            for (const pageContent of content) {
+                documents.push({
+                    pageContent,
+                    metadata
+                })
+            }
+        }
+
+        return documents
+    }
+
+    /**
+     * Extracts content based on specified pointers or all strings if no pointers
+     */
+    private extractContent(json: any): string[] {
+        const compiledPointers = this.pointers.map((pointer) => jsonpointer.compile(pointer))
+
+        return this.extractArrayStringsFromObject(json, compiledPointers, !(this.pointers.length > 0))
+    }
+
+    /**
+     * Extracts metadata based on the mapping configuration
+     */
+    private extractMetadata(json: any): Record<string, any> {
+        let metadata: Record<string, any> = {}
+
+        if (this.metadataMapping) {
+            const values = Object.values(this.metadataMapping).filter((value) => typeof value === 'string' && value.startsWith('/'))
+            for (const value of values) {
+                if (value) {
+                    const key = Object.keys(this.metadataMapping).find((key) => this.metadataMapping?.[key] === value)
+                    if (key) {
+                        metadata = {
+                            ...metadata,
+                            [key]: jsonpointer.get(json, value)
+                        }
+                    }
+                }
+            }
+        }
+
+        return metadata
+    }
+
+    /**
+     * If JSON pointers are specified, return all strings below any of them
+     * and exclude all other nodes expect if they match a JSON pointer.
+     * If no JSON pointer is specified then return all string in the object.
+     */
+    private extractArrayStringsFromObject(
+        json: any,
+        pointers: jsonpointer[],
+        extractAllStrings = false,
+        keyHasBeenFound = false
+    ): string[] {
+        if (!json) {
+            return []
+        }
+
+        if (typeof json === 'string' && extractAllStrings) {
+            return [json]
+        }
+
+        if (Array.isArray(json) && extractAllStrings) {
+            let extractedString: string[] = []
+            for (const element of json) {
+                extractedString = extractedString.concat(this.extractArrayStringsFromObject(element, pointers, true))
+            }
+            return extractedString
+        }
+
+        if (typeof json === 'object') {
+            if (extractAllStrings) {
+                return this.extractArrayStringsFromObject(Object.values(json), pointers, true)
+            }
+
+            const targetedEntries = this.getTargetedEntries(json, pointers)
+            const thisLevelEntries = Object.values(json) as object[]
+            const notTargetedEntries = thisLevelEntries.filter((entry: object) => !targetedEntries.includes(entry))
+
+            let extractedStrings: string[] = []
+            if (targetedEntries.length > 0) {
+                for (const oneEntry of targetedEntries) {
+                    extractedStrings = extractedStrings.concat(this.extractArrayStringsFromObject(oneEntry, pointers, true, true))
+                }
+
+                for (const oneEntry of notTargetedEntries) {
+                    extractedStrings = extractedStrings.concat(this.extractArrayStringsFromObject(oneEntry, pointers, false, true))
+                }
+            } else if (extractAllStrings || !keyHasBeenFound) {
+                for (const oneEntry of notTargetedEntries) {
+                    extractedStrings = extractedStrings.concat(this.extractArrayStringsFromObject(oneEntry, pointers, extractAllStrings))
+                }
+            }
+
+            return extractedStrings
+        }
+
+        return []
+    }
+
+    private getTargetedEntries(json: object, pointers: jsonpointer[]): object[] {
+        const targetEntries = []
+        for (const pointer of pointers) {
+            const targetedEntry = pointer.get(json)
+            if (targetedEntry) {
+                targetEntries.push(targetedEntry)
+            }
+        }
+        return targetEntries
+    }
+}
+
 module.exports = { nodeClass: Json_DocumentLoaders }