diff --git a/packages/components/nodes/documentloaders/Jsonlines/Jsonlines.ts b/packages/components/nodes/documentloaders/Jsonlines/Jsonlines.ts index 2c895e8f5..4cb36f53c 100644 --- a/packages/components/nodes/documentloaders/Jsonlines/Jsonlines.ts +++ b/packages/components/nodes/documentloaders/Jsonlines/Jsonlines.ts @@ -1,8 +1,31 @@ import { omit } from 'lodash' import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' -import { JSONLinesLoader } from 'langchain/document_loaders/fs/json' +import jsonpointer from 'jsonpointer' import { getFileFromStorage } from '../../../src' +import { BaseDocumentLoader } from 'langchain/document_loaders/base' +import { Document } from '@langchain/core/documents' +import type { readFile as ReadFileT } from 'node:fs/promises' + +const howToUseCode = ` +You can add metadata dynamically from the document: + +For example, if the document is: +\`\`\`jsonl +{ + "source": "www.example.com", "content": "Hello World!" +} +{ + "source": "www.example2.com", "content": "Hi World!" 
+} +\`\`\` + +You can have the "source" value as metadata by returning the following: +\`\`\`json +{ + "source": "/source" +} +\`\`\`` class Jsonlines_DocumentLoaders implements INode { label: string @@ -18,7 +41,7 @@ class Jsonlines_DocumentLoaders implements INode { constructor() { this.label = 'Json Lines File' this.name = 'jsonlinesFile' - this.version = 1.0 + this.version = 2.0 this.type = 'Document' this.icon = 'jsonlines.svg' this.category = 'Document Loaders' @@ -41,14 +64,20 @@ class Jsonlines_DocumentLoaders implements INode { label: 'Pointer Extraction', name: 'pointerName', type: 'string', - placeholder: 'Enter pointer name', + placeholder: 'key', + description: 'Ex: { "key": "value" }, Pointer Extraction = "key", "value" will be extracted as pageContent of the chunk', optional: false }, { label: 'Additional Metadata', name: 'metadata', type: 'json', - description: 'Additional metadata to be added to the extracted documents', + description: + 'Additional metadata to be added to the extracted documents. You can add metadata dynamically from the document. Ex: { "key": "value", "source": "www.example.com" }. 
Metadata: { "page": "/source" } will extract the value of the key "source" from the document and add it to the metadata with the key "page"', + hint: { + label: 'How to use', + value: howToUseCode + }, optional: true, additionalParams: true }, @@ -96,7 +125,7 @@ class Jsonlines_DocumentLoaders implements INode { if (!file) continue const fileData = await getFileFromStorage(file, chatflowid) const blob = new Blob([fileData]) - const loader = new JSONLinesLoader(blob, pointer) + const loader = new JSONLinesLoader(blob, pointer, metadata) if (textSplitter) { let splittedDocs = await loader.load() @@ -119,7 +148,7 @@ class Jsonlines_DocumentLoaders implements INode { splitDataURI.pop() const bf = Buffer.from(splitDataURI.pop() || '', 'base64') const blob = new Blob([bf]) - const loader = new JSONLinesLoader(blob, pointer) + const loader = new JSONLinesLoader(blob, pointer, metadata) if (textSplitter) { let splittedDocs = await loader.load() @@ -132,7 +161,8 @@ class Jsonlines_DocumentLoaders implements INode { } if (metadata) { - const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata) + let parsedMetadata = typeof metadata === 'object' ? 
metadata : JSON.parse(metadata)
+            parsedMetadata = removeValuesStartingWithSlash(parsedMetadata)
             docs = docs.map((doc) => ({
                 ...doc,
                 metadata:
@@ -167,4 +197,171 @@ class Jsonlines_DocumentLoaders implements INode {
     }
 }
 
+/**
+ * Drops every top-level entry whose value is a string starting with '/'.
+ *
+ * JSONLinesLoader below reads such values as JSON pointers and resolves them against each line, so the caller strips
+ * them before merging the remaining entries into the documents as static metadata.
+ *
+ * @param obj - Metadata object
+ * @returns A new object holding the remaining entries; `obj` itself is left untouched
+ */
+const removeValuesStartingWithSlash = (obj: Record<string, any>): Record<string, any> => {
+    const result: Record<string, any> = {}
+
+    for (const key in obj) {
+        const value = obj[key]
+        if (typeof value === 'string' && value.startsWith('/')) {
+            continue
+        }
+        result[key] = value
+    }
+
+    return result
+}
+
+/**
+ * Turns a file path or a Blob into Document objects.
+ *
+ * The text is passed to `parse`; this base implementation keeps it as a single entry.
+ */
+class TextLoader extends BaseDocumentLoader {
+    constructor(public filePathOrBlob: string | Blob) {
+        super()
+    }
+
+    /**
+     * Converts the raw text into page contents with per-entry metadata.
+     *
+     * @param raw - Full text of the file or blob
+     * @returns A single entry holding the unchanged text and empty metadata
+     */
+    protected async parse(raw: string): Promise<{ pageContent: string; metadata: ICommonObject }[]> {
+        return [{ pageContent: raw, metadata: {} }]
+    }
+
+    /**
+     * Reads the source, parses it and wraps every parsed entry in a Document.
+     *
+     * Base metadata is `{ source: <path> }` for a path and `{ source: 'blob', blobType }` for a Blob. When `parse`
+     * yields more than one entry, a 1-based `line` index (position in the parsed list) is added. Metadata returned by
+     * `parse` is spread last, so it wins on key clashes.
+     *
+     * @returns The loaded documents
+     * @throws Error if a parsed entry has a non-string pageContent
+     */
+    public async load(): Promise<Document[]> {
+        let text: string
+        let metadata: Record<string, string>
+        if (typeof this.filePathOrBlob === 'string') {
+            const { readFile } = await TextLoader.imports()
+            text = await readFile(this.filePathOrBlob, 'utf8')
+            metadata = { source: this.filePathOrBlob }
+        } else {
+            text = await this.filePathOrBlob.text()
+            metadata = { source: 'blob', blobType: this.filePathOrBlob.type }
+        }
+        const parsed = await this.parse(text)
+        // Validate everything first so no Document is built from a partially valid result
+        parsed.forEach((parsedData, i) => {
+            const { pageContent } = parsedData
+            if (typeof pageContent !== 'string') {
+                throw new Error(`Expected string, at position ${i} got ${typeof pageContent}`)
+            }
+        })
+        return parsed.map((parsedData, i) => {
+            const { pageContent, metadata: additionalMetadata } = parsedData
+            return new Document({
+                pageContent,
+                metadata:
+                    parsed.length === 1
+                        ? { ...metadata, ...additionalMetadata }
+                        : {
+                              ...metadata,
+                              line: i + 1,
+                              ...additionalMetadata
+                          }
+            })
+        })
+    }
+
+    /**
+     * Lazily imports `readFile` so the module is only required when a path is loaded.
+     *
+     * @returns The `readFile` function from `node:fs/promises`
+     * @throws Error if the module cannot be imported
+     */
+    static async imports(): Promise<{
+        readFile: typeof ReadFileT
+    }> {
+        try {
+            const { readFile } = await import('node:fs/promises')
+            return { readFile }
+        } catch (e) {
+            console.error(e)
+            throw new Error(`Failed to load fs/promises. Make sure you are running in Node.js environment.`)
+        }
+    }
+}
+
+/**
+ * Loads JSON Lines text: every non-empty line is parsed as JSON and becomes one entry.
+ *
+ * `pointer` is the JSON pointer of the value used as pageContent. Entries of the optional metadata object whose value is
+ * a string starting with '/' are JSON pointers too: each is resolved against the line and stored under its own key.
+ */
+class JSONLinesLoader extends TextLoader {
+    metadata?: ICommonObject
+    // Returned by getAdditionalMetadata(); nothing in this class writes to it
+    additionalMetadata: ICommonObject[] = []
+
+    /**
+     * @param filePathOrBlob - Path or Blob holding the JSON Lines text
+     * @param pointer - JSON pointer to the pageContent value of each line
+     * @param metadata - Optional metadata object, or its JSON string form
+     * @throws SyntaxError if `metadata` is a string that is not valid JSON
+     */
+    constructor(filePathOrBlob: string | Blob, public pointer: string, metadata?: string | ICommonObject) {
+        super(filePathOrBlob)
+        if (metadata) {
+            this.metadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
+        }
+    }
+
+    /** @returns The `additionalMetadata` array held by this loader */
+    async getAdditionalMetadata(): Promise<ICommonObject[]> {
+        return this.additionalMetadata
+    }
+
+    /**
+     * Parses each non-empty line and extracts its pageContent plus pointer-based metadata.
+     *
+     * @param raw - JSON Lines text
+     * @returns One entry per non-empty line
+     * @throws SyntaxError if a line is not valid JSON
+     */
+    protected async parse(raw: string): Promise<{ pageContent: string; metadata: ICommonObject }[]> {
+        const jsons = raw
+            .split('\n')
+            .map((line) => line.trim())
+            .filter(Boolean)
+            .map((line) => JSON.parse(line))
+        const pointer = jsonpointer.compile(this.pointer)
+
+        // Pair each metadata key with its pointer, compiled once. Walking the entries instead of looking a key up by
+        // its value keeps every key when several of them share the same pointer.
+        const pointerEntries = Object.entries(this.metadata ?? {})
+            .filter(([, value]) => typeof value === 'string' && value.startsWith('/'))
+            .map(([key, value]) => ({ key, pointer: jsonpointer.compile(value) }))
+
+        return jsons.map((json) => {
+            const metadata: ICommonObject = {}
+            for (const entry of pointerEntries) {
+                metadata[entry.key] = entry.pointer.get(json)
+            }
+            return { pageContent: pointer.get(json), metadata }
+        })
+    }
+}
+
 module.exports = { nodeClass: Jsonlines_DocumentLoaders }