From a2c36b4447347ea8ed96f869a9aee15b56b42c2d Mon Sep 17 00:00:00 2001 From: Henry Heng Date: Mon, 25 Nov 2024 15:47:13 +0000 Subject: [PATCH] Feature/Add new doc store upsert and refresh API (#3556) add new doc store upsert and refresh API --- .../api-documentation/src/yml/swagger.yml | 258 +++--- .../Unstructured/UnstructuredFile.ts | 46 +- .../server/src/Interface.DocumentStore.ts | 160 ++-- .../src/controllers/documentstore/index.ts | 82 +- .../server/src/routes/documentstore/index.ts | 12 +- .../src/services/documentstore/index.ts | 567 ++++++++++--- packages/server/src/utils/createAttachment.ts | 15 +- packages/ui/src/api/documentstore.js | 10 +- packages/ui/src/routes/MainRoutes.jsx | 4 + .../ui-component/cards/DocumentStoreCard.jsx | 3 +- .../views/docstore/DocumentStoreDetail.jsx | 244 ++++-- .../views/docstore/DocumentStoreStatus.jsx | 29 +- .../docstore/LoaderConfigPreviewChunks.jsx | 6 +- .../src/views/docstore/ShowStoredChunks.jsx | 2 +- .../views/docstore/VectorStoreConfigure.jsx | 789 +++++++++--------- 15 files changed, 1424 insertions(+), 803 deletions(-) diff --git a/packages/api-documentation/src/yml/swagger.yml b/packages/api-documentation/src/yml/swagger.yml index da52b2f05..ad9b5e4b8 100644 --- a/packages/api-documentation/src/yml/swagger.yml +++ b/packages/api-documentation/src/yml/swagger.yml @@ -305,6 +305,10 @@ paths: type: string format: binary description: Files to be uploaded + base64: + type: boolean + default: false + description: Return contents of the files in base64 format required: - files required: true @@ -618,171 +622,109 @@ paths: '500': description: Internal server error - /document-store/loader/preview: + /document-store/upsert/{id}: post: tags: - document-store security: - bearerAuth: [] - summary: Preview document chunks - description: Preview document chunks from loader - operationId: previewChunking + summary: Upsert new document to document store + description: Upsert new document to document store + operationId: upsertDocument + parameters: + - in: path + name: id + required: true + schema: + type: string + format: uuid + description: Document store ID requestBody: content: application/json: schema: - $ref: '#/components/schemas/DocumentStoreLoaderForPreview' - required: true - responses: - '200': - description: Successfully preview chunks - content: - application/json: - schema: - type: object - properties: - chunks: - type: array - items: - $ref: '#/components/schemas/Document' - totalChunks: - type: integer - example: 10 - previewChunkCount: - type: integer - example: 5 - '400': - description: Invalid request body - '500': - description: Internal server error - - /document-store/loader/process: - post: - tags: - - document-store - security: - - bearerAuth: [] - summary: Process loading & chunking operation - description: Process loading & chunking operation of document from loader - operationId: processChunking - requestBody: - content: - application/json: + $ref: '#/components/schemas/DocumentStoreLoaderForUpsert' + multipart/form-data: schema: type: object - required: - - storeId - - id properties: - storeId: + files: + type: array + items: + type: string + format: binary + description: Files to be uploaded + loader: type: string - description: Document store ID - example: '603a7b51-ae7c-4b0a-8865-e454ed2f6766' - id: + nullable: true + example: '{"name":"plainText","config":{"text":"why the sky is blue"}}' + description: Loader configurations + splitter: type: string - description: Document loader ID. If your URL is /document-stores/{storeId}/{id}, then id is the last part of the URL - example: 'c427e569-b81a-469a-b14c-fa73dd5bae49' + nullable: true + example: '{"name":"recursiveCharacterTextSplitter","config":{"chunkSize":2000}}' + description: Splitter configurations + embedding: + type: string + nullable: true + example: '{"name":"openAIEmbeddings","config":{"modelName":"text-embedding-ada-002"}}' + description: Embedding configurations + vectorStore: + type: string + nullable: true + example: '{"name":"faiss"}' + description: Vector Store configurations + recordManager: + type: string + nullable: true + example: '{"name":"postgresRecordManager"}' + description: Record Manager configurations + required: + - files required: true responses: '200': - description: Successfully process chunking operation + description: Successfully execute upsert operation content: application/json: schema: - $ref: '#/components/schemas/DocumentStoreFileChunkPagedResponse' + $ref: '#/components/schemas/VectorUpsertResponse' '400': description: Invalid request body '500': description: Internal server error - /document-store/vectorstore/save: + /document-store/refresh/{id}: post: tags: - document-store security: - bearerAuth: [] - summary: Save upsert configuration of document store - description: Save upsert configuration of document store - operationId: saveVectorStoreConfig + summary: Re-process and upsert all documents in document store + description: Re-process and upsert all existing documents in document store + operationId: refreshDocument + parameters: + - in: path + name: id + required: true + schema: + type: string + format: uuid + description: Document store ID requestBody: content: application/json: schema: - type: object - required: - - storeId - properties: - storeId: - type: string - description: Document store ID - example: '603a7b51-ae7c-4b0a-8865-e454ed2f6766' - embeddingName: - type: string - description: Name of the embedding - example: 'openAIEmbeddings' - embeddingConfig: - type: object - description: Configuration of the embedding - example: { 'model': 'text-embedding-ada-002', 'credential': '1eba5808-c55b-4817-a285-b0c92846a7ad' } - vectorStoreName: - type: string - description: Name of the vector store - example: 'faiss' - vectorStoreConfig: - type: object - description: Configuration of the embedding - example: { 'basePath': './faiss' } - recordManagerName: - type: string - description: Name of the record manager - example: 'SQLiteRecordManager' - recordManagerConfig: - type: object - description: Configuration of the embedding - example: { 'databaseFilePath': './recordManager.db' } + $ref: '#/components/schemas/DocumentStoreLoaderForRefresh' required: true responses: '200': - description: Successfully save upsert configuration of document store + description: Successfully execute refresh operation content: application/json: - schema: - $ref: '#/components/schemas/DocumentStore' - - '400': - description: Invalid request body - '500': - description: Internal server error - - /document-store/vectorstore/insert: - post: - tags: - - document-store - security: - - bearerAuth: [] - summary: Upsert chunks from document store - description: Upsert chunks from document store using the saved configuration - operationId: insertIntoVectorStore - requestBody: - content: - application/json: - schema: - type: object - required: - - storeId - properties: - storeId: - type: string - description: Document store ID - example: '603a7b51-ae7c-4b0a-8865-e454ed2f6766' - required: true - responses: - '200': - description: Successfully save upsert configuration of document store - content: - application/json: - schema: + type: array + items: $ref: '#/components/schemas/VectorUpsertResponse' '400': @@ -2220,6 +2162,72 @@ components: description: type: string + DocumentStoreLoaderForUpsert: + type: object + properties: + docId: + type: string + format: uuid + description: Document ID within the store. If provided, existing configuration from the document will be used for the new document + loader: + type: object + properties: + name: + type: string + example: plainText + description: Name of the loader (camelCase) + config: + type: object + description: Configuration for the loader + splitter: + type: object + properties: + name: + type: string + example: recursiveCharacterTextSplitter + description: Name of the text splitter (camelCase) + config: + type: object + description: Configuration for the text splitter + embedding: + type: object + properties: + name: + type: string + example: openAIEmbeddings + description: Name of the embedding generator (camelCase) + config: + type: object + description: Configuration for the embedding generator + vectorStore: + type: object + properties: + name: + type: string + example: faiss + description: Name of the vector store (camelCase) + config: + type: object + description: Configuration for the vector store + recordManager: + type: object + properties: + name: + type: string + example: postgresRecordManager + description: Name of the record manager (camelCase) + config: + type: object + description: Configuration for the record manager + + DocumentStoreLoaderForRefresh: + type: object + properties: + items: + type: array + items: + $ref: '#/components/schemas/DocumentStoreLoaderForUpsert' + ChatMessageFeedback: type: object properties: diff --git a/packages/components/nodes/documentloaders/Unstructured/UnstructuredFile.ts b/packages/components/nodes/documentloaders/Unstructured/UnstructuredFile.ts index 28dc793ec..45c098df7 100644 --- a/packages/components/nodes/documentloaders/Unstructured/UnstructuredFile.ts +++ b/packages/components/nodes/documentloaders/Unstructured/UnstructuredFile.ts @@ -26,7 +26,7 @@ class UnstructuredFile_DocumentLoaders implements INode { constructor() { this.label = 'Unstructured File Loader' this.name = 'unstructuredFileLoader' - this.version = 3.0 + this.version = 4.0 this.type = 'Document' this.icon = 'unstructured-file.svg' this.category = 'Document Loaders' @@ -40,6 +40,7 @@ class UnstructuredFile_DocumentLoaders implements INode { optional: true } this.inputs = [ + /** Deprecated { label: 'File Path', name: 'filePath', @@ -49,6 +50,7 @@ class UnstructuredFile_DocumentLoaders implements INode { warning: 'Use the File Upload instead of File path. If file is uploaded, this path is ignored. Path will be deprecated in future releases.' }, + */ { label: 'Files Upload', name: 'fileObject', @@ -200,7 +202,7 @@ class UnstructuredFile_DocumentLoaders implements INode { { label: 'Hi-Res Model Name', name: 'hiResModelName', - description: 'The name of the inference model used when strategy is hi_res. Default: detectron2_onnx.', + description: 'The name of the inference model used when strategy is hi_res', type: 'options', options: [ { @@ -227,8 +229,7 @@ class UnstructuredFile_DocumentLoaders implements INode { } ], optional: true, - additionalParams: true, - default: 'detectron2_onnx' + additionalParams: true }, { label: 'Chunking Strategy', @@ -241,9 +242,21 @@ class UnstructuredFile_DocumentLoaders implements INode { label: 'None', name: 'None' }, + { + label: 'Basic', + name: 'basic' + }, { label: 'By Title', name: 'by_title' + }, + { + label: 'By Page', + name: 'by_page' + }, + { + label: 'By Similarity', + name: 'by_similarity' } ], optional: true, @@ -434,15 +447,15 @@ class UnstructuredFile_DocumentLoaders implements INode { : ([] as SkipInferTableTypes[]) const hiResModelName = nodeData.inputs?.hiResModelName as HiResModelName const includePageBreaks = nodeData.inputs?.includePageBreaks as boolean - const chunkingStrategy = nodeData.inputs?.chunkingStrategy as 'None' | 'by_title' + const chunkingStrategy = nodeData.inputs?.chunkingStrategy as string const metadata = nodeData.inputs?.metadata const sourceIdKey = (nodeData.inputs?.sourceIdKey as string) || 'source' const ocrLanguages = nodeData.inputs?.ocrLanguages ? JSON.parse(nodeData.inputs?.ocrLanguages as string) : ([] as string[]) const xmlKeepTags = nodeData.inputs?.xmlKeepTags as boolean const multiPageSections = nodeData.inputs?.multiPageSections as boolean - const combineUnderNChars = nodeData.inputs?.combineUnderNChars as number - const newAfterNChars = nodeData.inputs?.newAfterNChars as number - const maxCharacters = nodeData.inputs?.maxCharacters as number + const combineUnderNChars = nodeData.inputs?.combineUnderNChars as string + const newAfterNChars = nodeData.inputs?.newAfterNChars as string + const maxCharacters = nodeData.inputs?.maxCharacters as string const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string let omitMetadataKeys: string[] = [] @@ -471,10 +484,19 @@ class UnstructuredFile_DocumentLoaders implements INode { chunkingStrategy, ocrLanguages, xmlKeepTags, - multiPageSections, - combineUnderNChars, - newAfterNChars, - maxCharacters + multiPageSections + } + + if (combineUnderNChars) { + obj.combineUnderNChars = parseInt(combineUnderNChars, 10) + } + + if (newAfterNChars) { + obj.newAfterNChars = parseInt(newAfterNChars, 10) + } + + if (maxCharacters) { + obj.maxCharacters = parseInt(maxCharacters, 10) } const credentialData = await getCredentialData(nodeData.credential ?? '', options) diff --git a/packages/server/src/Interface.DocumentStore.ts b/packages/server/src/Interface.DocumentStore.ts index e319fa748..44bb10dc4 100644 --- a/packages/server/src/Interface.DocumentStore.ts +++ b/packages/server/src/Interface.DocumentStore.ts @@ -1,3 +1,4 @@ +import { ICommonObject } from 'flowise-components' import { DocumentStore } from './database/entities/DocumentStore' export enum DocumentStoreStatus { @@ -36,23 +37,25 @@ export interface IDocumentStoreFileChunk { export interface IDocumentStoreFileChunkPagedResponse { chunks: IDocumentStoreFileChunk[] count: number + characters: number file?: IDocumentStoreLoader currentPage: number storeName: string description: string + docId: string } export interface IDocumentStoreLoader { - id: string - loaderId: string - loaderName: string - loaderConfig: any // JSON string - splitterId: string - splitterName: string - splitterConfig: any // JSON string - totalChunks: number - totalChars: number - status: DocumentStoreStatus + id?: string + loaderId?: string + loaderName?: string + loaderConfig?: any // JSON string + splitterId?: string + splitterName?: string + splitterConfig?: any // JSON string + totalChunks?: number + totalChars?: number + status?: DocumentStoreStatus storeId?: string files?: IDocumentStoreLoaderFile[] source?: string @@ -60,9 +63,37 @@ export interface IDocumentStoreLoader { } export interface IDocumentStoreLoaderForPreview extends IDocumentStoreLoader { - rehydrated: boolean - preview: boolean - previewChunkCount: number + rehydrated?: boolean + preview?: boolean + previewChunkCount?: number +} + +export interface IDocumentStoreUpsertData { + docId: string + loader?: { + name: string + config: ICommonObject + } + splitter?: { + name: string + config: ICommonObject + } + vectorStore?: { + name: string + config: ICommonObject + } + embedding?: { + name: string + config: ICommonObject + } + recordManager?: { + name: string + config: ICommonObject + } +} + +export interface IDocumentStoreRefreshData { + items: IDocumentStoreUpsertData[] } export interface IDocumentStoreLoaderFile { @@ -79,6 +110,72 @@ export interface IDocumentStoreWhereUsed { name: string } +const getFileName = (fileBase64: string) => { + let fileNames = [] + if (fileBase64.startsWith('FILE-STORAGE::')) { + const names = fileBase64.substring(14) + if (names.includes('[') && names.includes(']')) { + const files = JSON.parse(names) + return files.join(', ') + } else { + return fileBase64.substring(14) + } + } + if (fileBase64.startsWith('[') && fileBase64.endsWith(']')) { + const files = JSON.parse(fileBase64) + for (const file of files) { + const splitDataURI = file.split(',') + const filename = splitDataURI[splitDataURI.length - 1].split(':')[1] + fileNames.push(filename) + } + return fileNames.join(', ') + } else { + const splitDataURI = fileBase64.split(',') + const filename = splitDataURI[splitDataURI.length - 1].split(':')[1] + return filename + } +} + +export const addLoaderSource = (loader: IDocumentStoreLoader, isGetFileNameOnly = false) => { + let source = 'None' + + const handleUnstructuredFileLoader = (config: any, isGetFileNameOnly: boolean): string => { + if (config.fileObject) { + return isGetFileNameOnly ? getFileName(config.fileObject) : config.fileObject.replace('FILE-STORAGE::', '') + } + return config.filePath || 'None' + } + + switch (loader.loaderId) { + case 'pdfFile': + case 'jsonFile': + case 'csvFile': + case 'file': + case 'jsonlinesFile': + case 'txtFile': + source = isGetFileNameOnly + ? getFileName(loader.loaderConfig[loader.loaderId]) + : loader.loaderConfig[loader.loaderId]?.replace('FILE-STORAGE::', '') || 'None' + break + case 'apiLoader': + source = loader.loaderConfig.url + ' (' + loader.loaderConfig.method + ')' + break + case 'cheerioWebScraper': + case 'playwrightWebScraper': + case 'puppeteerWebScraper': + source = loader.loaderConfig.url || 'None' + break + case 'unstructuredFileLoader': + source = handleUnstructuredFileLoader(loader.loaderConfig, isGetFileNameOnly) + break + default: + source = 'None' + break + } + + return source +} + export class DocumentStoreDTO { id: string name: string @@ -130,40 +227,9 @@ export class DocumentStoreDTO { if (entity.loaders) { documentStoreDTO.loaders = JSON.parse(entity.loaders) documentStoreDTO.loaders.map((loader) => { - documentStoreDTO.totalChars += loader.totalChars - documentStoreDTO.totalChunks += loader.totalChunks - switch (loader.loaderId) { - case 'pdfFile': - loader.source = loader.loaderConfig.pdfFile.replace('FILE-STORAGE::', '') - break - case 'apiLoader': - loader.source = loader.loaderConfig.url + ' (' + loader.loaderConfig.method + ')' - break - case 'cheerioWebScraper': - loader.source = loader.loaderConfig.url - break - case 'playwrightWebScraper': - loader.source = loader.loaderConfig.url - break - case 'puppeteerWebScraper': - loader.source = loader.loaderConfig.url - break - case 'jsonFile': - loader.source = loader.loaderConfig.jsonFile.replace('FILE-STORAGE::', '') - break - case 'docxFile': - loader.source = loader.loaderConfig.docxFile.replace('FILE-STORAGE::', '') - break - case 'textFile': - loader.source = loader.loaderConfig.txtFile.replace('FILE-STORAGE::', '') - break - case 'unstructuredFileLoader': - loader.source = loader.loaderConfig.filePath - break - default: - loader.source = 'None' - break - } + documentStoreDTO.totalChars += loader.totalChars || 0 + documentStoreDTO.totalChunks += loader.totalChunks || 0 + loader.source = addLoaderSource(loader) if (loader.status !== 'SYNC') { documentStoreDTO.status = DocumentStoreStatus.STALE } diff --git a/packages/server/src/controllers/documentstore/index.ts b/packages/server/src/controllers/documentstore/index.ts index 100413594..6d56fa184 100644 --- a/packages/server/src/controllers/documentstore/index.ts +++ b/packages/server/src/controllers/documentstore/index.ts @@ -4,6 +4,15 @@ import documentStoreService from '../../services/documentstore' import { DocumentStore } from '../../database/entities/DocumentStore' import { InternalFlowiseError } from '../../errors/internalFlowiseError' import { DocumentStoreDTO } from '../../Interface' +import { getRateLimiter } from '../../utils/rateLimit' + +const getRateLimiterMiddleware = async (req: Request, res: Response, next: NextFunction) => { + try { + return getRateLimiter(req, res, next) + } catch (error) { + next(error) + } +} const createDocumentStore = async (req: Request, res: Response, next: NextFunction) => { try { @@ -160,16 +169,39 @@ const editDocumentStoreFileChunk = async (req: Request, res: Response, next: Nex } } -const processFileChunks = async (req: Request, res: Response, next: NextFunction) => { +const saveProcessingLoader = async (req: Request, res: Response, next: NextFunction) => { try { if (typeof req.body === 'undefined') { throw new InternalFlowiseError( StatusCodes.PRECONDITION_FAILED, - `Error: documentStoreController.processFileChunks - body not provided!` + `Error: documentStoreController.saveProcessingLoader - body not provided!` ) } const body = req.body - const apiResponse = await documentStoreService.processAndSaveChunks(body) + const apiResponse = await documentStoreService.saveProcessingLoader(body) + return res.json(apiResponse) + } catch (error) { + next(error) + } +} + +const processLoader = async (req: Request, res: Response, next: NextFunction) => { + try { + if (typeof req.params.loaderId === 'undefined' || req.params.loaderId === '') { + throw new InternalFlowiseError( + StatusCodes.PRECONDITION_FAILED, + `Error: documentStoreController.processLoader - loaderId not provided!` + ) + } + if (typeof req.body === 'undefined') { + throw new InternalFlowiseError( + StatusCodes.PRECONDITION_FAILED, + `Error: documentStoreController.processLoader - body not provided!` + ) + } + const docLoaderId = req.params.loaderId + const body = req.body + const apiResponse = await documentStoreService.processLoader(body, docLoaderId) return res.json(apiResponse) } catch (error) { next(error) @@ -342,6 +374,42 @@ const getRecordManagerProviders = async (req: Request, res: Response, next: Next } } +const upsertDocStoreMiddleware = async (req: Request, res: Response, next: NextFunction) => { + try { + if (typeof req.params.id === 'undefined' || req.params.id === '') { + throw new InternalFlowiseError( + StatusCodes.PRECONDITION_FAILED, + `Error: documentStoreController.upsertDocStoreMiddleware - storeId not provided!` + ) + } + if (typeof req.body === 'undefined') { + throw new Error('Error: documentStoreController.upsertDocStoreMiddleware - body not provided!') + } + const body = req.body + const files = (req.files as Express.Multer.File[]) || [] + const apiResponse = await documentStoreService.upsertDocStoreMiddleware(req.params.id, body, files) + return res.json(apiResponse) + } catch (error) { + next(error) + } +} + +const refreshDocStoreMiddleware = async (req: Request, res: Response, next: NextFunction) => { + try { + if (typeof req.params.id === 'undefined' || req.params.id === '') { + throw new InternalFlowiseError( + StatusCodes.PRECONDITION_FAILED, + `Error: documentStoreController.refreshDocStoreMiddleware - storeId not provided!` + ) + } + const body = req.body + const apiResponse = await documentStoreService.refreshDocStoreMiddleware(req.params.id, body) + return res.json(apiResponse) + } catch (error) { + next(error) + } +} + export default { deleteDocumentStore, createDocumentStore, @@ -350,7 +418,7 @@ export default { getDocumentStoreById, getDocumentStoreFileChunks, updateDocumentStore, - processFileChunks, + processLoader, previewFileChunks, getDocumentLoaders, deleteDocumentStoreFileChunk, @@ -362,5 +430,9 @@ export default { saveVectorStoreConfig, queryVectorStore, deleteVectorStoreFromStore, - updateVectorStoreConfigOnly + updateVectorStoreConfigOnly, + getRateLimiterMiddleware, + upsertDocStoreMiddleware, + refreshDocStoreMiddleware, + saveProcessingLoader } diff --git a/packages/server/src/routes/documentstore/index.ts b/packages/server/src/routes/documentstore/index.ts index 3f4cb9452..789a35662 100644 --- a/packages/server/src/routes/documentstore/index.ts +++ b/packages/server/src/routes/documentstore/index.ts @@ -1,6 +1,14 @@ import express from 'express' import documentStoreController from '../../controllers/documentstore' +import multer from 'multer' +import path from 'path' + const router = express.Router() +const upload = multer({ dest: `${path.join(__dirname, '..', '..', '..', 'uploads')}/` }) + +router.post(['/upsert/', '/upsert/:id'], upload.array('files'), documentStoreController.upsertDocStoreMiddleware) + +router.post(['/refresh/', '/refresh/:id'], documentStoreController.refreshDocStoreMiddleware) /** Document Store Routes */ // Create document store @@ -22,8 +30,10 @@ router.get('/components/loaders', documentStoreController.getDocumentLoaders) router.delete('/loader/:id/:loaderId', documentStoreController.deleteLoaderFromDocumentStore) // chunking preview router.post('/loader/preview', documentStoreController.previewFileChunks) +// saving process +router.post('/loader/save', documentStoreController.saveProcessingLoader) // chunking process -router.post('/loader/process', documentStoreController.processFileChunks) +router.post('/loader/process/:loaderId', documentStoreController.processLoader) /** Document Store - Loaders - Chunks */ // delete specific file chunk from the store diff --git a/packages/server/src/services/documentstore/index.ts b/packages/server/src/services/documentstore/index.ts index a8b69ac55..0ee6b83c8 100644 --- a/packages/server/src/services/documentstore/index.ts +++ b/packages/server/src/services/documentstore/index.ts @@ -1,20 +1,28 @@ import { getRunningExpressApp } from '../../utils/getRunningExpressApp' import { DocumentStore } from '../../database/entities/DocumentStore' +import * as fs from 'fs' +import * as path from 'path' import { + addArrayFilesToStorage, addSingleFileToStorage, getFileFromStorage, ICommonObject, IDocument, + mapExtToInputField, + mapMimeTypeToInputField, removeFilesFromStorage, removeSpecificFileFromStorage } from 'flowise-components' import { + addLoaderSource, ChatType, DocumentStoreStatus, IDocumentStoreFileChunkPagedResponse, IDocumentStoreLoader, IDocumentStoreLoaderFile, IDocumentStoreLoaderForPreview, + IDocumentStoreRefreshData, + IDocumentStoreUpsertData, IDocumentStoreWhereUsed, INodeData } from '../../Interface' @@ -75,7 +83,7 @@ const getAllDocumentFileChunks = async () => { } } -const deleteLoaderFromDocumentStore = async (storeId: string, loaderId: string) => { +const deleteLoaderFromDocumentStore = async (storeId: string, docId: string) => { try { const appServer = getRunningExpressApp() const entity = await appServer.AppDataSource.getRepository(DocumentStore).findOneBy({ @@ -88,12 +96,16 @@ const deleteLoaderFromDocumentStore = async (storeId: string, loaderId: string) ) } const existingLoaders = JSON.parse(entity.loaders) - const found = existingLoaders.find((uFile: IDocumentStoreLoader) => uFile.id === loaderId) + const found = existingLoaders.find((loader: IDocumentStoreLoader) => loader.id === docId) if (found) { if (found.files?.length) { for (const file of found.files) { if (file.name) { - await removeSpecificFileFromStorage(DOCUMENT_STORE_BASE_FOLDER, storeId, file.name) + try { + await removeSpecificFileFromStorage(DOCUMENT_STORE_BASE_FOLDER, storeId, file.name) + } catch (error) { + console.error(error) + } } } } @@ -169,7 +181,7 @@ const getUsedChatflowNames = async (entity: DocumentStore) => { } // Get chunks for a specific loader or store -const getDocumentStoreFileChunks = async (storeId: string, fileId: string, pageNo: number = 1) => { +const getDocumentStoreFileChunks = async (storeId: string, docId: string, pageNo: number = 1) => { try { const appServer = getRunningExpressApp() const entity = await appServer.AppDataSource.getRepository(DocumentStore).findOneBy({ @@ -184,29 +196,34 @@ const getDocumentStoreFileChunks = async (storeId: string, fileId: string, pageN const loaders = JSON.parse(entity.loaders) let found: IDocumentStoreLoader | undefined - if (fileId !== 'all') { - found = loaders.find((loader: IDocumentStoreLoader) => loader.id === fileId) + if (docId !== 'all') { + found = loaders.find((loader: IDocumentStoreLoader) => loader.id === docId) if (!found) { throw new InternalFlowiseError( StatusCodes.NOT_FOUND, - `Error: documentStoreServices.getDocumentStoreById - Document file ${fileId} not found` + `Error: documentStoreServices.getDocumentStoreById - Document loader ${docId} not found` ) } } - let totalChars = 0 - loaders.forEach((loader: IDocumentStoreLoader) => { - totalChars += loader.totalChars - }) if (found) { - found.totalChars = totalChars - found.id = fileId + found.id = docId found.status = entity.status } + + let characters = 0 + if (docId === 'all') { + loaders.forEach((loader: IDocumentStoreLoader) => { + characters += loader.totalChars || 0 + }) + } else { + characters = found?.totalChars || 0 + } + const PAGE_SIZE = 50 const skip = (pageNo - 1) * PAGE_SIZE const take = PAGE_SIZE - let whereCondition: any = { docId: fileId } - if (fileId === 'all') { + let whereCondition: any = { docId: docId } + if (docId === 'all') { whereCondition = { storeId: storeId } } const count = await appServer.AppDataSource.getRepository(DocumentStoreFileChunk).count({ @@ -222,7 +239,7 @@ const getDocumentStoreFileChunks = async (storeId: string, fileId: string, pageN }) if (!chunksWithCount) { - throw new InternalFlowiseError(StatusCodes.NOT_FOUND, `File ${fileId} not found`) + throw new InternalFlowiseError(StatusCodes.NOT_FOUND, `Chunks with docId: ${docId} not found`) } const response: IDocumentStoreFileChunkPagedResponse = { @@ -231,7 +248,9 @@ const getDocumentStoreFileChunks = async (storeId: string, fileId: string, pageN file: found, currentPage: pageNo, storeName: entity.name, - description: entity.description + description: entity.description, + docId: docId, + characters } return response } catch (error) { @@ -465,7 +484,7 @@ const _splitIntoChunks = async (data: IDocumentStoreLoaderForPreview) => { try { const appServer = getRunningExpressApp() let splitterInstance = null - if (data.splitterConfig && Object.keys(data.splitterConfig).length > 0) { + if (data.splitterId && data.splitterConfig && Object.keys(data.splitterConfig).length > 0) { const nodeInstanceFilePath = appServer.nodesPool.componentNodes[data.splitterId].filePath as string const nodeModule = await import(nodeInstanceFilePath) const newNodeInstance = new nodeModule.nodeClass() @@ -475,11 +494,12 @@ const _splitIntoChunks = async (data: IDocumentStoreLoaderForPreview) => { } splitterInstance = await newNodeInstance.init(nodeData) } + if (!data.loaderId) return [] const nodeInstanceFilePath = appServer.nodesPool.componentNodes[data.loaderId].filePath as string const nodeModule = await import(nodeInstanceFilePath) // doc loader configs const nodeData = { - credential: data.credential || undefined, + credential: data.credential || data.loaderConfig['FLOWISE_CREDENTIAL_ID'] || undefined, inputs: { ...data.loaderConfig, textSplitter: splitterInstance }, outputs: { output: 'document' } } @@ -568,9 +588,9 @@ const previewChunks = async (data: IDocumentStoreLoaderForPreview) => { // if -1, return all chunks if (data.previewChunkCount === -1) data.previewChunkCount = totalChunks // return all docs if the user ask for more than we have - if (totalChunks <= data.previewChunkCount) data.previewChunkCount = totalChunks + if (totalChunks <= (data.previewChunkCount || 0)) data.previewChunkCount = totalChunks // return only the first n chunks - if (totalChunks > data.previewChunkCount) docs = docs.slice(0, data.previewChunkCount) + if (totalChunks > (data.previewChunkCount || 0)) docs = docs.slice(0, data.previewChunkCount) return { chunks: docs, totalChunks: totalChunks, previewChunkCount: data.previewChunkCount } } catch (error) { @@ -581,7 +601,7 @@ const previewChunks = async (data: IDocumentStoreLoaderForPreview) => { } } -const processAndSaveChunks = async (data: IDocumentStoreLoaderForPreview) => { +const saveProcessingLoader = async (data: IDocumentStoreLoaderForPreview): Promise => { try { const appServer = getRunningExpressApp() const entity = await appServer.AppDataSource.getRepository(DocumentStore).findOneBy({ @@ -590,14 +610,14 @@ const processAndSaveChunks = async (data: IDocumentStoreLoaderForPreview) => { if (!entity) { throw new InternalFlowiseError( StatusCodes.NOT_FOUND, - `Error: documentStoreServices.processAndSaveChunks - Document store ${data.storeId} not found` + `Error: documentStoreServices.saveProcessingLoader - Document store ${data.storeId} not found` ) } const existingLoaders = JSON.parse(entity.loaders) - const newLoaderId = data.id ?? uuidv4() - const found = existingLoaders.find((ldr: IDocumentStoreLoader) => ldr.id === newLoaderId) + const newDocLoaderId = data.id ?? uuidv4() + const found = existingLoaders.find((ldr: IDocumentStoreLoader) => ldr.id === newDocLoaderId) if (found) { - const foundIndex = existingLoaders.findIndex((ldr: IDocumentStoreLoader) => ldr.id === newLoaderId) + const foundIndex = existingLoaders.findIndex((ldr: IDocumentStoreLoader) => ldr.id === newDocLoaderId) if (!data.loaderId) data.loaderId = found.loaderId if (!data.loaderName) data.loaderName = found.loaderName @@ -629,7 +649,7 @@ const processAndSaveChunks = async (data: IDocumentStoreLoaderForPreview) => { entity.loaders = JSON.stringify(existingLoaders) } else { let loader: IDocumentStoreLoader = { - id: newLoaderId, + id: newDocLoaderId, loaderId: data.loaderId, loaderName: data.loaderName, loaderConfig: data.loaderConfig, @@ -647,13 +667,40 @@ const processAndSaveChunks = async (data: IDocumentStoreLoaderForPreview) => { entity.loaders = JSON.stringify(existingLoaders) } await appServer.AppDataSource.getRepository(DocumentStore).save(entity) - // this method will run async, will have to be moved to a worker thread - _saveChunksToStorage(data, entity, newLoaderId).then(() => {}) - return getDocumentStoreFileChunks(data.storeId as string, newLoaderId) + const newLoaders = JSON.parse(entity.loaders) + const newLoader = newLoaders.find((ldr: IDocumentStoreLoader) => ldr.id === newDocLoaderId) + if (!newLoader) { + throw new Error(`Loader ${newDocLoaderId} not found`) + } + newLoader.source = addLoaderSource(newLoader, true) + return newLoader } catch (error) { throw new InternalFlowiseError( StatusCodes.INTERNAL_SERVER_ERROR, - `Error: documentStoreServices.processAndSaveChunks - ${getErrorMessage(error)}` + `Error: documentStoreServices.saveProcessingLoader - ${getErrorMessage(error)}` + ) + } +} + +const processLoader = async (data: IDocumentStoreLoaderForPreview, docLoaderId: string) => { + try { + const appServer = getRunningExpressApp() + const entity = await appServer.AppDataSource.getRepository(DocumentStore).findOneBy({ + id: data.storeId + }) + if (!entity) { + throw new InternalFlowiseError( + StatusCodes.NOT_FOUND, + `Error: documentStoreServices.processLoader - Document store ${data.storeId} not found` + ) + } + // this method will run async, will have to be moved to a worker thread + await _saveChunksToStorage(data, entity, docLoaderId) + return getDocumentStoreFileChunks(data.storeId as string, docLoaderId) + } catch (error) { + throw new InternalFlowiseError( + StatusCodes.INTERNAL_SERVER_ERROR, + `Error: documentStoreServices.processLoader - ${getErrorMessage(error)}` ) } } @@ -665,100 +712,111 @@ const _saveChunksToStorage = async (data: IDocumentStoreLoaderForPreview, entity const appServer = getRunningExpressApp() //step 1: restore the full paths, if any await _normalizeFilePaths(data, entity) + //step 2: split the file into chunks - previewChunks(data).then(async (response) => { - //step 3: remove all files associated with the loader - const existingLoaders = JSON.parse(entity.loaders) - const loader = existingLoaders.find((ldr: IDocumentStoreLoader) => ldr.id === newLoaderId) - if (data.id) { - const index = existingLoaders.indexOf(loader) - if (index > -1) { - existingLoaders.splice(index, 1) - if (!data.rehydrated) { - if (loader.files) { - loader.files.map(async (file: IDocumentStoreLoaderFile) => { + const response = await previewChunks(data) + + //step 3: remove all files associated with the loader + const existingLoaders = JSON.parse(entity.loaders) + const loader = existingLoaders.find((ldr: IDocumentStoreLoader) => ldr.id === newLoaderId) + if (data.id) { + const index = existingLoaders.indexOf(loader) + if (index > -1) { + existingLoaders.splice(index, 1) + if (!data.rehydrated) { + if (loader.files) { + loader.files.map(async (file: IDocumentStoreLoaderFile) => { + try { await removeSpecificFileFromStorage(DOCUMENT_STORE_BASE_FOLDER, entity.id, file.name) - }) - } + } catch (error) { + console.error(error) + } + }) } } } - //step 4: save new file to storage - let filesWithMetadata = [] - const keys = Object.getOwnPropertyNames(data.loaderConfig) - for (let i = 0; i < keys.length; i++) { - const input = data.loaderConfig[keys[i]] - if (!input) { - continue - } - if (typeof input !== 'string') { - continue - } - if (input.startsWith('[') && input.endsWith(']')) { - const files = JSON.parse(input) - const fileNames: string[] = [] - for (let j = 0; j < files.length; j++) { - const file = files[j] - if (re.test(file)) { - const fileMetadata = await _saveFileToStorage(file, entity) - fileNames.push(fileMetadata.name) - filesWithMetadata.push(fileMetadata) - } + } + + //step 4: save new file to storage + let filesWithMetadata = [] + const keys = Object.getOwnPropertyNames(data.loaderConfig) + for (let i = 0; i < keys.length; i++) { + const input = data.loaderConfig[keys[i]] + if (!input) { + continue + } + if (typeof input !== 'string') { + continue + } + if (input.startsWith('[') && input.endsWith(']')) { + const files = JSON.parse(input) + const fileNames: string[] = [] + for (let j = 0; j < files.length; j++) { + const file = files[j] + if (re.test(file)) { + const fileMetadata = await _saveFileToStorage(file, entity) + fileNames.push(fileMetadata.name) + filesWithMetadata.push(fileMetadata) } - data.loaderConfig[keys[i]] = 'FILE-STORAGE::' + JSON.stringify(fileNames) - } else if (re.test(input)) { - const fileNames: string[] = [] - const fileMetadata = await _saveFileToStorage(input, entity) - fileNames.push(fileMetadata.name) - filesWithMetadata.push(fileMetadata) - data.loaderConfig[keys[i]] = 'FILE-STORAGE::' + JSON.stringify(fileNames) - break } + data.loaderConfig[keys[i]] = 'FILE-STORAGE::' + JSON.stringify(fileNames) + } else if (re.test(input)) { + const fileNames: string[] = [] + const fileMetadata = await _saveFileToStorage(input, entity) + fileNames.push(fileMetadata.name) + filesWithMetadata.push(fileMetadata) + data.loaderConfig[keys[i]] = 'FILE-STORAGE::' + JSON.stringify(fileNames) + break } - //step 5: update with the new files and loaderConfig - if (filesWithMetadata.length > 0) { - loader.loaderConfig = data.loaderConfig - loader.files = filesWithMetadata - } - //step 6: update the loaders with the new loaderConfig - if (data.id) { - existingLoaders.push(loader) - } - //step 7: remove all previous chunks - await appServer.AppDataSource.getRepository(DocumentStoreFileChunk).delete({ docId: newLoaderId }) - if (response.chunks) { - //step 8: now save the new chunks - const totalChars = response.chunks.reduce((acc, chunk) => { - if (chunk.pageContent) { - return acc + chunk.pageContent.length - } - return acc - }, 0) - response.chunks.map(async (chunk: IDocument, index: number) => { - const docChunk: DocumentStoreFileChunk = { - docId: newLoaderId, - storeId: data.storeId || '', - id: uuidv4(), - chunkNo: index + 1, - pageContent: chunk.pageContent, - metadata: JSON.stringify(chunk.metadata) - } - const dChunk = appServer.AppDataSource.getRepository(DocumentStoreFileChunk).create(docChunk) - await appServer.AppDataSource.getRepository(DocumentStoreFileChunk).save(dChunk) - }) - // update the loader with the new metrics - loader.totalChunks = response.totalChunks - loader.totalChars = totalChars - } - loader.status = 'SYNC' - // have a flag and iterate over the loaders and update the entity status to SYNC - const allSynced = existingLoaders.every((ldr: IDocumentStoreLoader) => ldr.status === 'SYNC') - entity.status = allSynced ? DocumentStoreStatus.SYNC : DocumentStoreStatus.STALE - entity.loaders = JSON.stringify(existingLoaders) - //step 9: update the entity in the database - await appServer.AppDataSource.getRepository(DocumentStore).save(entity) - return - }) + } + + //step 5: update with the new files and loaderConfig + if (filesWithMetadata.length > 0) { + loader.loaderConfig = data.loaderConfig + loader.files = filesWithMetadata + } + + //step 6: update the loaders with the new loaderConfig + if (data.id) { + existingLoaders.push(loader) + } + + //step 7: remove all previous chunks + await appServer.AppDataSource.getRepository(DocumentStoreFileChunk).delete({ docId: newLoaderId }) + if (response.chunks) { + //step 8: now save the new chunks + const totalChars = response.chunks.reduce((acc, chunk) => { + if (chunk.pageContent) { + return acc + chunk.pageContent.length + } + return acc + }, 0) + response.chunks.map(async (chunk: IDocument, index: number) => { + const docChunk: DocumentStoreFileChunk = { + docId: newLoaderId, + storeId: data.storeId || '', + id: uuidv4(), + chunkNo: index + 1, + pageContent: chunk.pageContent, + metadata: JSON.stringify(chunk.metadata) + } + const dChunk = appServer.AppDataSource.getRepository(DocumentStoreFileChunk).create(docChunk) + await appServer.AppDataSource.getRepository(DocumentStoreFileChunk).save(dChunk) + }) + // update the loader with the new metrics + loader.totalChunks = response.totalChunks + loader.totalChars = totalChars + } + loader.status = 'SYNC' + // have a flag and iterate over the loaders and update the entity status to SYNC + const allSynced = existingLoaders.every((ldr: IDocumentStoreLoader) => ldr.status === 'SYNC') + entity.status = allSynced ? DocumentStoreStatus.SYNC : DocumentStoreStatus.STALE + entity.loaders = JSON.stringify(existingLoaders) + + //step 9: update the entity in the database + await appServer.AppDataSource.getRepository(DocumentStore).save(entity) + + return } catch (error) { throw new InternalFlowiseError( StatusCodes.INTERNAL_SERVER_ERROR, @@ -960,11 +1018,16 @@ const _insertIntoVectorStoreWorkerThread = async (data: ICommonObject) => { // Get Vector Store Node Data const vStoreNodeData = _createVectorStoreNodeData(appServer, data, embeddingObj, recordManagerObj) + // Prepare docs for upserting + const filterOptions: ICommonObject = { + storeId: data.storeId + } + if (data.docId) { + filterOptions['docId'] = data.docId + } const chunks = await appServer.AppDataSource.getRepository(DocumentStoreFileChunk).find({ - where: { - storeId: data.storeId - } + where: filterOptions }) const docs: Document[] = chunks.map((chunk: DocumentStoreFileChunk) => { return new Document({ @@ -1248,6 +1311,263 @@ const _createVectorStoreObject = async ( return vStoreNodeInstance } +const upsertDocStoreMiddleware = async ( + storeId: string, + data: IDocumentStoreUpsertData, + files: Express.Multer.File[] = [], + isRefreshExisting = false +) => { + const appServer = getRunningExpressApp() + const docId = data.docId + const newLoader = typeof data.loader === 'string' ? JSON.parse(data.loader) : data.loader + const newSplitter = typeof data.splitter === 'string' ? JSON.parse(data.splitter) : data.splitter + const newVectorStore = typeof data.vectorStore === 'string' ? JSON.parse(data.vectorStore) : data.vectorStore + const newEmbedding = typeof data.embedding === 'string' ? JSON.parse(data.embedding) : data.embedding + const newRecordManager = typeof data.recordManager === 'string' ? JSON.parse(data.recordManager) : data.recordManager + + const getComponentLabelFromName = (nodeName: string) => { + const component = Object.values(appServer.nodesPool.componentNodes).find((node) => node.name === nodeName) + return component?.label || '' + } + + let loaderName = '' + let loaderId = '' + let loaderConfig: ICommonObject = {} + + let splitterName = '' + let splitterId = '' + let splitterConfig: ICommonObject = {} + + let vectorStoreName = '' + let vectorStoreConfig: ICommonObject = {} + + let embeddingName = '' + let embeddingConfig: ICommonObject = {} + + let recordManagerName = '' + let recordManagerConfig: ICommonObject = {} + + // Step 1: Get existing loader + if (docId) { + const entity = await appServer.AppDataSource.getRepository(DocumentStore).findOneBy({ id: storeId }) + if (!entity) { + throw new InternalFlowiseError(StatusCodes.NOT_FOUND, `Document store ${storeId} not found`) + } + const loaders = JSON.parse(entity.loaders) + const loader = loaders.find((ldr: IDocumentStoreLoader) => ldr.id === docId) + if (!loader) { + throw new InternalFlowiseError(StatusCodes.NOT_FOUND, `Document loader ${docId} not found`) + } + + // Loader + loaderName = loader.loaderName + loaderId = loader.loaderId + loaderConfig = { + ...loaderConfig, + ...loader?.loaderConfig + } + + // Splitter + splitterName = loader.splitterName + splitterId = loader.splitterId + splitterConfig = { + ...splitterConfig, + ...loader?.splitterConfig + } + + // Vector Store + vectorStoreName = JSON.parse(entity.vectorStoreConfig || '{}')?.name + vectorStoreConfig = JSON.parse(entity.vectorStoreConfig || '{}')?.config + + // Embedding + embeddingName = JSON.parse(entity.embeddingConfig || '{}')?.name + embeddingConfig = JSON.parse(entity.embeddingConfig || '{}')?.config + + // Record Manager + recordManagerName = JSON.parse(entity.recordManagerConfig || '{}')?.name + recordManagerConfig = JSON.parse(entity.recordManagerConfig || '{}')?.config + } + + // Step 2: Replace with new values + loaderName = newLoader?.name ? getComponentLabelFromName(newLoader?.name) : loaderName + loaderId = newLoader?.name || loaderId + loaderConfig = { + ...loaderConfig, + ...newLoader?.config + } + + splitterName = newSplitter?.name ? getComponentLabelFromName(newSplitter?.name) : splitterName + splitterId = newSplitter?.name || splitterId + splitterConfig = { + ...splitterConfig, + ...newSplitter?.config + } + + vectorStoreName = newVectorStore?.name || vectorStoreName + vectorStoreConfig = { + ...vectorStoreConfig, + ...newVectorStore?.config + } + + embeddingName = newEmbedding?.name || embeddingName + embeddingConfig = { + ...embeddingConfig, + ...newEmbedding?.config + } + + recordManagerName = newRecordManager?.name || recordManagerName + recordManagerConfig = { + recordManagerConfig, + ...newRecordManager?.config + } + + // Step 3: Replace with files + if (files.length) { + const filesLoaderConfig: ICommonObject = {} + for (const file of files) { + const fileNames: string[] = [] + const fileBuffer = fs.readFileSync(file.path) + // Address file name with special characters: https://github.com/expressjs/multer/issues/1104 + file.originalname = Buffer.from(file.originalname, 'latin1').toString('utf8') + + try { + await addArrayFilesToStorage(file.mimetype, fileBuffer, file.originalname, fileNames, DOCUMENT_STORE_BASE_FOLDER, storeId) + } catch (error) { + continue + } + + const mimePrefix = 'data:' + file.mimetype + ';base64' + const storagePath = mimePrefix + ',' + fileBuffer.toString('base64') + `,filename:${file.originalname}` + + const fileInputFieldFromMimeType = mapMimeTypeToInputField(file.mimetype) + + const fileExtension = path.extname(file.originalname) + + const fileInputFieldFromExt = mapExtToInputField(fileExtension) + + let fileInputField = 'txtFile' + + if (fileInputFieldFromExt !== 'txtFile') { + fileInputField = fileInputFieldFromExt + } else if (fileInputFieldFromMimeType !== 'txtFile') { + fileInputField = fileInputFieldFromExt + } + + if (loaderId === 'unstructuredFileLoader') { + fileInputField = 'fileObject' + } + + if (filesLoaderConfig[fileInputField]) { + const existingFileInputFieldArray = JSON.parse(filesLoaderConfig[fileInputField]) + const newFileInputFieldArray = [storagePath] + const updatedFieldArray = existingFileInputFieldArray.concat(newFileInputFieldArray) + filesLoaderConfig[fileInputField] = JSON.stringify(updatedFieldArray) + } else { + filesLoaderConfig[fileInputField] = JSON.stringify([storagePath]) + } + + fs.unlinkSync(file.path) + } + + loaderConfig = { + ...loaderConfig, + ...filesLoaderConfig + } + } + + // Step 4: Verification for must have components + if (!loaderName || !loaderId || !loaderConfig) { + throw new InternalFlowiseError(StatusCodes.INTERNAL_SERVER_ERROR, `Loader not configured`) + } + + if (!vectorStoreName || !vectorStoreConfig) { + throw new InternalFlowiseError(StatusCodes.INTERNAL_SERVER_ERROR, `Vector store not configured`) + } + + if (!embeddingName || !embeddingConfig) { + throw new InternalFlowiseError(StatusCodes.INTERNAL_SERVER_ERROR, `Embedding not configured`) + } + + // Step 5: Process & Upsert + const processData: IDocumentStoreLoaderForPreview = { + storeId, + loaderId, + loaderName, + loaderConfig, + splitterId, + splitterName, + splitterConfig + } + + if (isRefreshExisting) { + processData.id = docId + } + + try { + const newLoader = await saveProcessingLoader(processData) + const result = await processLoader(processData, newLoader.id || '') + const newDocId = result.docId + + const insertData = { + storeId, + docId: newDocId, + vectorStoreName, + vectorStoreConfig, + embeddingName, + embeddingConfig, + recordManagerName, + recordManagerConfig + } + + const res = await insertIntoVectorStore(insertData) + res.docId = newDocId + + return res + } catch (error) { + throw new InternalFlowiseError( + StatusCodes.INTERNAL_SERVER_ERROR, + `Error: documentStoreServices.upsertDocStoreMiddleware - ${getErrorMessage(error)}` + ) + } +} + +const refreshDocStoreMiddleware = async (storeId: string, data?: IDocumentStoreRefreshData) => { + const appServer = getRunningExpressApp() + + try { + const results = [] + let totalItems: IDocumentStoreUpsertData[] = [] + + if (!data || !data.items || data.items.length === 0) { + const entity = await appServer.AppDataSource.getRepository(DocumentStore).findOneBy({ id: storeId }) + if (!entity) { + throw new InternalFlowiseError(StatusCodes.NOT_FOUND, `Document store ${storeId} not found`) + } + + const loaders = JSON.parse(entity.loaders) + totalItems = loaders.map((ldr: IDocumentStoreLoader) => { + return { + docId: ldr.id + } + }) + } else { + totalItems = data.items + } + + for (const item of totalItems) { + const res = await upsertDocStoreMiddleware(storeId, item, [], true) + results.push(res) + } + + return results + } catch (error) { + throw new InternalFlowiseError( + StatusCodes.INTERNAL_SERVER_ERROR, + `Error: documentStoreServices.refreshDocStoreMiddleware - ${getErrorMessage(error)}` + ) + } +} + export default { updateDocumentStoreUsage, deleteDocumentStore, @@ -1260,7 +1580,8 @@ export default { getDocumentStoreFileChunks, updateDocumentStore, previewChunks, - processAndSaveChunks, + saveProcessingLoader, + processLoader, deleteDocumentStoreFileChunk, editDocumentStoreFileChunk, getDocumentLoaders, @@ -1271,5 +1592,7 @@ export default { saveVectorStoreConfig, queryVectorStore, deleteVectorStoreFromStore, - updateVectorStoreConfigOnly + updateVectorStoreConfigOnly, + upsertDocStoreMiddleware, + refreshDocStoreMiddleware } diff --git a/packages/server/src/utils/createAttachment.ts b/packages/server/src/utils/createAttachment.ts index 317dd8b75..87723ea08 100644 --- a/packages/server/src/utils/createAttachment.ts +++ b/packages/server/src/utils/createAttachment.ts @@ -39,6 +39,7 @@ export const createFileAttachment = async (req: Request) => { const files = (req.files as Express.Multer.File[]) || [] const fileAttachments = [] if (files.length) { + const isBase64 = req.body.base64 for (const file of files) { const fileBuffer = fs.readFileSync(file.path) const fileNames: string[] = [] @@ -70,13 +71,21 @@ export const createFileAttachment = async (req: Request) => { [fileInputField]: storagePath } } - const documents: IDocument[] = await fileLoaderNodeInstance.init(nodeData, '', options) - const pageContents = documents.map((doc) => doc.pageContent).join('\n') + + let content = '' + + if (isBase64) { + content = fileBuffer.toString('base64') + } else { + const documents: IDocument[] = await fileLoaderNodeInstance.init(nodeData, '', options) + content = documents.map((doc) => doc.pageContent).join('\n') + } + fileAttachments.push({ name: file.originalname, mimeType: file.mimetype, size: file.size, - content: pageContents + content }) } catch (error) { throw new Error(`Failed operation: createFileAttachment - ${getErrorMessage(error)}`) diff --git a/packages/ui/src/api/documentstore.js b/packages/ui/src/api/documentstore.js index a8470cbeb..909511830 100644 --- a/packages/ui/src/api/documentstore.js +++ b/packages/ui/src/api/documentstore.js @@ -14,7 +14,9 @@ const editChunkFromStore = (storeId, loaderId, chunkId, body) => const getFileChunks = (storeId, fileId, pageNo) => client.get(`/document-store/chunks/${storeId}/${fileId}/${pageNo}`) const previewChunks = (body) => client.post('/document-store/loader/preview', body) -const processChunks = (body) => client.post(`/document-store/loader/process`, body) +const processLoader = (body, loaderId) => client.post(`/document-store/loader/process/${loaderId}`, body) +const saveProcessingLoader = (body) => client.post(`/document-store/loader/save`, body) +const refreshLoader = (storeId) => client.post(`/document-store/refresh/${storeId}`) const insertIntoVectorStore = (body) => client.post(`/document-store/vectorstore/insert`, body) const saveVectorStoreConfig = (body) => client.post(`/document-store/vectorstore/save`, body) @@ -33,7 +35,7 @@ export default { getFileChunks, updateDocumentStore, previewChunks, - processChunks, + processLoader, getDocumentLoaders, deleteChunkFromStore, editChunkFromStore, @@ -45,5 +47,7 @@ export default { saveVectorStoreConfig, queryVectorStore, deleteVectorStoreDataFromStore, - updateVectorStoreConfig + updateVectorStoreConfig, + saveProcessingLoader, + refreshLoader } diff --git a/packages/ui/src/routes/MainRoutes.jsx b/packages/ui/src/routes/MainRoutes.jsx index 53afab081..be916f034 100644 --- a/packages/ui/src/routes/MainRoutes.jsx +++ b/packages/ui/src/routes/MainRoutes.jsx @@ -98,6 +98,10 @@ const MainRoutes = { path: '/document-stores/vector/:id', element: }, + { + path: '/document-stores/vector/:id/:docId', + element: + }, { path: '/document-stores/query/:id', element: diff --git a/packages/ui/src/ui-component/cards/DocumentStoreCard.jsx b/packages/ui/src/ui-component/cards/DocumentStoreCard.jsx index da2c2d4ad..8021bbead 100644 --- a/packages/ui/src/ui-component/cards/DocumentStoreCard.jsx +++ b/packages/ui/src/ui-component/cards/DocumentStoreCard.jsx @@ -60,7 +60,8 @@ const DocumentStoreCard = ({ data, images, onClick }) => { WebkitBoxOrient: 'vertical', textOverflow: 'ellipsis', overflow: 'hidden', - flex: 1 + flex: 1, + mr: 1 }} > {data.name} diff --git a/packages/ui/src/views/docstore/DocumentStoreDetail.jsx b/packages/ui/src/views/docstore/DocumentStoreDetail.jsx index a1f1b7821..986bd1e97 100644 --- a/packages/ui/src/views/docstore/DocumentStoreDetail.jsx +++ b/packages/ui/src/views/docstore/DocumentStoreDetail.jsx @@ -35,6 +35,8 @@ import ErrorBoundary from '@/ErrorBoundary' import { StyledButton } from '@/ui-component/button/StyledButton' import ViewHeader from '@/layout/MainLayout/ViewHeader' import DeleteDocStoreDialog from './DeleteDocStoreDialog' +import DocumentStoreStatus from '@/views/docstore/DocumentStoreStatus' +import ConfirmDialog from '@/ui-component/dialog/ConfirmDialog' // API import documentsApi from '@/api/documentstore' @@ -42,22 +44,18 @@ import documentsApi from '@/api/documentstore' // Hooks import useApi from '@/hooks/useApi' import useNotifier from '@/utils/useNotifier' +import { getFileName } from '@/utils/genericHelper' +import useConfirm from '@/hooks/useConfirm' // icons -import { - IconPlus, - IconRefresh, - IconListDetails, - IconTrash, - IconX, - IconVectorBezier2, - IconRowInsertTop, - IconZoomScan -} from '@tabler/icons-react' +import { IconPlus, IconRefresh, IconX, IconVectorBezier2 } from '@tabler/icons-react' import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown' import FileDeleteIcon from '@mui/icons-material/Delete' import FileEditIcon from '@mui/icons-material/Edit' import FileChunksIcon from '@mui/icons-material/AppRegistration' +import NoteAddIcon from '@mui/icons-material/NoteAdd' +import SearchIcon from '@mui/icons-material/Search' +import RefreshIcon from '@mui/icons-material/Refresh' import doc_store_details_emptySVG from '@/assets/images/doc_store_details_empty.svg' // store @@ -127,6 +125,7 @@ const DocumentStoreDetails = () => { const navigate = useNavigate() const dispatch = useDispatch() useNotifier() + const { confirm } = useConfirm() const enqueueSnackbar = (...args) => dispatch(enqueueSnackbarAction(...args)) const closeSnackbar = (...args) => dispatch(closeSnackbarAction(...args)) @@ -144,6 +143,9 @@ const DocumentStoreDetails = () => { const [showDeleteDocStoreDialog, setShowDeleteDocStoreDialog] = useState(false) const [deleteDocStoreDialogProps, setDeleteDocStoreDialogProps] = useState({}) + const [anchorEl, setAnchorEl] = useState(null) + const open = Boolean(anchorEl) + const URLpath = document.location.pathname.toString().split('/') const storeId = URLpath[URLpath.length - 1] === 'document-stores' ? '' : URLpath[URLpath.length - 1] @@ -212,9 +214,10 @@ const DocumentStoreDetails = () => { } catch (error) { setBackdropLoading(false) setError(error) - const errorData = error.response.data || `${error.response.status}: ${error.response.statusText}` enqueueSnackbar({ - message: `Failed to delete loader: ${errorData}`, + message: `Failed to delete Document Store: ${ + typeof error.response.data === 'object' ? error.response.data.message : error.response.data + }`, options: { key: new Date().getTime() + Math.random(), variant: 'error', @@ -249,9 +252,10 @@ const DocumentStoreDetails = () => { } catch (error) { setError(error) setBackdropLoading(false) - const errorData = error.response.data || `${error.response.status}: ${error.response.statusText}` enqueueSnackbar({ - message: `Failed to delete loader: ${errorData}`, + message: `Failed to delete Document Loader: ${ + typeof error.response.data === 'object' ? error.response.data.message : error.response.data + }`, options: { key: new Date().getTime() + Math.random(), variant: 'error', @@ -294,6 +298,55 @@ const DocumentStoreDetails = () => { setShowDeleteDocStoreDialog(true) } + const onStoreRefresh = async (storeId) => { + const confirmPayload = { + title: `Refresh all loaders and upsert all chunks?`, + description: `This will re-process all loaders and upsert all chunks. This action might take some time.`, + confirmButtonName: 'Refresh', + cancelButtonName: 'Cancel' + } + const isConfirmed = await confirm(confirmPayload) + + if (isConfirmed) { + setAnchorEl(null) + setBackdropLoading(true) + try { + const resp = await documentsApi.refreshLoader(storeId) + if (resp.data) { + enqueueSnackbar({ + message: 'Document store refresh successfully!', + options: { + key: new Date().getTime() + Math.random(), + variant: 'success', + action: (key) => ( + + ) + } + }) + } + setBackdropLoading(false) + } catch (error) { + setBackdropLoading(false) + enqueueSnackbar({ + message: `Failed to refresh document store: ${ + typeof error.response.data === 'object' ? error.response.data.message : error.response.data + }`, + options: { + key: new Date().getTime() + Math.random(), + variant: 'error', + action: (key) => ( + + ) + } + }) + } + } + } + const onEditClicked = () => { const data = { name: documentStore.name, @@ -316,6 +369,16 @@ const DocumentStoreDetails = () => { getSpecificDocumentStore.request(storeId) } + const handleClick = (event) => { + event.preventDefault() + event.stopPropagation() + setAnchorEl(event.currentTarget) + } + + const handleClose = () => { + setAnchorEl(null) + } + useEffect(() => { getSpecificDocumentStore.request(storeId) @@ -358,85 +421,86 @@ const DocumentStoreDetails = () => { onBack={() => navigate('/document-stores')} onEdit={() => onEditClicked()} > - onStoreDelete(documentStore.vectorStoreConfig, documentStore.recordManagerConfig)} - size='small' - color='error' - title='Delete Document Store' - sx={{ mr: 2 }} - > - - + {(documentStore?.status === 'STALE' || documentStore?.status === 'UPSERTING') && ( + + + + )} } onClick={listLoaders} > Add Document Loader - {(documentStore?.status === 'STALE' || documentStore?.status === 'UPSERTING') && ( - - )} - {documentStore?.status === 'UPSERTING' && ( - - )} - {documentStore?.totalChunks > 0 && documentStore?.status !== 'UPSERTING' && ( - <> - - - - )} - {documentStore?.totalChunks > 0 && documentStore?.status === 'UPSERTED' && ( - + + showStoredChunks('all')} + disableRipple > + + View & Edit Chunks + + showVectorStore(documentStore.id)} + disableRipple + > + + Upsert All Chunks + + showVectorStoreQuery(documentStore.id)} + disableRipple + > + Retrieval Query - - )} + + onStoreRefresh(documentStore.id)} + disableRipple + title='Re-process all loaders and upsert all chunks' + > + + Refresh + + + onStoreDelete(documentStore.vectorStoreConfig, documentStore.recordManagerConfig)} + disableRipple + > + + Delete + + + {getSpecificDocumentStore.data?.whereUsed?.length > 0 && (
{ documentStore?.recordManagerConfig ) } + onChunkUpsert={() => + navigate(`/document-stores/vector/${documentStore.id}/${loader.id}`) + } /> ))} @@ -630,6 +697,7 @@ const DocumentStoreDetails = () => { /> )} {isBackdropLoading && } + ) } @@ -649,6 +717,9 @@ function LoaderRow(props) { } const formatSources = (source) => { + if (source && typeof source === 'string' && source.includes('base64')) { + return getFileName(source) + } if (source && typeof source === 'string' && source.startsWith('[') && source.endsWith(']')) { return JSON.parse(source).join(', ') } @@ -710,6 +781,10 @@ function LoaderRow(props) { View & Edit Chunks + + + Upsert Chunks + @@ -730,6 +805,7 @@ LoaderRow.propTypes = { theme: PropTypes.any, onViewChunksClick: PropTypes.func, onEditClick: PropTypes.func, - onDeleteClick: PropTypes.func + onDeleteClick: PropTypes.func, + onChunkUpsert: PropTypes.func } export default DocumentStoreDetails diff --git a/packages/ui/src/views/docstore/DocumentStoreStatus.jsx b/packages/ui/src/views/docstore/DocumentStoreStatus.jsx index e295fb76c..544009559 100644 --- a/packages/ui/src/views/docstore/DocumentStoreStatus.jsx +++ b/packages/ui/src/views/docstore/DocumentStoreStatus.jsx @@ -10,22 +10,36 @@ const DocumentStoreStatus = ({ status, isTableView }) => { switch (status) { case 'STALE': return customization.isDarkMode - ? [theme.palette.grey[400], theme.palette.grey[600], theme.palette.grey[700]] + ? [theme.palette.grey[400], theme.palette.grey[600], theme.palette.grey[800]] : [theme.palette.grey[300], theme.palette.grey[500], theme.palette.grey[700]] case 'EMPTY': - return ['#673ab7', '#673ab7', '#673ab7'] + return customization.isDarkMode + ? ['#4a148c', '#6a1b9a', '#ffffff'] // Deep Purple + : ['#d1c4e9', '#9575cd', '#673ab7'] case 'SYNCING': + return customization.isDarkMode + ? ['#ff6f00', '#ff8f00', '#ffffff'] // Amber + : ['#fff8e1', '#ffe57f', '#ffc107'] case 'UPSERTING': - return ['#fff8e1', '#ffe57f', '#ffc107'] + return customization.isDarkMode + ? ['#01579b', '#0277bd', '#ffffff'] // Light Blue + : ['#e1f5fe', '#4fc3f7', '#0288d1'] case 'SYNC': + return customization.isDarkMode + ? ['#1b5e20', '#2e7d32', '#ffffff'] // Green + : ['#e8f5e9', '#81c784', '#43a047'] case 'UPSERTED': - return ['#cdf5d8', '#00e676', '#00c853'] + return customization.isDarkMode + ? ['#004d40', '#00695c', '#ffffff'] // Teal + : ['#e0f2f1', '#4db6ac', '#00897b'] case 'NEW': - return ['#e3f2fd', '#2196f3', '#1e88e5'] + return customization.isDarkMode + ? ['#0d47a1', '#1565c0', '#ffffff'] // Blue + : ['#e3f2fd', '#64b5f6', '#1e88e5'] default: return customization.isDarkMode ? [theme.palette.grey[300], theme.palette.grey[500], theme.palette.grey[700]] - : [theme.palette.grey[300], theme.palette.grey[500], theme.palette.grey[700]] + : [theme.palette.grey[200], theme.palette.grey[400], theme.palette.grey[600]] } } @@ -45,7 +59,8 @@ const DocumentStoreStatus = ({ status, isTableView }) => { paddingTop: '3px', paddingBottom: '3px', paddingLeft: '10px', - paddingRight: '10px' + paddingRight: '10px', + width: 'fit-content' }} >
{ setLoading(true) const config = prepareConfig() try { - const processResp = await documentStoreApi.processChunks(config) + const saveResp = await documentStoreApi.saveProcessingLoader(config) setLoading(false) - if (processResp.data) { + if (saveResp.data) { enqueueSnackbar({ message: 'File submitted for processing. Redirecting to Document Store..', options: { @@ -201,6 +201,8 @@ const LoaderConfigPreviewChunks = () => { ) } }) + // don't wait for the process to complete, redirect to document store + documentStoreApi.processLoader(config, saveResp.data?.id) navigate('/document-stores/' + storeId) } } catch (error) { diff --git a/packages/ui/src/views/docstore/ShowStoredChunks.jsx b/packages/ui/src/views/docstore/ShowStoredChunks.jsx index ebefe36eb..5a2db316b 100644 --- a/packages/ui/src/views/docstore/ShowStoredChunks.jsx +++ b/packages/ui/src/views/docstore/ShowStoredChunks.jsx @@ -310,7 +310,7 @@ const ShowStoredChunks = () => {
- {getChunksApi.data?.file?.totalChars?.toLocaleString()} characters + {getChunksApi.data?.characters?.toLocaleString()} characters
diff --git a/packages/ui/src/views/docstore/VectorStoreConfigure.jsx b/packages/ui/src/views/docstore/VectorStoreConfigure.jsx index edecb8441..52d4e0e6f 100644 --- a/packages/ui/src/views/docstore/VectorStoreConfigure.jsx +++ b/packages/ui/src/views/docstore/VectorStoreConfigure.jsx @@ -50,6 +50,10 @@ const VectorStoreConfigure = () => { useNotifier() const customization = useSelector((state) => state.customization) + const pathSegments = document.location.pathname.toString().split('/') + const storeId = pathSegments[3] || null + const docId = pathSegments[4] || null + const enqueueSnackbar = (...args) => dispatch(enqueueSnackbarAction(...args)) const closeSnackbar = (...args) => dispatch(closeSnackbarAction(...args)) @@ -213,7 +217,8 @@ const VectorStoreConfigure = () => { const prepareConfigData = () => { const data = { - storeId: storeId + storeId: storeId, + docId: docId } // Set embedding config if (selectedEmbeddingsProvider.inputs) { @@ -365,8 +370,6 @@ const VectorStoreConfigure = () => { // eslint-disable-next-line react-hooks/exhaustive-deps }, [saveVectorStoreConfigApi.error]) - const URLpath = document.location.pathname.toString().split('/') - const storeId = URLpath[URLpath.length - 1] === 'document-stores' ? '' : URLpath[URLpath.length - 1] useEffect(() => { getSpecificDocumentStoreApi.request(storeId) @@ -432,420 +435,426 @@ const VectorStoreConfigure = () => { {error ? ( ) : ( - - navigate(-1)} - > - {(Object.keys(selectedEmbeddingsProvider).length > 0 || - Object.keys(selectedVectorStoreProvider).length > 0) && ( - - )} - {(Object.keys(selectedEmbeddingsProvider).length > 0 || - Object.keys(selectedVectorStoreProvider).length > 0) && ( - - )} - {Object.keys(selectedEmbeddingsProvider).length > 0 && Object.keys(selectedVectorStoreProvider).length > 0 && ( - - )} - - - - - - - - {Object.keys(selectedEmbeddingsProvider).length === 0 ? ( - - ) : ( - - - -
- + {(Object.keys(selectedEmbeddingsProvider).length > 0 || + Object.keys(selectedVectorStoreProvider).length > 0) && ( + + )} + {(Object.keys(selectedEmbeddingsProvider).length > 0 || + Object.keys(selectedVectorStoreProvider).length > 0) && ( + + )} + {Object.keys(selectedEmbeddingsProvider).length > 0 && + Object.keys(selectedVectorStoreProvider).length > 0 && ( + + )} + + + + + + + + {Object.keys(selectedEmbeddingsProvider).length === 0 ? ( + + ) : ( + + +
- {selectedEmbeddingsProvider.label ? ( - +
- ) : ( - - )} -
- - {selectedEmbeddingsProvider.label} - -
-
- {Object.keys(selectedEmbeddingsProvider).length > 0 && ( - <> - - - - - )} -
- - {selectedEmbeddingsProvider && - Object.keys(selectedEmbeddingsProvider).length > 0 && - (selectedEmbeddingsProvider.inputParams ?? []) - .filter((inputParam) => !inputParam.hidden) - .map((inputParam, index) => ( - - ))} -
-
-
-
- )} -
- - {Object.keys(selectedVectorStoreProvider).length === 0 ? ( - - ) : ( - - - -
- -
- {selectedVectorStoreProvider.label ? ( - + {selectedEmbeddingsProvider.label ? ( + {selectedEmbeddingsProvider.label + ) : ( + + )} +
+ + {selectedEmbeddingsProvider.label} + +
+
- ) : ( - - )} + > + {Object.keys(selectedEmbeddingsProvider).length > 0 && ( + <> + + + + + )} +
+
+ {selectedEmbeddingsProvider && + Object.keys(selectedEmbeddingsProvider).length > 0 && + (selectedEmbeddingsProvider.inputParams ?? []) + .filter((inputParam) => !inputParam.hidden) + .map((inputParam, index) => ( + + ))}
- - {selectedVectorStoreProvider.label} - -
+
+
+
+ )} +
+ + {Object.keys(selectedVectorStoreProvider).length === 0 ? ( + + ) : ( + + +
- {Object.keys(selectedVectorStoreProvider).length > 0 && ( - <> - - - - - )} -
-
- {selectedVectorStoreProvider && - Object.keys(selectedVectorStoreProvider).length > 0 && - (selectedVectorStoreProvider.inputParams ?? []) - .filter((inputParam) => !inputParam.hidden) - .map((inputParam, index) => ( - - ))} -
-
-
-
- )} -
- - {Object.keys(selectedRecordManagerProvider).length === 0 ? ( - - ) : ( - - - -
- -
- {selectedRecordManagerProvider.label ? ( - +
- ) : ( - - )} + > + {selectedVectorStoreProvider.label ? ( + {selectedVectorStoreProvider.label + ) : ( + + )} +
+ + {selectedVectorStoreProvider.label} + +
+
+ {Object.keys(selectedVectorStoreProvider).length > 0 && ( + <> + + + + + )} +
+ + {selectedVectorStoreProvider && + Object.keys(selectedVectorStoreProvider).length > 0 && + (selectedVectorStoreProvider.inputParams ?? []) + .filter((inputParam) => !inputParam.hidden) + .map((inputParam, index) => ( + + ))}
- - {selectedRecordManagerProvider.label} - -
+ + +
+ )} + + + {Object.keys(selectedRecordManagerProvider).length === 0 ? ( + + ) : ( + + +
- {Object.keys(selectedRecordManagerProvider).length > 0 && ( - <> - - - - - )} + +
+ {selectedRecordManagerProvider.label ? ( + {selectedRecordManagerProvider.label + ) : ( + + )} +
+ + {selectedRecordManagerProvider.label} + +
+
+ {Object.keys(selectedRecordManagerProvider).length > 0 && ( + <> + + + + + )} +
+
+ {selectedRecordManagerProvider && + Object.keys(selectedRecordManagerProvider).length > 0 && + (selectedRecordManagerProvider.inputParams ?? []) + .filter((inputParam) => !inputParam.hidden) + .map((inputParam, index) => ( + <> + + + ))}
-
- {selectedRecordManagerProvider && - Object.keys(selectedRecordManagerProvider).length > 0 && - (selectedRecordManagerProvider.inputParams ?? []) - .filter((inputParam) => !inputParam.hidden) - .map((inputParam, index) => ( - <> - - - ))} -
-
-
-
- )} -
-
-
+ + + + )} + + +
+ )} + )}