Feature/Add ability to create new doc store on upsert (#3965)

add ability to create new doc store on upsert, update firecrawl properties
This commit is contained in:
Henry Heng 2025-02-01 14:28:50 +00:00 committed by GitHub
parent a49177f7fb
commit 20a797d2e0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 122 additions and 3 deletions

View File

@ -679,6 +679,11 @@ paths:
type: string
format: binary
description: Files to be uploaded
docId:
type: string
nullable: true
example: '603a7b51-ae7c-4b0a-8865-e454ed2f6766'
description: Document ID within the store. If provided, the existing configuration from that document is used for the upsert
loader:
type: string
nullable: true
@ -704,6 +709,32 @@ paths:
nullable: true
example: '{"name":"postgresRecordManager"}'
description: Record Manager configurations
metadata:
type: object
nullable: true
description: Metadata associated with the document
example: { 'foo': 'bar' }
replaceExisting:
type: boolean
nullable: true
description: Whether to replace the existing document loader with the newly upserted chunks. Note that this does not delete the existing embeddings in the vector store
createNewDocStore:
type: boolean
nullable: true
description: Whether to create a new document store
docStore:
type: object
nullable: true
description: Only when createNewDocStore is true, pass in the new document store configuration
properties:
name:
type: string
example: plainText
description: Name of the new document store to be created
description:
type: string
example: plainText
description: Description of the new document store to be created
required:
- files
required: true
@ -2350,16 +2381,37 @@ components:
docId:
type: string
format: uuid
nullable: true
description: Document ID within the store. If provided, existing configuration from the document will be used for the new document
metadata:
type: object
nullable: true
description: Metadata associated with the document
example: { 'foo': 'bar' }
replaceExisting:
type: boolean
nullable: true
description: Whether to replace the existing document loader with the newly upserted chunks. Note that this does not delete the existing embeddings in the vector store
createNewDocStore:
type: boolean
nullable: true
description: Whether to create a new document store
docStore:
type: object
nullable: true
description: Only when createNewDocStore is true, pass in the new document store configuration
properties:
name:
type: string
example: plainText
description: Name of the new document store to be created
description:
type: string
example: plainText
description: Description of the new document store to be created
loader:
type: object
nullable: true
properties:
name:
type: string
@ -2370,6 +2422,7 @@ components:
description: Configuration for the loader
splitter:
type: object
nullable: true
properties:
name:
type: string
@ -2380,6 +2433,7 @@ components:
description: Configuration for the text splitter
embedding:
type: object
nullable: true
properties:
name:
type: string
@ -2390,6 +2444,7 @@ components:
description: Configuration for the embedding generator
vectorStore:
type: object
nullable: true
properties:
name:
type: string
@ -2400,6 +2455,7 @@ components:
description: Configuration for the vector store
recordManager:
type: object
nullable: true
properties:
name:
type: string

View File

@ -266,7 +266,7 @@ class FireCrawl_DocumentLoaders implements INode {
this.name = 'fireCrawl'
this.type = 'Document'
this.icon = 'firecrawl.png'
this.version = 2.0
this.version = 2.1
this.category = 'Document Loaders'
this.description = 'Load data from URL using FireCrawl'
this.baseClasses = [this.type]
@ -307,6 +307,42 @@ class FireCrawl_DocumentLoaders implements INode {
}
],
default: 'crawl'
},
{
// maxCrawlPages
label: 'Max Crawl Pages',
name: 'maxCrawlPages',
type: 'string',
description: 'Maximum number of pages to crawl',
optional: true,
additionalParams: true
},
{
// generateImgAltText
label: 'Generate Image Alt Text',
name: 'generateImgAltText',
type: 'boolean',
description: 'Generate alt text for images',
optional: true,
additionalParams: true
},
{
// returnOnlyUrls
label: 'Return Only URLs',
name: 'returnOnlyUrls',
type: 'boolean',
description: 'Return only URLs of the crawled pages',
optional: true,
additionalParams: true
},
{
// onlyMainContent
label: 'Only Main Content',
name: 'onlyMainContent',
type: 'boolean',
description: 'Extract only the main content of the page',
optional: true,
additionalParams: true
}
// ... (other input parameters)
]

View File

@ -76,6 +76,8 @@ export interface IDocumentStoreUpsertData {
docId: string
metadata?: string | object
replaceExisting?: boolean
createNewDocStore?: boolean
docStore?: IDocumentStore
loader?: {
name: string
config: ICommonObject

View File

@ -32,7 +32,8 @@ import {
INodeData,
MODE,
IOverrideConfig,
IExecutePreviewLoader
IExecutePreviewLoader,
DocumentStoreDTO
} from '../../Interface'
import { DocumentStoreFileChunk } from '../../database/entities/DocumentStoreFileChunk'
import { v4 as uuidv4 } from 'uuid'
@ -1464,6 +1465,7 @@ const upsertDocStore = async (
}
}
const replaceExisting = data.replaceExisting ?? false
const createNewDocStore = data.createNewDocStore ?? false
const newLoader = typeof data.loader === 'string' ? JSON.parse(data.loader) : data.loader
const newSplitter = typeof data.splitter === 'string' ? JSON.parse(data.splitter) : data.splitter
const newVectorStore = typeof data.vectorStore === 'string' ? JSON.parse(data.vectorStore) : data.vectorStore
@ -1533,6 +1535,15 @@ const upsertDocStore = async (
recordManagerConfig = JSON.parse(entity.recordManagerConfig || '{}')?.config
}
if (createNewDocStore) {
const docStoreBody = typeof data.docStore === 'string' ? JSON.parse(data.docStore) : data.docStore
const newDocumentStore = docStoreBody ?? { name: `Document Store ${Date.now().toString()}` }
const docStore = DocumentStoreDTO.toEntity(newDocumentStore)
const documentStore = appDataSource.getRepository(DocumentStore).create(docStore)
const dbResponse = await appDataSource.getRepository(DocumentStore).save(documentStore)
storeId = dbResponse.id
}
// Step 2: Replace with new values
loaderName = newLoader?.name ? getComponentLabelFromName(newLoader?.name) : loaderName
loaderId = newLoader?.name || loaderId
@ -1687,6 +1698,7 @@ const upsertDocStore = async (
isVectorStoreInsert: true
})
res.docId = newDocId
if (createNewDocStore) res.storeId = storeId
return res
} catch (error) {

View File

@ -41,11 +41,13 @@ body_data = {
"docId": "${dialogProps.loaderId}",
"metadata": {}, # Add additional metadata to the document chunks
"replaceExisting": True, # Replace existing document with the new upserted chunks
"createNewDocStore": False, # Create a new document store
"splitter": json.dumps({"config":{"chunkSize":20000}}) # Override existing configuration
# "loader": "",
# "vectorStore": "",
# "embedding": "",
# "recordManager": "",
# "docStore": ""
}
headers = {
@ -71,11 +73,14 @@ formData.append("splitter", JSON.stringify({"config":{"chunkSize":20000}}));
formData.append("metadata", "{}");
// Replace existing document with the new upserted chunks
formData.append("replaceExisting", "true");
// Create a new document store
formData.append("createNewDocStore", "false");
// Override existing configuration
// formData.append("loader", "");
// formData.append("embedding", "");
// formData.append("vectorStore", "");
// formData.append("recordManager", "");
// formData.append("docStore", "");
async function query(formData) {
const response = await fetch(
@ -105,11 +110,13 @@ curl -X POST http://localhost:3000/api/v1/document-store/upsert/${dialogProps.st
-F 'splitter={"config":{"chunkSize":20000}}' \\
-F "metadata={}" \\
-F "replaceExisting=true" \\
-F "createNewDocStore=false" \\
# Override existing configuration:
# -F "loader=" \\
# -F "embedding=" \\
# -F "vectorStore=" \\
# -F "recordManager="
# -F "recordManager=" \\
# -F "docStore="
\`\`\`
`
}
@ -135,6 +142,7 @@ output = query({
"docId": "${dialogProps.loaderId}",
"metadata": "{}", # Add additional metadata to the document chunks
"replaceExisting": True, # Replace existing document with the new upserted chunks
"createNewDocStore": False, # Create a new document store
# Override existing configuration
"loader": {
"config": {
@ -149,6 +157,7 @@ output = query({
# embedding: {},
# vectorStore: {},
# recordManager: {},
# docStore: {}
})
print(output)
\`\`\`
@ -174,6 +183,7 @@ query({
"docId": "${dialogProps.loaderId}",
"metadata": "{}", // Add additional metadata to the document chunks
"replaceExisting": true, // Replace existing document with the new upserted chunks
"createNewDocStore": false, // Create a new document store
// Override existing configuration
"loader": {
"config": {
@ -188,6 +198,7 @@ query({
// embedding: {},
// vectorStore: {},
// recordManager: {},
// docStore: {}
}).then((response) => {
console.log(response);
});
@ -201,6 +212,7 @@ curl -X POST http://localhost:3000/api/v1/document-store/upsert/${dialogProps.st
"docId": "${dialogProps.loaderId}",
"metadata": "{}",
"replaceExisting": true,
"createNewDocStore": false,
"loader": {
"config": {
"text": "This is a new text"
@ -215,6 +227,7 @@ curl -X POST http://localhost:3000/api/v1/document-store/upsert/${dialogProps.st
// "embedding": {},
// "vectorStore": {},
// "recordManager": {}
// "docStore": {}
}'
\`\`\`