import { TextSplitter } from 'langchain/text_splitter'
import { Document, DocumentInterface } from '@langchain/core/documents'
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
import { INode, INodeData, INodeParams, ICommonObject, INodeOutputsValue } from '../../../src/Interface'
import { getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils'
import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios'
import { z } from 'zod'

// FirecrawlApp interfaces
interface FirecrawlAppConfig {
    apiKey?: string | null
    apiUrl?: string | null
}

interface FirecrawlDocumentMetadata {
    title?: string
    description?: string
    language?: string
    sourceURL?: string
    statusCode?: number
    error?: string
    [key: string]: any
}

interface FirecrawlDocument {
    markdown?: string
    html?: string
    rawHtml?: string
    screenshot?: string
    links?: string[]
    actions?: {
        screenshots?: string[]
    }
    metadata: FirecrawlDocumentMetadata
    llm_extraction?: Record<string, any>
    warning?: string
}

interface ScrapeResponse {
    success: boolean
    data?: FirecrawlDocument
    error?: string
}

interface CrawlResponse {
    success: boolean
    id: string
    url: string
    error?: string
    data?: FirecrawlDocument
}

interface CrawlStatusResponse {
    status: string
    total: number
    completed: number
    creditsUsed: number
    expiresAt: string
    next?: string
    data?: FirecrawlDocument[]
}

interface ExtractResponse {
    success: boolean
    id: string
    url: string
    data?: Record<string, any>
}

interface SearchResult {
    url: string
    title: string
    description: string
}

interface SearchResponse {
    success: boolean
    data?: SearchResult[]
    warning?: string
}

interface SearchRequest {
    query: string
    limit?: number
    tbs?: string
    lang?: string
    country?: string
    location?: string
    timeout?: number
    ignoreInvalidURLs?: boolean
}

interface Params {
    [key: string]: any
    extractorOptions?: {
        extractionSchema: z.ZodSchema | any
        mode?: 'llm-extraction'
        extractionPrompt?: string
    }
}

interface ExtractRequest {
    urls: string[]
    prompt?: string
    schema?: Record<string, any>
    enableWebSearch?: boolean
    ignoreSitemap?: boolean
    includeSubdomains?: boolean
    showSources?: boolean
    scrapeOptions?: {
        formats?: string[]
        onlyMainContent?: boolean
        includeTags?: string | string[]
        excludeTags?: string | string[]
        mobile?: boolean
        skipTlsVerification?: boolean
        timeout?: number
        jsonOptions?: {
            schema?: Record<string, any>
            prompt?: string
        }
    }
}
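
// Illustrative only: a minimal ExtractRequest payload built against the interface above. The URL,
// prompt and schema values are placeholders (not defaults used anywhere in this module); only the
// field names are taken from the interface.
//
// const exampleExtractRequest: ExtractRequest = {
//     urls: ['https://docs.flowiseai.com'],
//     prompt: 'Extract the product name and a one-sentence summary',
//     schema: {
//         type: 'object',
//         properties: {
//             name: { type: 'string' },
//             summary: { type: 'string' }
//         }
//     },
//     scrapeOptions: { formats: ['markdown'], onlyMainContent: true }
// }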

interface ExtractStatusResponse {
    success: boolean
    data: any
    status: 'completed' | 'pending' | 'processing' | 'failed' | 'cancelled'
    expiresAt: string
}

// FirecrawlApp class (not exported)
class FirecrawlApp {
    private apiKey: string
    private apiUrl: string

    constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
        this.apiKey = apiKey || ''
        this.apiUrl = apiUrl || 'https://api.firecrawl.dev'
        if (!this.apiKey) {
            throw new Error('No API key provided')
        }
    }

    async scrapeUrl(url: string, params: Params | null = null): Promise<ScrapeResponse> {
        const headers = this.prepareHeaders()

        // Create a clean payload with only valid parameters
        const validParams: any = {
            url,
            formats: ['markdown'],
            onlyMainContent: true
        }

        // Add optional parameters if they exist
        if (params?.scrapeOptions) {
            if (params.scrapeOptions.includeTags) {
                validParams.includeTags = Array.isArray(params.scrapeOptions.includeTags)
                    ? params.scrapeOptions.includeTags
                    : params.scrapeOptions.includeTags.split(',')
            }
            if (params.scrapeOptions.excludeTags) {
                validParams.excludeTags = Array.isArray(params.scrapeOptions.excludeTags)
                    ? params.scrapeOptions.excludeTags
                    : params.scrapeOptions.excludeTags.split(',')
            }
            if (params.scrapeOptions.mobile !== undefined) {
                validParams.mobile = params.scrapeOptions.mobile
            }
            if (params.scrapeOptions.skipTlsVerification !== undefined) {
                validParams.skipTlsVerification = params.scrapeOptions.skipTlsVerification
            }
            if (params.scrapeOptions.timeout) {
                validParams.timeout = params.scrapeOptions.timeout
            }
        }

        // Add JSON options if they exist
        if (params?.extractorOptions) {
            validParams.jsonOptions = {
                schema: params.extractorOptions.extractionSchema,
                prompt: params.extractorOptions.extractionPrompt
            }
        }

        try {
            const parameters = {
                ...validParams,
                integration: 'flowise'
            }
            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/scrape', parameters, headers)
            if (response.status === 200) {
                const responseData = response.data
                if (responseData.success) {
                    return responseData
                } else {
                    throw new Error(`Failed to scrape URL. Error: ${responseData.error}`)
                }
            } else {
                this.handleError(response, 'scrape URL')
            }
        } catch (error: any) {
            throw new Error(error.message)
        }
        return { success: false, error: 'Internal server error.' }
    }

    async crawlUrl(
        url: string,
        params: Params | null = null,
        waitUntilDone: boolean = true,
        pollInterval: number = 2,
        idempotencyKey?: string
    ): Promise<CrawlResponse | CrawlStatusResponse> {
        const headers = this.prepareHeaders(idempotencyKey)

        // Create a clean payload with only valid parameters
        const validParams: any = {
            url
        }

        // Add scrape options with only non-empty values
        const scrapeOptions: any = {
            formats: ['markdown'],
            onlyMainContent: true
        }

        // Add crawl-specific parameters if they exist and are not empty
        if (params) {
            const validCrawlParams = [
                'excludePaths',
                'includePaths',
                'maxDepth',
                'maxDiscoveryDepth',
                'ignoreSitemap',
                'ignoreQueryParameters',
                'limit',
                'allowBackwardLinks',
                'allowExternalLinks',
                'delay'
            ]

            validCrawlParams.forEach((param) => {
                if (params[param] !== undefined && params[param] !== null && params[param] !== '') {
                    validParams[param] = params[param]
                }
            })
        }

        // Lift crawl paths/limit nested under scrapeOptions to the top level, and copy scrape options, if they exist and are not empty
        if (params?.scrapeOptions) {
            if (params.scrapeOptions.includePaths) {
                const includePaths = Array.isArray(params.scrapeOptions.includePaths)
                    ? params.scrapeOptions.includePaths
                    : params.scrapeOptions.includePaths.split(',')
                if (includePaths.length > 0) {
                    validParams.includePaths = includePaths
                }
            }

            if (params.scrapeOptions.excludePaths) {
                const excludePaths = Array.isArray(params.scrapeOptions.excludePaths)
                    ? params.scrapeOptions.excludePaths
                    : params.scrapeOptions.excludePaths.split(',')
                if (excludePaths.length > 0) {
                    validParams.excludePaths = excludePaths
                }
            }

            if (params.scrapeOptions.limit) {
                validParams.limit = params.scrapeOptions.limit
            }

            const validScrapeParams = ['mobile', 'skipTlsVerification', 'timeout', 'includeTags', 'excludeTags', 'onlyMainContent']

            validScrapeParams.forEach((param) => {
                if (params.scrapeOptions[param] !== undefined && params.scrapeOptions[param] !== null) {
                    scrapeOptions[param] = params.scrapeOptions[param]
                }
            })
        }

        // Only add scrapeOptions if it has more than just the default values
        if (Object.keys(scrapeOptions).length > 2) {
            validParams.scrapeOptions = scrapeOptions
        }

        try {
            const parameters = {
                ...validParams,
                integration: 'flowise'
            }
            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/crawl', parameters, headers)
            if (response.status === 200) {
                const crawlResponse = response.data as CrawlResponse
                if (!crawlResponse.success) {
                    throw new Error(`Crawl request failed: ${crawlResponse.error || 'Unknown error'}`)
                }

                if (waitUntilDone) {
                    return this.monitorJobStatus(crawlResponse.id, headers, pollInterval)
                } else {
                    return crawlResponse
                }
            } else {
                this.handleError(response, 'start crawl job')
            }
        } catch (error: any) {
            if (error.response?.data?.error) {
                throw new Error(`Crawl failed: ${error.response.data.error}`)
            }
            throw new Error(`Crawl failed: ${error.message}`)
        }

        return { success: false, id: '', url: '' }
    }

    async extract(
        request: ExtractRequest,
        waitUntilDone: boolean = true,
        pollInterval: number = 2
    ): Promise<ExtractResponse | ExtractStatusResponse> {
        const headers = this.prepareHeaders()

        // Create a clean payload with only valid parameters
        const validParams: any = {
            urls: request.urls
        }

        // Add optional parameters if they exist and are not empty
        if (request.prompt) {
            validParams.prompt = request.prompt
        }

        if (request.schema) {
            validParams.schema = request.schema
        }

        const validExtractParams = ['enableWebSearch', 'ignoreSitemap', 'includeSubdomains', 'showSources'] as const

        validExtractParams.forEach((param) => {
            if (request[param] !== undefined && request[param] !== null) {
                validParams[param] = request[param]
            }
        })

        // Add scrape options if they exist
        if (request.scrapeOptions) {
            const scrapeOptions: any = {
                formats: ['markdown'],
                onlyMainContent: true
            }

            // Handle includeTags
            if (request.scrapeOptions.includeTags) {
                const includeTags = Array.isArray(request.scrapeOptions.includeTags)
                    ? request.scrapeOptions.includeTags
                    : request.scrapeOptions.includeTags.split(',')
                if (includeTags.length > 0) {
                    scrapeOptions.includeTags = includeTags
                }
            }

            // Handle excludeTags
            if (request.scrapeOptions.excludeTags) {
                const excludeTags = Array.isArray(request.scrapeOptions.excludeTags)
                    ? request.scrapeOptions.excludeTags
                    : request.scrapeOptions.excludeTags.split(',')
                if (excludeTags.length > 0) {
                    scrapeOptions.excludeTags = excludeTags
                }
            }

            // Add other scrape options if they exist and are not empty
            const validScrapeParams = ['mobile', 'skipTlsVerification', 'timeout'] as const

            validScrapeParams.forEach((param) => {
                if (request.scrapeOptions?.[param] !== undefined && request.scrapeOptions?.[param] !== null) {
                    scrapeOptions[param] = request.scrapeOptions[param]
                }
            })

            // Add JSON options if they exist
            if (request.scrapeOptions.jsonOptions) {
                scrapeOptions.jsonOptions = {}
                if (request.scrapeOptions.jsonOptions.schema) {
                    scrapeOptions.jsonOptions.schema = request.scrapeOptions.jsonOptions.schema
                }
                if (request.scrapeOptions.jsonOptions.prompt) {
                    scrapeOptions.jsonOptions.prompt = request.scrapeOptions.jsonOptions.prompt
                }
            }

            // Only add scrapeOptions if it has more than just the default values
            if (Object.keys(scrapeOptions).length > 2) {
                validParams.scrapeOptions = scrapeOptions
            }
        }

        try {
            const parameters = {
                ...validParams,
                integration: 'flowise'
            }
            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/extract', parameters, headers)
            if (response.status === 200) {
                const extractResponse = response.data as ExtractResponse
                if (waitUntilDone) {
                    return this.monitorExtractStatus(extractResponse.id, headers, pollInterval)
                } else {
                    return extractResponse
                }
            } else {
                this.handleError(response, 'start extract job')
            }
        } catch (error: any) {
            throw new Error(error.message)
        }
        return { success: false, id: '', url: '' }
    }

    async search(request: SearchRequest): Promise<SearchResponse> {
        const headers = this.prepareHeaders()

        // Create a clean payload with only valid parameters
        const validParams: any = {
            query: request.query
        }

        // Add optional parameters if they exist and are not empty
        const validSearchParams = ['limit', 'tbs', 'lang', 'country', 'location', 'timeout', 'ignoreInvalidURLs'] as const

        validSearchParams.forEach((param) => {
            if (request[param] !== undefined && request[param] !== null) {
                validParams[param] = request[param]
            }
        })

        try {
            const parameters = {
                ...validParams,
                integration: 'flowise'
            }
            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/search', parameters, headers)
            if (response.status === 200) {
                const searchResponse = response.data as SearchResponse
                if (!searchResponse.success) {
                    throw new Error(`Search request failed: ${searchResponse.warning || 'Unknown error'}`)
                }
                return searchResponse
            } else {
                this.handleError(response, 'perform search')
            }
        } catch (error: any) {
            throw new Error(error.message)
        }
        return { success: false }
    }

    private prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
        return {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${this.apiKey}`,
            ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {})
        } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }
    }

    private async postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
        const result = await axios.post(url, data, { headers })
        return result
    }

    private getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
        return axios.get(url, { headers })
    }

    // Poll the crawl status endpoint until the job completes or fails
    private async monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise<CrawlStatusResponse> {
        let isJobCompleted = false
        while (!isJobCompleted) {
            const statusResponse: AxiosResponse = await this.getRequest(this.apiUrl + `/v1/crawl/${jobId}`, headers)
            if (statusResponse.status === 200) {
                const statusData = statusResponse.data as CrawlStatusResponse
                switch (statusData.status) {
                    case 'completed':
                        isJobCompleted = true
                        return statusData
                    case 'scraping':
                    case 'failed':
                        if (statusData.status === 'failed') {
                            throw new Error('Crawl job failed')
                        }
                        await new Promise((resolve) => setTimeout(resolve, Math.max(checkInterval, 2) * 1000))
                        break
                    default:
                        throw new Error(`Unknown crawl status: ${statusData.status}`)
                }
            } else {
                this.handleError(statusResponse, 'check crawl status')
            }
        }
        throw new Error('Failed to monitor job status')
    }

    // Poll the extract status endpoint until the job completes or fails
    private async monitorExtractStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise<ExtractStatusResponse> {
        let isJobCompleted = false
        while (!isJobCompleted) {
            const statusResponse: AxiosResponse = await this.getRequest(this.apiUrl + `/v1/extract/${jobId}`, headers)
            if (statusResponse.status === 200) {
                const statusData = statusResponse.data as ExtractStatusResponse
                switch (statusData.status) {
                    case 'completed':
                        isJobCompleted = true
                        return statusData
                    // 'pending' is a declared status in ExtractStatusResponse; treat it like 'processing' and keep polling
                    case 'pending':
                    case 'processing':
                    case 'failed':
                        if (statusData.status === 'failed') {
                            throw new Error('Extract job failed')
                        }
                        await new Promise((resolve) => setTimeout(resolve, Math.max(checkInterval, 2) * 1000))
                        break
                    default:
                        throw new Error(`Unknown extract status: ${statusData.status}`)
                }
            } else {
                this.handleError(statusResponse, 'check extract status')
            }
        }
        throw new Error('Failed to monitor extract status')
    }

    private handleError(response: AxiosResponse, action: string): void {
        if ([402, 408, 409, 500].includes(response.status)) {
            const errorMessage: string = response.data.error || 'Unknown error occurred'
            throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`)
        } else {
            throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`)
        }
    }
}
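
// Illustrative only: how FireCrawlLoader (below) drives this client. The API key is a placeholder,
// and the crawl params are a minimal example of the Params shape accepted above; nothing here is a
// documented default of the Firecrawl API.
//
// const app = new FirecrawlApp({ apiKey: 'fc-...', apiUrl: 'https://api.firecrawl.dev' })
// const scraped = await app.scrapeUrl('https://docs.flowiseai.com')                // single page as markdown
// const crawled = await app.crawlUrl('https://docs.flowiseai.com', { limit: 10 })  // polls until the job finishes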

// FireCrawl Loader
interface FirecrawlLoaderParameters {
    url?: string
    query?: string
    apiKey?: string
    apiUrl?: string
    mode?: 'crawl' | 'scrape' | 'extract' | 'search'
    params?: Record<string, unknown>
}

export class FireCrawlLoader extends BaseDocumentLoader {
    private apiKey: string
    private apiUrl: string
    private url?: string
    private query?: string
    private mode: 'crawl' | 'scrape' | 'extract' | 'search'
    private params?: Record<string, unknown>

    constructor(loaderParams: FirecrawlLoaderParameters) {
        super()
        const { apiKey, apiUrl, url, query, mode = 'crawl', params } = loaderParams
        if (!apiKey) {
            throw new Error('Firecrawl API key not set. You can set it as FIRECRAWL_API_KEY in your .env file, or pass it to Firecrawl.')
        }

        this.apiKey = apiKey
        this.url = url
        this.query = query
        this.mode = mode
        this.params = params
        this.apiUrl = apiUrl || 'https://api.firecrawl.dev'
    }

    public async load(): Promise<DocumentInterface[]> {
        const app = new FirecrawlApp({ apiKey: this.apiKey, apiUrl: this.apiUrl })
        let firecrawlDocs: FirecrawlDocument[]

        if (this.mode === 'search') {
            if (!this.query) {
                throw new Error('Firecrawl: Query is required for search mode')
            }
            const response = await app.search({ query: this.query, ...this.params })
            if (!response.success) {
                throw new Error(`Firecrawl: Failed to search. Warning: ${response.warning}`)
            }

            // Convert search results to FirecrawlDocument format
            firecrawlDocs = (response.data || []).map((result) => ({
                markdown: result.description,
                metadata: {
                    title: result.title,
                    sourceURL: result.url,
                    description: result.description
                }
            }))
        } else if (this.mode === 'scrape') {
            if (!this.url) {
                throw new Error('Firecrawl: URL is required for scrape mode')
            }
            const response = await app.scrapeUrl(this.url, this.params)
            if (!response.success) {
                throw new Error(`Firecrawl: Failed to scrape URL. Error: ${response.error}`)
            }
            firecrawlDocs = [response.data as FirecrawlDocument]
        } else if (this.mode === 'crawl') {
            if (!this.url) {
                throw new Error('Firecrawl: URL is required for crawl mode')
            }
            const response = await app.crawlUrl(this.url, this.params)
            if ('status' in response) {
                if (response.status === 'failed') {
                    throw new Error('Firecrawl: Crawl job failed')
                }
                firecrawlDocs = response.data || []
            } else {
                if (!response.success) {
                    throw new Error(`Firecrawl: Failed to crawl URL. Error: ${response.error}`)
                }
                firecrawlDocs = [response.data as FirecrawlDocument]
            }
        } else if (this.mode === 'extract') {
            if (!this.url) {
                throw new Error('Firecrawl: URL is required for extract mode')
            }
            if (!this.params) {
                this.params = {}
            }
            this.params.urls = [this.url]
            const response = await app.extract(this.params as any as ExtractRequest)
            if (!response.success) {
                throw new Error(`Firecrawl: Failed to extract URL.`)
            }

            // Convert extract response to document format
            if ('data' in response && response.data) {
                // Create a document from the extracted data
                const extractedData = response.data
                const content = JSON.stringify(extractedData, null, 2)

                const metadata: Record<string, any> = {
                    source: this.url,
                    type: 'extracted_data'
                }

                // Add status, data and expiresAt if they exist in the response
                if ('status' in response) {
                    metadata.status = response.status
                }
                if ('data' in response) {
                    metadata.data = response.data
                }
                if ('expiresAt' in response) {
                    metadata.expiresAt = response.expiresAt
                }

                return [
                    new Document({
                        pageContent: content,
                        metadata
                    })
                ]
            }
            return []
        } else {
            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape', 'extract', 'search'.`)
        }

        // Convert Firecrawl documents to LangChain documents
        const documents = firecrawlDocs.map((doc) => {
            // Use markdown content if available, otherwise fall back to HTML or an empty string
            const content = doc.markdown || doc.html || doc.rawHtml || ''

            // Create a standard LangChain document
            return new Document({
                pageContent: content,
                metadata: {
                    ...doc.metadata,
                    source: doc.metadata?.sourceURL || this.url,
                    title: doc.metadata?.title,
                    description: doc.metadata?.description,
                    language: doc.metadata?.language,
                    statusCode: doc.metadata?.statusCode
                }
            })
        })

        return documents
    }
}
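
// Illustrative only: standalone use of FireCrawlLoader outside the Flowise node, assuming the
// FIRECRAWL_API_KEY environment variable mentioned in the constructor error message is set. The
// node class below builds an equivalent configuration from node inputs.
//
// const loader = new FireCrawlLoader({
//     apiKey: process.env.FIRECRAWL_API_KEY,
//     url: 'https://docs.flowiseai.com',
//     mode: 'crawl',
//     params: { scrapeOptions: { limit: 25, onlyMainContent: true } }
// })
// const docs = await loader.load() // LangChain Documents with Firecrawl metadata attached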

// Flowise Node Class
class FireCrawl_DocumentLoaders implements INode {
    label: string
    name: string
    description: string
    type: string
    icon: string
    version: number
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
    credential: INodeParams
    outputs: INodeOutputsValue[]

    constructor() {
        this.label = 'FireCrawl'
        this.name = 'fireCrawl'
        this.type = 'Document'
        this.icon = 'firecrawl.png'
        this.version = 4.0
        this.category = 'Document Loaders'
        this.description = 'Load data from URL using FireCrawl'
        this.baseClasses = [this.type]
        this.credential = {
            label: 'FireCrawl API',
            name: 'credential',
            type: 'credential',
            credentialNames: ['fireCrawlApi']
        }
        this.inputs = [
            {
                label: 'Text Splitter',
                name: 'textSplitter',
                type: 'TextSplitter',
                optional: true
            },
            {
                label: 'Type',
                type: 'options',
                name: 'crawlerType',
                options: [
                    {
                        label: 'Crawl',
                        name: 'crawl',
                        description: 'Crawl a URL and all accessible subpages'
                    },
                    {
                        label: 'Scrape',
                        name: 'scrape',
                        description: 'Scrape a URL and get its content'
                    },
                    {
                        label: 'Extract',
                        name: 'extract',
                        description: 'Extract data from a URL'
                    },
                    {
                        label: 'Search',
                        name: 'search',
                        description: 'Search the web using FireCrawl'
                    }
                ],
                default: 'crawl'
            },
            {
                label: 'URLs',
                name: 'url',
                type: 'string',
                description: 'URL to be crawled/scraped/extracted',
                placeholder: 'https://docs.flowiseai.com',
                optional: true,
                show: {
                    crawlerType: ['crawl', 'scrape', 'extract']
                }
            },
            {
                // includeTags
                label: 'Include Tags',
                name: 'includeTags',
                type: 'string',
                description: 'Tags to include in the output. Use a comma to separate multiple tags.',
                optional: true,
                additionalParams: true,
                show: {
                    crawlerType: ['scrape']
                }
            },
            {
                // excludeTags
                label: 'Exclude Tags',
                name: 'excludeTags',
                type: 'string',
                description: 'Tags to exclude from the output. Use a comma to separate multiple tags.',
                optional: true,
                additionalParams: true,
                show: {
                    crawlerType: ['scrape']
                }
            },
            {
                // onlyMainContent
                label: 'Only Main Content',
                name: 'onlyMainContent',
                type: 'boolean',
                description: 'Extract only the main content of the page',
                optional: true,
                additionalParams: true,
                show: {
                    crawlerType: ['scrape']
                }
            },
            {
                // limit
                label: 'Limit',
                name: 'limit',
                type: 'string',
                description: 'Maximum number of pages to crawl',
                optional: true,
                additionalParams: true,
                default: '10000',
                show: {
                    crawlerType: ['crawl']
                }
            },
            {
                label: 'Include Paths',
                name: 'includePaths',
                type: 'string',
                description:
                    'URL pathname regex patterns that include matching URLs in the crawl. Only the paths that match the specified patterns will be included in the response.',
                placeholder: `blog/.*, news/.*`,
                optional: true,
                additionalParams: true,
                show: {
                    crawlerType: ['crawl']
                }
            },
            {
                label: 'Exclude Paths',
                name: 'excludePaths',
                type: 'string',
                description: 'URL pathname regex patterns that exclude matching URLs from the crawl.',
                placeholder: `blog/.*, news/.*`,
                optional: true,
                additionalParams: true,
                show: {
                    crawlerType: ['crawl']
                }
            },
            {
                label: 'Schema',
                name: 'extractSchema',
                type: 'json',
                description: 'JSON schema for data extraction',
                optional: true,
                additionalParams: true,
                show: {
                    crawlerType: ['extract']
                }
            },
            {
                label: 'Prompt',
                name: 'extractPrompt',
                type: 'string',
                description: 'Prompt for data extraction',
                optional: true,
                additionalParams: true,
                show: {
                    crawlerType: ['extract']
                }
            },
            {
                label: 'Query',
                name: 'searchQuery',
                type: 'string',
                description: 'Search query to find relevant content',
                optional: true,
                show: {
                    crawlerType: ['search']
                }
            },
            {
                label: 'Limit',
                name: 'searchLimit',
                type: 'string',
                description: 'Maximum number of results to return',
                optional: true,
                additionalParams: true,
                default: '5',
                show: {
                    crawlerType: ['search']
                }
            },
            {
                label: 'Language',
                name: 'searchLang',
                type: 'string',
                description: 'Language code for search results (e.g., en, es, fr)',
                optional: true,
                additionalParams: true,
                default: 'en',
                show: {
                    crawlerType: ['search']
                }
            },
            {
                label: 'Country',
                name: 'searchCountry',
                type: 'string',
                description: 'Country code for search results (e.g., us, uk, ca)',
                optional: true,
                additionalParams: true,
                default: 'us',
                show: {
                    crawlerType: ['search']
                }
            },
            {
                label: 'Timeout',
                name: 'searchTimeout',
                type: 'number',
                description: 'Timeout in milliseconds for the search operation',
                optional: true,
                additionalParams: true,
                default: 60000,
                show: {
                    crawlerType: ['search']
                }
            }
        ]
        this.outputs = [
            {
                label: 'Document',
                name: 'document',
                description: 'Array of document objects containing metadata and pageContent',
                baseClasses: [...this.baseClasses, 'json']
            },
            {
                label: 'Text',
                name: 'text',
                description: 'Concatenated string from pageContent of documents',
                baseClasses: ['string', 'json']
            }
        ]
    }

    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
        const metadata = nodeData.inputs?.metadata
        const url = nodeData.inputs?.url as string
        const crawlerType = nodeData.inputs?.crawlerType as string
        const limit = nodeData.inputs?.limit as string
        const onlyMainContent = nodeData.inputs?.onlyMainContent as boolean
        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const firecrawlApiToken = getCredentialParam('firecrawlApiToken', credentialData, nodeData)
        const firecrawlApiUrl = getCredentialParam('firecrawlApiUrl', credentialData, nodeData, 'https://api.firecrawl.dev')
        const output = nodeData.outputs?.output as string

        // Validate URL only for non-search modes
        if (crawlerType !== 'search' && !url) {
            throw new Error('Firecrawl: URL is required for ' + crawlerType + ' mode')
        }

        const includePaths = nodeData.inputs?.includePaths ? (nodeData.inputs.includePaths.split(',') as string[]) : undefined
        const excludePaths = nodeData.inputs?.excludePaths ? (nodeData.inputs.excludePaths.split(',') as string[]) : undefined

        const includeTags = nodeData.inputs?.includeTags ? (nodeData.inputs.includeTags.split(',') as string[]) : undefined
        const excludeTags = nodeData.inputs?.excludeTags ? (nodeData.inputs.excludeTags.split(',') as string[]) : undefined

        const extractSchema = nodeData.inputs?.extractSchema
        const extractPrompt = nodeData.inputs?.extractPrompt as string

        const searchQuery = nodeData.inputs?.searchQuery as string
        const searchLimit = nodeData.inputs?.searchLimit as string
        const searchLang = nodeData.inputs?.searchLang as string
        const searchCountry = nodeData.inputs?.searchCountry as string
        const searchTimeout = nodeData.inputs?.searchTimeout as number

        const input: FirecrawlLoaderParameters = {
            url,
            query: searchQuery,
            mode: crawlerType as 'crawl' | 'scrape' | 'extract' | 'search',
            apiKey: firecrawlApiToken,
            apiUrl: firecrawlApiUrl,
            params: {
                scrapeOptions: {
                    includePaths,
                    excludePaths,
                    limit: limit ? parseInt(limit, 10) : 1000,
                    includeTags,
                    excludeTags
                },
                schema: extractSchema || undefined,
                prompt: extractPrompt || undefined
            }
        }

        // Add search-specific parameters only when in search mode
        if (crawlerType === 'search') {
            if (!searchQuery) {
                throw new Error('Firecrawl: Search query is required for search mode')
            }
            input.params = {
                limit: searchLimit ? parseInt(searchLimit, 10) : 5,
                lang: searchLang,
                country: searchCountry,
                timeout: searchTimeout
            }
        }

        if (onlyMainContent === true) {
            const scrapeOptions = input.params?.scrapeOptions as any
            input.params!.scrapeOptions = {
                ...scrapeOptions,
                onlyMainContent: true
            }
        }

        const loader = new FireCrawlLoader(input)
        let docs = []

        // Load documents
        docs = await loader.load()

        // Apply text splitting if configured
        if (textSplitter && docs.length > 0) {
            docs = await textSplitter.splitDocuments(docs)
        }

        // Apply metadata if provided
        if (metadata) {
            const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
            docs = docs.map((doc) => ({
                ...doc,
                metadata: {
                    ...doc.metadata,
                    ...parsedMetadata
                }
            }))
        }

        // Return based on output type
        if (output === 'document') {
            return docs
        } else {
            let finaltext = ''
            for (const doc of docs) {
                finaltext += `${doc.pageContent}\n`
            }
            return handleEscapeCharacters(finaltext, false)
        }
    }
}

module.exports = { nodeClass: FireCrawl_DocumentLoaders }

// FOR TESTING PURPOSES
// export { FireCrawl_DocumentLoaders }