Flowise/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts

import { TextSplitter } from 'langchain/text_splitter'
import { Document, DocumentInterface } from '@langchain/core/documents'
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
import { INode, INodeData, INodeParams, ICommonObject, INodeOutputsValue } from '../../../src/Interface'
import { getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils'
import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios'
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'
// FirecrawlApp interfaces
interface FirecrawlAppConfig {
    apiKey?: string | null
    apiUrl?: string | null
}

interface FirecrawlDocumentMetadata {
    title?: string
    description?: string
    language?: string
    // ... (other metadata fields)
    [key: string]: any
}

interface FirecrawlDocument {
    id?: string
    url?: string
    content: string
    markdown?: string
    html?: string
    llm_extraction?: Record<string, any>
    createdAt?: Date
    updatedAt?: Date
    type?: string
    metadata: FirecrawlDocumentMetadata
    childrenLinks?: string[]
    provider?: string
    warning?: string
    index?: number
}

interface ScrapeResponse {
    success: boolean
    data?: FirecrawlDocument
    error?: string
}

interface CrawlResponse {
    success: boolean
    jobId?: string
    data?: FirecrawlDocument[]
    error?: string
}

interface Params {
    [key: string]: any
    extractorOptions?: {
        extractionSchema: z.ZodSchema | any
        mode?: 'llm-extraction'
        extractionPrompt?: string
    }
}
// FirecrawlApp class (not exported)
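/**
 * Minimal client for the Firecrawl v0 REST API, covering only the scrape and
 * crawl endpoints this loader needs.
 *
 * Usage sketch (illustrative only; the URL, API key, and options below are
 * placeholders, not values used by this module):
 *
 * @example
 * const app = new FirecrawlApp({ apiKey: 'fc-...' })
 * const scraped = await app.scrapeUrl('https://example.com', { pageOptions: { onlyMainContent: true } })
 * const crawled = await app.crawlUrl('https://example.com', { crawlerOptions: { limit: 10 } }, true, 2)
 */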
class FirecrawlApp {
    private apiKey: string
    private apiUrl: string

    constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
        this.apiKey = apiKey || ''
        this.apiUrl = apiUrl || 'https://api.firecrawl.dev'
        if (!this.apiKey) {
            throw new Error('No API key provided')
        }
    }
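
    /**
     * Scrape a single URL via POST /v0/scrape. If an extraction schema is
     * supplied and is a Zod schema, it is converted to JSON Schema before
     * being sent under extractorOptions (defaulting to 'llm-extraction' mode).
     */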
    async scrapeUrl(url: string, params: Params | null = null): Promise<ScrapeResponse> {
        const headers = this.prepareHeaders()
        let jsonData: Params = { url, ...params }
        if (params?.extractorOptions?.extractionSchema) {
            let schema = params.extractorOptions.extractionSchema
            if (schema instanceof z.ZodSchema) {
                schema = zodToJsonSchema(schema)
            }
            jsonData = {
                ...jsonData,
                extractorOptions: {
                    ...params.extractorOptions,
                    extractionSchema: schema,
                    mode: params.extractorOptions.mode || 'llm-extraction'
                }
            }
        }
        try {
            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v0/scrape', jsonData, headers)
            if (response.status === 200) {
                const responseData = response.data
                if (responseData.success) {
                    return responseData
                } else {
                    throw new Error(`Failed to scrape URL. Error: ${responseData.error}`)
                }
            } else {
                this.handleError(response, 'scrape URL')
            }
        } catch (error: any) {
            throw new Error(error.message)
        }
        return { success: false, error: 'Internal server error.' }
    }
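
    /**
     * Start a crawl job via POST /v0/crawl. With waitUntilDone (the default),
     * polls the job until completion and resolves with the crawled documents;
     * otherwise resolves immediately with { success, jobId }.
     */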
    async crawlUrl(
        url: string,
        params: Params | null = null,
        waitUntilDone: boolean = true,
        pollInterval: number = 2,
        idempotencyKey?: string
    ): Promise<CrawlResponse | any> {
        const headers = this.prepareHeaders(idempotencyKey)
        const jsonData: Params = { url, ...params }
        try {
            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v0/crawl', jsonData, headers)
            if (response.status === 200) {
                const jobId: string = response.data.jobId
                if (waitUntilDone) {
                    return this.monitorJobStatus(jobId, headers, pollInterval)
                } else {
                    return { success: true, jobId }
                }
            } else {
                this.handleError(response, 'start crawl job')
            }
        } catch (error: any) {
            throw new Error(error.message)
        }
        return { success: false, error: 'Internal server error.' }
    }
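
    /** Build JSON request headers with bearer auth and an optional idempotency key. */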
    private prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
        return {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${this.apiKey}`,
            ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {})
        } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }
    }

    private postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
        return axios.post(url, data, { headers })
    }

    private getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
        return axios.get(url, { headers })
    }
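
    /**
     * Poll GET /v0/crawl/status/:jobId until the job completes or fails.
     * The poll interval is clamped to a minimum of 2 seconds.
     */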
    private async monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise<any> {
        let isJobCompleted = false
        while (!isJobCompleted) {
            const statusResponse: AxiosResponse = await this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers)
            if (statusResponse.status === 200) {
                const statusData = statusResponse.data
                switch (statusData.status) {
                    case 'completed':
                        isJobCompleted = true
                        if ('data' in statusData) {
                            return statusData.data
                        } else {
                            throw new Error('Crawl job completed but no data was returned')
                        }
                    case 'active':
                    case 'paused':
                    case 'pending':
                    case 'queued':
                        await new Promise((resolve) => setTimeout(resolve, Math.max(checkInterval, 2) * 1000))
                        break
                    default:
                        throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`)
                }
            } else {
                this.handleError(statusResponse, 'check crawl status')
            }
        }
    }
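
    /** Surface known Firecrawl error statuses (402/408/409/500) with the API's error message. */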
    private handleError(response: AxiosResponse, action: string): void {
        if ([402, 408, 409, 500].includes(response.status)) {
            const errorMessage: string = response.data.error || 'Unknown error occurred'
            throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`)
        } else {
            throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`)
        }
    }
}
// FireCrawl Loader
interface FirecrawlLoaderParameters {
    url: string
    apiKey?: string
    apiUrl?: string
    mode?: 'crawl' | 'scrape'
    params?: Record<string, unknown>
}
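/**
 * LangChain document loader backed by FirecrawlApp. In 'scrape' mode it
 * fetches a single page; in 'crawl' mode it crawls the URL and its accessible
 * subpages. Each result's markdown becomes a Document's pageContent.
 *
 * Usage sketch (illustrative only; the URL and API key are placeholders):
 *
 * @example
 * const loader = new FireCrawlLoader({ url: 'https://example.com', apiKey: 'fc-...', mode: 'scrape' })
 * const docs = await loader.load()
 */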
class FireCrawlLoader extends BaseDocumentLoader {
    private apiKey: string
    private apiUrl: string
    private url: string
    private mode: 'crawl' | 'scrape'
    private params?: Record<string, unknown>

    constructor(loaderParams: FirecrawlLoaderParameters) {
        super()
        const { apiKey, apiUrl, url, mode = 'crawl', params } = loaderParams
        if (!apiKey) {
            throw new Error('Firecrawl API key not set. You can set it as FIRECRAWL_API_KEY in your .env file, or pass it to Firecrawl.')
        }
        this.apiKey = apiKey
        this.url = url
        this.mode = mode
        this.params = params
        this.apiUrl = apiUrl || 'https://api.firecrawl.dev'
    }

    public async load(): Promise<DocumentInterface[]> {
        const app = new FirecrawlApp({ apiKey: this.apiKey, apiUrl: this.apiUrl })
        let firecrawlDocs: FirecrawlDocument[]
        if (this.mode === 'scrape') {
            const response = await app.scrapeUrl(this.url, this.params)
            if (!response.success) {
                throw new Error(`Firecrawl: Failed to scrape URL. Error: ${response.error}`)
            }
            firecrawlDocs = [response.data as FirecrawlDocument]
        } else if (this.mode === 'crawl') {
            const response = await app.crawlUrl(this.url, this.params, true)
            firecrawlDocs = response as FirecrawlDocument[]
        } else {
            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`)
        }
        return firecrawlDocs.map(
            (doc) =>
                new Document({
                    pageContent: doc.markdown || '',
                    metadata: doc.metadata || {}
                })
        )
    }
}
// Flowise Node Class
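/**
 * Flowise node definition: exposes FireCrawlLoader as a 'Document Loaders'
 * node with a FireCrawl API credential and document/text outputs.
 */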
class FireCrawl_DocumentLoaders implements INode {
    label: string
    name: string
    description: string
    type: string
    icon: string
    version: number
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
    credential: INodeParams
    outputs: INodeOutputsValue[]

    constructor() {
        this.label = 'FireCrawl'
        this.name = 'fireCrawl'
        this.type = 'Document'
        this.icon = 'firecrawl.png'
        this.version = 2.1
        this.category = 'Document Loaders'
        this.description = 'Load data from URL using FireCrawl'
        this.baseClasses = [this.type]
        this.credential = {
            label: 'FireCrawl API',
            name: 'credential',
            type: 'credential',
            credentialNames: ['fireCrawlApi']
        }
        this.inputs = [
            {
                label: 'Text Splitter',
                name: 'textSplitter',
                type: 'TextSplitter',
                optional: true
            },
            {
                label: 'URLs',
                name: 'url',
                type: 'string',
                description: 'URL to be crawled/scraped',
                placeholder: 'https://docs.flowiseai.com'
            },
            {
                label: 'Crawler type',
                type: 'options',
                name: 'crawlerType',
                options: [
                    {
                        label: 'Crawl',
                        name: 'crawl',
                        description: 'Crawl a URL and all accessible subpages'
                    },
                    {
                        label: 'Scrape',
                        name: 'scrape',
                        description: 'Scrape a URL and get its content'
                    }
                ],
                default: 'crawl'
            },
            {
                // maxCrawlPages
                label: 'Max Crawl Pages',
                name: 'maxCrawlPages',
                type: 'string',
                description: 'Maximum number of pages to crawl',
                optional: true,
                additionalParams: true
            },
            {
                // generateImgAltText
                label: 'Generate Image Alt Text',
                name: 'generateImgAltText',
                type: 'boolean',
                description: 'Generate alt text for images',
                optional: true,
                additionalParams: true
            },
            {
                // returnOnlyUrls
                label: 'Return Only URLs',
                name: 'returnOnlyUrls',
                type: 'boolean',
                description: 'Return only URLs of the crawled pages',
                optional: true,
                additionalParams: true
            },
            {
                // onlyMainContent
                label: 'Only Main Content',
                name: 'onlyMainContent',
                type: 'boolean',
                description: 'Extract only the main content of the page',
                optional: true,
                additionalParams: true
            }
            // ... (other input parameters)
        ]
        this.outputs = [
            {
                label: 'Document',
                name: 'document',
                description: 'Array of document objects containing metadata and pageContent',
                baseClasses: [...this.baseClasses, 'json']
            },
            {
                label: 'Text',
                name: 'text',
                description: 'Concatenated string from pageContent of documents',
                baseClasses: ['string', 'json']
            }
        ]
    }
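
    /**
     * Resolve node inputs and credentials into FirecrawlLoaderParameters, run
     * the loader (optionally through a text splitter), merge any user-supplied
     * metadata into each document, and return either the documents or their
     * concatenated text depending on the selected output.
     */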
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
        const metadata = nodeData.inputs?.metadata
        const url = nodeData.inputs?.url as string
        const crawlerType = nodeData.inputs?.crawlerType as string
        const maxCrawlPages = nodeData.inputs?.maxCrawlPages as string
        const generateImgAltText = nodeData.inputs?.generateImgAltText as boolean
        const returnOnlyUrls = nodeData.inputs?.returnOnlyUrls as boolean
        const onlyMainContent = nodeData.inputs?.onlyMainContent as boolean
        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const firecrawlApiToken = getCredentialParam('firecrawlApiToken', credentialData, nodeData)
        const firecrawlApiUrl = getCredentialParam('firecrawlApiUrl', credentialData, nodeData, 'https://api.firecrawl.dev')
        const output = nodeData.outputs?.output as string
        const urlPatternsExcludes = nodeData.inputs?.urlPatternsExcludes
            ? (nodeData.inputs.urlPatternsExcludes.split(',') as string[])
            : undefined
        const urlPatternsIncludes = nodeData.inputs?.urlPatternsIncludes
            ? (nodeData.inputs.urlPatternsIncludes.split(',') as string[])
            : undefined

        const input: FirecrawlLoaderParameters = {
            url,
            mode: crawlerType as 'crawl' | 'scrape',
            apiKey: firecrawlApiToken,
            apiUrl: firecrawlApiUrl,
            params: {
                crawlerOptions: {
                    includes: urlPatternsIncludes,
                    excludes: urlPatternsExcludes,
                    generateImgAltText,
                    returnOnlyUrls,
                    limit: maxCrawlPages ? parseFloat(maxCrawlPages) : undefined
                },
                pageOptions: {
                    onlyMainContent
                }
            }
        }

        const loader = new FireCrawlLoader(input)
        let docs = []
        if (textSplitter) {
            docs = await loader.loadAndSplit(textSplitter)
        } else {
            docs = await loader.load()
        }

        if (metadata) {
            const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
            const finaldocs = []
            for (const doc of docs) {
                const newdoc = {
                    ...doc,
                    metadata: {
                        ...doc.metadata,
                        ...parsedMetadata
                    }
                }
                finaldocs.push(newdoc)
            }
            return finaldocs
        }

        if (output === 'document') {
            return docs
        } else {
            let finaltext = ''
            for (const doc of docs) {
                finaltext += `${doc.pageContent}\n`
            }
            return handleEscapeCharacters(finaltext, false)
        }
    }
}

module.exports = { nodeClass: FireCrawl_DocumentLoaders }