From 656f6cad819e8cf84b5764e4a6a153466f4fae67 Mon Sep 17 00:00:00 2001
From: William Espegren <131612909+WilliamEspegren@users.noreply.github.com>
Date: Tue, 2 Jul 2024 01:00:52 +0200
Subject: [PATCH] Feature/Spider (open-source web scraper & crawler) (#2738)

* Add Spider Scraper & Crawler

* fix pnpm lint

* chore: Update metadata to be correct format

* fix pnpm lint
---
 .../credentials/SpiderApi.credential.ts       |  25 +++
 .../nodes/documentloaders/Spider/Spider.ts    | 175 ++++++++++++++++++
 .../nodes/documentloaders/Spider/SpiderApp.ts | 116 ++++++++++++
 .../nodes/documentloaders/Spider/spider.svg   |   1 +
 4 files changed, 317 insertions(+)
 create mode 100644 packages/components/credentials/SpiderApi.credential.ts
 create mode 100644 packages/components/nodes/documentloaders/Spider/Spider.ts
 create mode 100644 packages/components/nodes/documentloaders/Spider/SpiderApp.ts
 create mode 100644 packages/components/nodes/documentloaders/Spider/spider.svg

diff --git a/packages/components/credentials/SpiderApi.credential.ts b/packages/components/credentials/SpiderApi.credential.ts
new file mode 100644
index 000000000..4586161dc
--- /dev/null
+++ b/packages/components/credentials/SpiderApi.credential.ts
@@ -0,0 +1,25 @@
+import { INodeParams, INodeCredential } from '../src/Interface'
+
+class SpiderApiCredential implements INodeCredential {
+    label: string
+    name: string
+    version: number
+    description: string
+    inputs: INodeParams[]
+
+    constructor() {
+        this.label = 'Spider API'
+        this.name = 'spiderApi'
+        this.version = 1.0
+        this.description = 'Get your API key from the Spider dashboard.'
+        this.inputs = [
+            {
+                label: 'Spider API Key',
+                name: 'spiderApiKey',
+                type: 'password'
+            }
+        ]
+    }
+}
+
+module.exports = { credClass: SpiderApiCredential }
diff --git a/packages/components/nodes/documentloaders/Spider/Spider.ts b/packages/components/nodes/documentloaders/Spider/Spider.ts
new file mode 100644
index 000000000..5cea4c638
--- /dev/null
+++ b/packages/components/nodes/documentloaders/Spider/Spider.ts
@@ -0,0 +1,175 @@
+import { TextSplitter } from 'langchain/text_splitter'
+import { Document, DocumentInterface } from '@langchain/core/documents'
+import { BaseDocumentLoader } from 'langchain/document_loaders/base'
+import { INode, INodeData, INodeParams, ICommonObject } from '../../../src/Interface'
+import { getCredentialData, getCredentialParam } from '../../../src/utils'
+import SpiderApp from './SpiderApp'
+
+interface SpiderLoaderParameters {
+    url: string
+    apiKey?: string
+    mode?: 'crawl' | 'scrape'
+    params?: Record<string, unknown>
+}
+
+class SpiderLoader extends BaseDocumentLoader {
+    private apiKey: string
+    private url: string
+    private mode: 'crawl' | 'scrape'
+    private params?: Record<string, unknown>
+
+    constructor(loaderParams: SpiderLoaderParameters) {
+        super()
+        const { apiKey, url, mode = 'crawl', params } = loaderParams
+        if (!apiKey) {
+            throw new Error('Spider API key not set. You can set it as SPIDER_API_KEY in your .env file, or pass it to Spider.')
+        }
+
+        this.apiKey = apiKey
+        this.url = url
+        this.mode = mode
+        this.params = params
+    }
+
+    public async load(): Promise<DocumentInterface[]> {
+        const app = new SpiderApp({ apiKey: this.apiKey })
+        let spiderDocs: any[]
+
+        if (this.mode === 'scrape') {
+            const response = await app.scrapeUrl(this.url, this.params)
+            if (!response.success) {
+                throw new Error(`Spider: Failed to scrape URL. Error: ${response.error}`)
+            }
+            spiderDocs = [response.data]
+        } else if (this.mode === 'crawl') {
+            const response = await app.crawlUrl(this.url, this.params)
+            if (!response.success) {
+                throw new Error(`Spider: Failed to crawl URL. Error: ${response.error}`)
+            }
+            spiderDocs = response.data
+        } else {
+            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`)
+        }
+
+        return spiderDocs.map(
+            (doc) =>
+                new Document({
+                    pageContent: doc.content || '',
+                    metadata: { source: doc.url }
+                })
+        )
+    }
+}
+
+class Spider_DocumentLoaders implements INode {
+    label: string
+    name: string
+    description: string
+    type: string
+    icon: string
+    version: number
+    category: string
+    baseClasses: string[]
+    inputs: INodeParams[]
+    credential: INodeParams
+
+    constructor() {
+        this.label = 'Spider Document Loaders'
+        this.name = 'spiderDocumentLoaders'
+        this.version = 1.0
+        this.type = 'Document'
+        this.icon = 'spider.svg'
+        this.category = 'Document Loaders'
+        this.description = 'Scrape & Crawl the web with Spider'
+        this.baseClasses = [this.type]
+        this.inputs = [
+            {
+                label: 'Text Splitter',
+                name: 'textSplitter',
+                type: 'TextSplitter',
+                optional: true
+            },
+            {
+                label: 'Mode',
+                name: 'mode',
+                type: 'options',
+                options: [
+                    {
+                        label: 'Scrape',
+                        name: 'scrape',
+                        description: 'Scrape a single page'
+                    },
+                    {
+                        label: 'Crawl',
+                        name: 'crawl',
+                        description: 'Crawl a website and extract pages within the same domain'
+                    }
+                ],
+                default: 'scrape'
+            },
+            {
+                label: 'Web Page URL',
+                name: 'url',
+                type: 'string',
+                placeholder: 'https://spider.cloud'
+            },
+            {
+                label: 'Additional Parameters',
+                name: 'params',
+                description:
+                    'Find all the available parameters in the Spider API documentation',
+                additionalParams: true,
+                placeholder: '{ "anti_bot": true }',
+                type: 'json',
+                optional: true
+            }
+        ]
+        this.credential = {
+            label: 'Credential',
+            name: 'credential',
+            type: 'credential',
+            credentialNames: ['spiderApi']
+        }
+    }
+
+    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
+        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
+        const url = nodeData.inputs?.url as string
+        const mode = nodeData.inputs?.mode as 'crawl' | 'scrape'
+        let params = nodeData.inputs?.params || {}
+        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
+        const spiderApiKey = getCredentialParam('spiderApiKey', credentialData, nodeData)
+
+        if (typeof params === 'string') {
+            try {
+                params = JSON.parse(params)
+            } catch (e) {
+                throw new Error('Invalid JSON string provided for params')
+            }
+        }
+
+        // Ensure return_format is set to markdown
+        params.return_format = 'markdown'
+
+        const input: SpiderLoaderParameters = {
+            url,
+            mode: mode as 'crawl' | 'scrape',
+            apiKey: spiderApiKey,
+            params: params as Record<string, unknown>
+        }
+
+        const loader = new SpiderLoader(input)
+
+        let docs = []
+
+        if (textSplitter) {
+            docs = await loader.loadAndSplit(textSplitter)
+        } else {
+            docs = await loader.load()
+        }
+
+        return docs
+    }
+}
+
+module.exports = { nodeClass: Spider_DocumentLoaders }
diff --git a/packages/components/nodes/documentloaders/Spider/SpiderApp.ts b/packages/components/nodes/documentloaders/Spider/SpiderApp.ts
new file mode 100644
index 000000000..e2bc1d5bf
--- /dev/null
+++ b/packages/components/nodes/documentloaders/Spider/SpiderApp.ts
@@ -0,0 +1,116 @@
+import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios'
+
+interface SpiderAppConfig {
+    apiKey?: string | null
+    apiUrl?: string | null
+}
+
+interface SpiderDocumentMetadata {
+    title?: string
+    description?: string
+    language?: string
+    [key: string]: any
+}
+
+interface SpiderDocument {
+    id?: string
+    url?: string
+    content: string
+    markdown?: string
+    html?: string
+    createdAt?: Date
+    updatedAt?: Date
+    type?: string
+    metadata: SpiderDocumentMetadata
+}
+
+interface ScrapeResponse {
+    success: boolean
+    data?: SpiderDocument
+    error?: string
+}
+
+interface CrawlResponse {
+    success: boolean
+    data?: SpiderDocument[]
+    error?: string
+}
+
+interface Params {
+    [key: string]: any
+}
+
+class SpiderApp {
+    private apiKey: string
+    private apiUrl: string
+
+    constructor({ apiKey = null, apiUrl = null }: SpiderAppConfig) {
+        this.apiKey = apiKey || ''
+        this.apiUrl = apiUrl || 'https://api.spider.cloud/v1'
+        if (!this.apiKey) {
+            throw new Error('No API key provided')
+        }
+    }
+
+    async scrapeUrl(url: string, params: Params | null = null): Promise<ScrapeResponse> {
+        const headers = this.prepareHeaders()
+        const jsonData: Params = { url, limit: 1, ...params }
+
+        try {
+            const response: AxiosResponse = await this.postRequest('crawl', jsonData, headers)
+            if (response.status === 200) {
+                const responseData = response.data
+                if (responseData[0].status) {
+                    return { success: true, data: responseData[0] }
+                } else {
+                    throw new Error(`Failed to scrape URL. Error: ${responseData.error}`)
+                }
+            } else {
+                this.handleError(response, 'scrape URL')
+            }
+        } catch (error: any) {
+            throw new Error(error.message)
+        }
+        return { success: false, error: 'Internal server error.' }
+    }
+
+    async crawlUrl(url: string, params: Params | null = null, idempotencyKey?: string): Promise<CrawlResponse> {
+        const headers = this.prepareHeaders(idempotencyKey)
+        const jsonData: Params = { url, ...params }
+
+        try {
+            const response: AxiosResponse = await this.postRequest('crawl', jsonData, headers)
+            if (response.status === 200) {
+                return { success: true, data: response.data }
+            } else {
+                this.handleError(response, 'start crawl job')
+            }
+        } catch (error: any) {
+            throw new Error(error.message)
+        }
+        return { success: false, error: 'Internal server error.' }
+    }
+
+    private prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
+        return {
+            'Content-Type': 'application/json',
+            Authorization: `Bearer ${this.apiKey}`,
+            ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {})
+        } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }
+    }
+
+    private postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
+        return axios.post(`${this.apiUrl}/${url}`, data, { headers })
+    }
+
+    private handleError(response: AxiosResponse, action: string): void {
+        if ([402, 408, 409, 500].includes(response.status)) {
+            const errorMessage: string = response.data.error || 'Unknown error occurred'
+            throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`)
+        } else {
+            throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`)
+        }
+    }
+}
+
+export default SpiderApp
diff --git a/packages/components/nodes/documentloaders/Spider/spider.svg b/packages/components/nodes/documentloaders/Spider/spider.svg
new file mode 100644
index 000000000..604a09d01
--- /dev/null
+++ b/packages/components/nodes/documentloaders/Spider/spider.svg
@@ -0,0 +1 @@
+Spider v1 Logo
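
A minimal sketch of how the SpiderApp client added above might be exercised on its own, assuming SPIDER_API_KEY is set in the environment. The target URL and the `limit` value are illustrative placeholders; `return_format: 'markdown'` mirrors what the Spider node itself sets in init(), and `limit` is the same pass-through crawl parameter that scrapeUrl pins to 1.

    import SpiderApp from './SpiderApp'

    async function main() {
        // SpiderApp throws in its constructor if no API key is supplied.
        const app = new SpiderApp({ apiKey: process.env.SPIDER_API_KEY })

        // Single-page scrape: SpiderApp posts to the crawl endpoint with limit: 1.
        const scrape = await app.scrapeUrl('https://spider.cloud', { return_format: 'markdown' })
        if (scrape.success && scrape.data) {
            console.log(scrape.data.content)
        }

        // Same-domain crawl; limit caps how many pages the crawl may return.
        const crawl = await app.crawlUrl('https://spider.cloud', { return_format: 'markdown', limit: 5 })
        if (crawl.success && crawl.data) {
            for (const doc of crawl.data) {
                console.log(doc.url)
            }
        }
    }

    main().catch(console.error)

Note that both methods post to the same 'crawl' endpoint; the scrape/crawl distinction lives entirely in the params, which is why the Spider node can forward its Additional Parameters JSON unchanged in either mode.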