From 656f6cad819e8cf84b5764e4a6a153466f4fae67 Mon Sep 17 00:00:00 2001
From: William Espegren <131612909+WilliamEspegren@users.noreply.github.com>
Date: Tue, 2 Jul 2024 01:00:52 +0200
Subject: [PATCH] Feature/Spider (open-source web scraper & crawler) (#2738)
* Add Spider Scraper & Crawler
* fix pnpm lint
* chore: Update metadata to be correct format
* fix pnpm lint
---
.../credentials/SpiderApi.credential.ts | 25 +++
.../nodes/documentloaders/Spider/Spider.ts | 175 ++++++++++++++++++
.../nodes/documentloaders/Spider/SpiderApp.ts | 116 ++++++++++++
.../nodes/documentloaders/Spider/spider.svg | 1 +
4 files changed, 317 insertions(+)
create mode 100644 packages/components/credentials/SpiderApi.credential.ts
create mode 100644 packages/components/nodes/documentloaders/Spider/Spider.ts
create mode 100644 packages/components/nodes/documentloaders/Spider/SpiderApp.ts
create mode 100644 packages/components/nodes/documentloaders/Spider/spider.svg
diff --git a/packages/components/credentials/SpiderApi.credential.ts b/packages/components/credentials/SpiderApi.credential.ts
new file mode 100644
index 000000000..4586161dc
--- /dev/null
+++ b/packages/components/credentials/SpiderApi.credential.ts
@@ -0,0 +1,25 @@
+import { INodeParams, INodeCredential } from '../src/Interface'
+
+class SpiderApiCredential implements INodeCredential {
+ label: string
+ name: string
+ version: number
+ description: string
+ inputs: INodeParams[]
+
+ constructor() {
+ this.label = 'Spider API'
+ this.name = 'spiderApi'
+ this.version = 1.0
+ this.description = 'Get your API key from the Spider dashboard.'
+ this.inputs = [
+ {
+ label: 'Spider API Key',
+ name: 'spiderApiKey',
+ type: 'password'
+ }
+ ]
+ }
+}
+
+module.exports = { credClass: SpiderApiCredential }
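For context, the exported credClass follows Flowise's credential-module convention; a quick smoke-check sketch (illustrative only, not part of the patch):

// Hypothetical smoke check for the credential definition above
const { credClass } = require('./SpiderApi.credential')
const cred = new credClass()
console.log(cred.name) // 'spiderApi' — must match the node's credentialNames
console.log(cred.inputs.map((i: any) => i.name)) // ['spiderApiKey']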
diff --git a/packages/components/nodes/documentloaders/Spider/Spider.ts b/packages/components/nodes/documentloaders/Spider/Spider.ts
new file mode 100644
index 000000000..5cea4c638
--- /dev/null
+++ b/packages/components/nodes/documentloaders/Spider/Spider.ts
@@ -0,0 +1,175 @@
+import { TextSplitter } from 'langchain/text_splitter'
+import { Document, DocumentInterface } from '@langchain/core/documents'
+import { BaseDocumentLoader } from 'langchain/document_loaders/base'
+import { INode, INodeData, INodeParams, ICommonObject } from '../../../src/Interface'
+import { getCredentialData, getCredentialParam } from '../../../src/utils'
+import SpiderApp from './SpiderApp'
+
+interface SpiderLoaderParameters {
+ url: string
+ apiKey?: string
+ mode?: 'crawl' | 'scrape'
+ params?: Record<string, unknown>
+}
+
+class SpiderLoader extends BaseDocumentLoader {
+ private apiKey: string
+ private url: string
+ private mode: 'crawl' | 'scrape'
+ private params?: Record<string, unknown>
+
+ constructor(loaderParams: SpiderLoaderParameters) {
+ super()
+ const { apiKey, url, mode = 'crawl', params } = loaderParams
+ if (!apiKey) {
+ throw new Error('Spider API key not set. You can set it as SPIDER_API_KEY in your .env file, or pass it to Spider.')
+ }
+
+ this.apiKey = apiKey
+ this.url = url
+ this.mode = mode
+ this.params = params
+ }
+
+ public async load(): Promise<DocumentInterface[]> {
+ const app = new SpiderApp({ apiKey: this.apiKey })
+ let spiderDocs: any[]
+
+ if (this.mode === 'scrape') {
+ const response = await app.scrapeUrl(this.url, this.params)
+ if (!response.success) {
+ throw new Error(`Spider: Failed to scrape URL. Error: ${response.error}`)
+ }
+ spiderDocs = [response.data]
+ } else if (this.mode === 'crawl') {
+ const response = await app.crawlUrl(this.url, this.params)
+ if (!response.success) {
+ throw new Error(`Spider: Failed to crawl URL. Error: ${response.error}`)
+ }
+ spiderDocs = response.data
+ } else {
+ throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`)
+ }
+
+ return spiderDocs.map(
+ (doc) =>
+ new Document({
+ pageContent: doc.content || '',
+ metadata: { source: doc.url }
+ })
+ )
+ }
+}
+
+class Spider_DocumentLoaders implements INode {
+ label: string
+ name: string
+ description: string
+ type: string
+ icon: string
+ version: number
+ category: string
+ baseClasses: string[]
+ inputs: INodeParams[]
+ credential: INodeParams
+
+ constructor() {
+ this.label = 'Spider Document Loaders'
+ this.name = 'spiderDocumentLoaders'
+ this.version = 1.0
+ this.type = 'Document'
+ this.icon = 'spider.svg'
+ this.category = 'Document Loaders'
+ this.description = 'Scrape & Crawl the web with Spider'
+ this.baseClasses = [this.type]
+ this.inputs = [
+ {
+ label: 'Text Splitter',
+ name: 'textSplitter',
+ type: 'TextSplitter',
+ optional: true
+ },
+ {
+ label: 'Mode',
+ name: 'mode',
+ type: 'options',
+ options: [
+ {
+ label: 'Scrape',
+ name: 'scrape',
+ description: 'Scrape a single page'
+ },
+ {
+ label: 'Crawl',
+ name: 'crawl',
+ description: 'Crawl a website and extract pages within the same domain'
+ }
+ ],
+ default: 'scrape'
+ },
+ {
+ label: 'Web Page URL',
+ name: 'url',
+ type: 'string',
+ placeholder: 'https://spider.cloud'
+ },
+ {
+ label: 'Additional Parameters',
+ name: 'params',
+ description:
+ 'Find all the available parameters in the Spider API documentation',
+ additionalParams: true,
+ placeholder: '{ "anti_bot": true }',
+ type: 'json',
+ optional: true
+ }
+ ]
+ this.credential = {
+ label: 'Credential',
+ name: 'credential',
+ type: 'credential',
+ credentialNames: ['spiderApi']
+ }
+ }
+
+ async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
+ const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
+ const url = nodeData.inputs?.url as string
+ const mode = nodeData.inputs?.mode as 'crawl' | 'scrape'
+ let params = nodeData.inputs?.params || {}
+ const credentialData = await getCredentialData(nodeData.credential ?? '', options)
+ const spiderApiKey = getCredentialParam('spiderApiKey', credentialData, nodeData)
+
+ if (typeof params === 'string') {
+ try {
+ params = JSON.parse(params)
+ } catch (e) {
+ throw new Error('Invalid JSON string provided for params')
+ }
+ }
+
+ // Ensure return_format is set to markdown
+ params.return_format = 'markdown'
+
+ const input: SpiderLoaderParameters = {
+ url,
+ mode: mode as 'crawl' | 'scrape',
+ apiKey: spiderApiKey,
+ params: params as Record<string, unknown>
+ }
+
+ const loader = new SpiderLoader(input)
+
+ let docs = []
+
+ if (textSplitter) {
+ docs = await loader.loadAndSplit(textSplitter)
+ } else {
+ docs = await loader.load()
+ }
+
+ return docs
+ }
+}
+
+module.exports = { nodeClass: Spider_DocumentLoaders }
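A minimal sketch of driving the loader above outside of Flowise, assuming SpiderLoader were exported and a valid API key were available in SPIDER_API_KEY (illustrative only, not part of the patch):

// Hypothetical standalone usage of SpiderLoader
async function run() {
    const loader = new SpiderLoader({
        url: 'https://spider.cloud',
        apiKey: process.env.SPIDER_API_KEY, // assumed env var; substitute your own key
        mode: 'scrape',
        params: { return_format: 'markdown' }
    })
    const docs = await loader.load()
    // Each result is a Document whose pageContent holds the markdown
    // and whose metadata.source is the page URL
    console.log(docs.length, docs[0]?.metadata.source)
}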
diff --git a/packages/components/nodes/documentloaders/Spider/SpiderApp.ts b/packages/components/nodes/documentloaders/Spider/SpiderApp.ts
new file mode 100644
index 000000000..e2bc1d5bf
--- /dev/null
+++ b/packages/components/nodes/documentloaders/Spider/SpiderApp.ts
@@ -0,0 +1,116 @@
+import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios'
+
+interface SpiderAppConfig {
+ apiKey?: string | null
+ apiUrl?: string | null
+}
+
+interface SpiderDocumentMetadata {
+ title?: string
+ description?: string
+ language?: string
+ [key: string]: any
+}
+
+interface SpiderDocument {
+ id?: string
+ url?: string
+ content: string
+ markdown?: string
+ html?: string
+ createdAt?: Date
+ updatedAt?: Date
+ type?: string
+ metadata: SpiderDocumentMetadata
+}
+
+interface ScrapeResponse {
+ success: boolean
+ data?: SpiderDocument
+ error?: string
+}
+
+interface CrawlResponse {
+ success: boolean
+ data?: SpiderDocument[]
+ error?: string
+}
+
+interface Params {
+ [key: string]: any
+}
+
+class SpiderApp {
+ private apiKey: string
+ private apiUrl: string
+
+ constructor({ apiKey = null, apiUrl = null }: SpiderAppConfig) {
+ this.apiKey = apiKey || ''
+ this.apiUrl = apiUrl || 'https://api.spider.cloud/v1'
+ if (!this.apiKey) {
+ throw new Error('No API key provided')
+ }
+ }
+
+ async scrapeUrl(url: string, params: Params | null = null): Promise<ScrapeResponse> {
+ const headers = this.prepareHeaders()
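+ // Scraping reuses the POST /crawl endpoint with limit: 1 to fetch a single page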
+ const jsonData: Params = { url, limit: 1, ...params }
+
+ try {
+ const response: AxiosResponse = await this.postRequest('crawl', jsonData, headers)
+ if (response.status === 200) {
+ const responseData = response.data
+ if (responseData[0].status) {
+ return { success: true, data: responseData[0] }
+ } else {
+ throw new Error(`Failed to scrape URL. Error: ${responseData.error}`)
+ }
+ } else {
+ this.handleError(response, 'scrape URL')
+ }
+ } catch (error: any) {
+ throw new Error(error.message)
+ }
+ return { success: false, error: 'Internal server error.' }
+ }
+
+ async crawlUrl(url: string, params: Params | null = null, idempotencyKey?: string): Promise<CrawlResponse> {
+ const headers = this.prepareHeaders(idempotencyKey)
+ const jsonData: Params = { url, ...params }
+
+ try {
+ const response: AxiosResponse = await this.postRequest('crawl', jsonData, headers)
+ if (response.status === 200) {
+ return { success: true, data: response.data }
+ } else {
+ this.handleError(response, 'start crawl job')
+ }
+ } catch (error: any) {
+ throw new Error(error.message)
+ }
+ return { success: false, error: 'Internal server error.' }
+ }
+
+ private prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
+ return {
+ 'Content-Type': 'application/json',
+ Authorization: `Bearer ${this.apiKey}`,
+ ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {})
+ } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }
+ }
+
+ private postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
+ return axios.post(`${this.apiUrl}/${url}`, data, { headers })
+ }
+
+ private handleError(response: AxiosResponse, action: string): void {
+ if ([402, 408, 409, 500].includes(response.status)) {
+ const errorMessage: string = response.data.error || 'Unknown error occurred'
+ throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`)
+ } else {
+ throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`)
+ }
+ }
+}
+
+export default SpiderApp
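The SpiderApp client can also be exercised directly; a sketch under the same assumptions (placeholder API key; the anti_bot and limit parameters come from the Spider API documentation, as referenced in the node's "Additional Parameters" input):

// Hypothetical direct use of SpiderApp (not part of the patch)
async function demo() {
    const app = new SpiderApp({ apiKey: process.env.SPIDER_API_KEY })
    const scrape = await app.scrapeUrl('https://spider.cloud', { anti_bot: true })
    if (scrape.success) console.log(scrape.data?.content)
    // crawlUrl accepts an optional idempotency key, forwarded as the x-idempotency-key header
    const crawl = await app.crawlUrl('https://spider.cloud', { limit: 5 }, 'demo-key-1')
    if (crawl.success) console.log(crawl.data?.length)
}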
diff --git a/packages/components/nodes/documentloaders/Spider/spider.svg b/packages/components/nodes/documentloaders/Spider/spider.svg
new file mode 100644
index 000000000..604a09d01
--- /dev/null
+++ b/packages/components/nodes/documentloaders/Spider/spider.svg
@@ -0,0 +1 @@
+