Feature/Spider (open-source web scraper & crawler) (#2738)

* Add Spider Scraper & Crawler

* fix pnpm lint

* chore: Update metadata to be correct format

* fix pnpm lint
This commit is contained in:
William Espegren 2024-07-02 01:00:52 +02:00 committed by GitHub
parent efc6e02828
commit 656f6cad81
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 317 additions and 0 deletions

View File

@ -0,0 +1,25 @@
import { INodeParams, INodeCredential } from '../src/Interface'
/**
 * Credential definition for the Spider API.
 *
 * Exposes exactly one input, `spiderApiKey`, declared as a password field.
 */
class SpiderApiCredential implements INodeCredential {
    label: string
    name: string
    version: number
    description: string
    inputs: INodeParams[]

    constructor() {
        // The API key is the only value this credential collects.
        const apiKeyInput: INodeParams = {
            label: 'Spider API Key',
            name: 'spiderApiKey',
            type: 'password'
        }

        this.inputs = [apiKeyInput]
        this.description = 'Get your API key from the <a target="_blank" href="https://spider.cloud">Spider</a> dashboard.'
        this.version = 1.0
        this.name = 'spiderApi'
        this.label = 'Spider API'
    }
}
module.exports = { credClass: SpiderApiCredential }

View File

@ -0,0 +1,175 @@
import { TextSplitter } from 'langchain/text_splitter'
import { Document, DocumentInterface } from '@langchain/core/documents'
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
import { INode, INodeData, INodeParams, ICommonObject } from '../../../src/Interface'
import { getCredentialData, getCredentialParam } from '../../../src/utils'
import SpiderApp from './SpiderApp'
/** Options accepted by the SpiderLoader constructor. */
interface SpiderLoaderParameters {
/** URL handed to the scrape or crawl request. */
url: string
/** Spider API key. The SpiderLoader constructor throws when it is missing or empty. */
apiKey?: string
/** Which operation to run; SpiderLoader falls back to 'crawl' when this is omitted. */
mode?: 'crawl' | 'scrape'
/** Extra request parameters, passed through to SpiderApp unchanged. */
params?: Record<string, unknown>
}
/**
 * Document loader that fetches page content through SpiderApp and converts
 * each returned page into a `Document`.
 *
 * - `scrape` mode requests a single page via `SpiderApp.scrapeUrl`.
 * - `crawl` mode requests a list of pages via `SpiderApp.crawlUrl`.
 */
class SpiderLoader extends BaseDocumentLoader {
    private apiKey: string
    private url: string
    private mode: 'crawl' | 'scrape'
    private params?: Record<string, unknown>

    /**
     * @param loaderParams - URL, API key, mode (defaults to 'crawl') and extra request parameters.
     * @throws Error when no API key is supplied.
     */
    constructor(loaderParams: SpiderLoaderParameters) {
        super()
        const { apiKey, url, mode = 'crawl', params } = loaderParams
        if (!apiKey) {
            // NOTE(review): the message mentions SPIDER_API_KEY, but this class never reads the
            // environment; the key has to be passed in through `loaderParams.apiKey`.
            throw new Error('Spider API key not set. You can set it as SPIDER_API_KEY in your .env file, or pass it to Spider.')
        }
        this.apiKey = apiKey
        this.url = url
        this.mode = mode
        this.params = params
    }

    /**
     * Runs the configured scrape or crawl and maps every returned page to a `Document`
     * whose `pageContent` is the page's `content` (empty string when absent) and whose
     * metadata carries the page `url` as `source`.
     *
     * @returns One document per page returned by the API; an empty list when no pages came back.
     * @throws Error when the request reports failure or the mode is not 'crawl' or 'scrape'.
     */
    public async load(): Promise<DocumentInterface[]> {
        const app = new SpiderApp({ apiKey: this.apiKey })
        let payload: unknown

        if (this.mode === 'scrape') {
            const response = await app.scrapeUrl(this.url, this.params)
            if (!response.success) {
                throw new Error(`Spider: Failed to scrape URL. Error: ${response.error}`)
            }
            payload = response.data
        } else if (this.mode === 'crawl') {
            const response = await app.crawlUrl(this.url, this.params)
            if (!response.success) {
                throw new Error(`Spider: Failed to crawl URL. Error: ${response.error}`)
            }
            payload = response.data
        } else {
            // `mode` is only a compile-time union; runtime callers can still pass anything.
            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`)
        }

        return SpiderLoader.toPageList(payload).map(
            (page) =>
                new Document({
                    pageContent: page.content || '',
                    metadata: { source: page.url }
                })
        )
    }

    /**
     * Normalises response data to a list of pages: a single page is wrapped in a list and
     * null/undefined entries are dropped, so a missing or malformed payload produces no
     * documents instead of an opaque TypeError while mapping.
     */
    private static toPageList(payload: unknown): any[] {
        const pages: unknown[] = Array.isArray(payload) ? payload : [payload]
        return pages.filter((page) => page !== null && page !== undefined)
    }
}
/**
 * Node definition for the Spider document loader.
 *
 * Declares the node's inputs (optional text splitter, mode, URL, extra JSON parameters)
 * and its `spiderApi` credential, and builds a `SpiderLoader` from them in `init`.
 */
class Spider_DocumentLoaders implements INode {
    label: string
    name: string
    description: string
    type: string
    icon: string
    version: number
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
    credential: INodeParams

    constructor() {
        this.label = 'Spider Document Loaders'
        this.name = 'spiderDocumentLoaders'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'spider.svg'
        this.category = 'Document Loaders'
        this.description = 'Scrape & Crawl the web with Spider'
        this.baseClasses = [this.type]
        this.inputs = [
            {
                label: 'Text Splitter',
                name: 'textSplitter',
                type: 'TextSplitter',
                optional: true
            },
            {
                label: 'Mode',
                name: 'mode',
                type: 'options',
                options: [
                    {
                        label: 'Scrape',
                        name: 'scrape',
                        description: 'Scrape a single page'
                    },
                    {
                        label: 'Crawl',
                        name: 'crawl',
                        description: 'Crawl a website and extract pages within the same domain'
                    }
                ],
                default: 'scrape'
            },
            {
                label: 'Web Page URL',
                name: 'url',
                type: 'string',
                placeholder: 'https://spider.cloud'
            },
            {
                label: 'Additional Parameters',
                name: 'params',
                // The anchor attribute was previously written as `_target="blank"`, which is not a
                // valid attribute, so the link did not open in a new tab.
                description:
                    'Find all the available parameters in the <a target="_blank" href="https://spider.cloud/docs/api">Spider API documentation</a>',
                additionalParams: true,
                placeholder: '{ "anti_bot": true }',
                type: 'json',
                optional: true
            }
        ]
        this.credential = {
            label: 'Credential',
            name: 'credential',
            type: 'credential',
            credentialNames: ['spiderApi']
        }
    }

    /**
     * Builds a `SpiderLoader` from the node inputs and returns the loaded documents,
     * split with the supplied text splitter when one is connected.
     *
     * @param nodeData - Node inputs (`textSplitter`, `url`, `mode`, `params`) and credential id.
     * @param _ - Unused.
     * @param options - Passed through to `getCredentialData`.
     * @returns The documents produced by the loader.
     * @throws Error when `params` is not valid JSON or is not a JSON object.
     */
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
        const url = nodeData.inputs?.url as string
        // NOTE(review): the input declares 'scrape' as its default, but SpiderLoader falls back
        // to 'crawl' when `mode` is undefined — confirm which default is intended.
        const mode = nodeData.inputs?.mode as 'crawl' | 'scrape'

        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const spiderApiKey = getCredentialParam('spiderApiKey', credentialData, nodeData)

        const params = Spider_DocumentLoaders.resolveParams(nodeData.inputs?.params)

        const input: SpiderLoaderParameters = {
            url,
            mode,
            apiKey: spiderApiKey,
            params
        }
        const loader = new SpiderLoader(input)

        return textSplitter ? await loader.loadAndSplit(textSplitter) : await loader.load()
    }

    /**
     * Turns the raw `params` input into the request parameter object.
     *
     * Accepts an object or a JSON string; an empty/missing value yields no extra parameters.
     * The result is always a fresh object (the input is never mutated) and always has
     * `return_format` forced to `'markdown'`.
     *
     * @throws Error when a string is not valid JSON, or when the value is not a JSON object
     *         (for example `null`, an array, or a number).
     */
    private static resolveParams(raw: unknown): Record<string, unknown> {
        let parsed: unknown = raw || {}
        if (typeof parsed === 'string') {
            try {
                parsed = JSON.parse(parsed)
            } catch (e) {
                throw new Error('Invalid JSON string provided for params')
            }
        }
        if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
            throw new Error('Params must be a JSON object, e.g. { "anti_bot": true }')
        }
        // Ensure return_format is set to markdown
        return { ...(parsed as Record<string, unknown>), return_format: 'markdown' }
    }
}
module.exports = { nodeClass: Spider_DocumentLoaders }

View File

@ -0,0 +1,116 @@
import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios'
/** Constructor options for SpiderApp. */
interface SpiderAppConfig {
/** API key sent as a Bearer token; SpiderApp throws at construction when it is missing or empty. */
apiKey?: string | null
/** Base URL for requests; SpiderApp uses 'https://api.spider.cloud/v1' when this is not set. */
apiUrl?: string | null
}
/** Optional descriptive fields attached to a document; arbitrary additional keys are allowed. */
interface SpiderDocumentMetadata {
title?: string
description?: string
language?: string
[key: string]: any
}
/**
 * Declared shape of one page in a response.
 * NOTE(review): nothing in this file validates responses against this shape — confirm it
 * against the Spider API documentation.
 */
interface SpiderDocument {
id?: string
url?: string
content: string
markdown?: string
html?: string
createdAt?: Date
updatedAt?: Date
type?: string
metadata: SpiderDocumentMetadata
}
/** Result of SpiderApp.scrapeUrl: a success flag with either a single document or an error message. */
interface ScrapeResponse {
success: boolean
data?: SpiderDocument
error?: string
}
/** Result of SpiderApp.crawlUrl: a success flag with either a list of documents or an error message. */
interface CrawlResponse {
success: boolean
data?: SpiderDocument[]
error?: string
}
/** Free-form request parameters that SpiderApp spreads into the request body. */
interface Params {
[key: string]: any
}
/**
 * Minimal client for the Spider HTTP API.
 *
 * Both `scrapeUrl` and `crawlUrl` POST to the `crawl` endpoint; a scrape is a crawl whose
 * request body defaults `limit` to 1.
 */
class SpiderApp {
    private apiKey: string
    private apiUrl: string

    /**
     * @param config - API key (required) and optional base URL
     *                 (defaults to 'https://api.spider.cloud/v1').
     * @throws Error when no API key is provided.
     */
    constructor({ apiKey = null, apiUrl = null }: SpiderAppConfig) {
        this.apiKey = apiKey || ''
        this.apiUrl = apiUrl || 'https://api.spider.cloud/v1'
        if (!this.apiKey) {
            throw new Error('No API key provided')
        }
    }

    /**
     * Fetches a single page.
     *
     * @param url - Page to fetch.
     * @param params - Extra request parameters; these are spread after `limit: 1`, so a
     *                 caller-supplied `limit` overrides it.
     * @returns `{ success: true, data }` where `data` is the first entry of the response list.
     * @throws Error when the request fails, or when the response holds no entry with a
     *         truthy `status`.
     */
    async scrapeUrl(url: string, params: Params | null = null): Promise<ScrapeResponse> {
        const headers = this.prepareHeaders()
        const jsonData: Params = { url, limit: 1, ...params }
        const response = await this.send(jsonData, headers, 'scrape URL')

        // Guard the response shape: an empty list or a non-list body must produce a readable
        // error rather than a TypeError from indexing into it.
        const responseData = response.data
        const firstPage = Array.isArray(responseData) ? responseData[0] : undefined
        if (!firstPage || !firstPage.status) {
            const reason = firstPage?.error ?? responseData?.error ?? 'No page data returned'
            throw new Error(`Failed to scrape URL. Error: ${reason}`)
        }
        return { success: true, data: firstPage }
    }

    /**
     * Crawls starting from `url`.
     *
     * @param url - Starting URL.
     * @param params - Extra request parameters spread into the request body.
     * @param idempotencyKey - When given, sent as the `x-idempotency-key` header.
     * @returns `{ success: true, data }` where `data` is the response body as received.
     * @throws Error when the request fails or the status is not 200.
     */
    async crawlUrl(url: string, params: Params | null = null, idempotencyKey?: string): Promise<CrawlResponse | any> {
        const headers = this.prepareHeaders(idempotencyKey)
        const jsonData: Params = { url, ...params }
        const response = await this.send(jsonData, headers, 'start crawl job')
        return { success: true, data: response.data }
    }

    /**
     * Posts to the `crawl` endpoint and returns the response only when its status is 200.
     *
     * axios rejects on non-2xx statuses by default, so an HTTP error arrives here as a
     * rejection that carries the response. That response is routed through `handleError`
     * so the status code and the API's own error text reach the caller.
     */
    private async send(data: Params, headers: AxiosRequestHeaders, action: string): Promise<AxiosResponse> {
        let response: AxiosResponse
        try {
            response = await this.postRequest('crawl', data, headers)
        } catch (error: any) {
            if (error?.response) {
                this.handleError(error.response, action)
            }
            // No HTTP response at all (network failure, timeout, ...): surface the message as-is.
            throw new Error(error?.message ?? String(error))
        }
        if (response.status !== 200) {
            this.handleError(response, action)
        }
        return response
    }

    /** Builds the JSON/Bearer headers, adding `x-idempotency-key` only when a key is given. */
    private prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
        return {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${this.apiKey}`,
            ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {})
        } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }
    }

    /** POSTs `data` to `<apiUrl>/<url>` with the given headers. */
    private postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
        return axios.post(`${this.apiUrl}/${url}`, data, { headers })
    }

    /**
     * Always throws. For statuses 402, 408, 409 and 500 the message includes the response's
     * `error` field (or 'Unknown error occurred'); any other status gets a generic message.
     */
    private handleError(response: AxiosResponse, action: string): never {
        if ([402, 408, 409, 500].includes(response.status)) {
            // The body may be absent or not an object, hence the optional access.
            const errorMessage: string = response.data?.error || 'Unknown error occurred'
            throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`)
        }
        throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`)
    }
}
export default SpiderApp

View File

@ -0,0 +1 @@
<svg height="30" width="30" viewBox="0 0 36 34" xml:space="preserve" xmlns="http://www.w3.org/2000/svg" class="fill-accent-foreground transition-all group-hover:scale-110"><title>Spider v1 Logo</title><path fill-rule="evenodd" clip-rule="evenodd" d="M9.13883 7.06589V0.164429L13.0938 0.164429V6.175L14.5178 7.4346C15.577 6.68656 16.7337 6.27495 17.945 6.27495C19.1731 6.27495 20.3451 6.69807 21.4163 7.46593L22.8757 6.175V0.164429L26.8307 0.164429V7.06589V7.95679L26.1634 8.54706L24.0775 10.3922C24.3436 10.8108 24.5958 11.2563 24.8327 11.7262L26.0467 11.4215L28.6971 8.08749L31.793 10.5487L28.7257 14.407L28.3089 14.9313L27.6592 15.0944L26.2418 15.4502C26.3124 15.7082 26.3793 15.9701 26.4422 16.2355L28.653 16.6566L29.092 16.7402L29.4524 17.0045L35.3849 21.355L33.0461 24.5444L27.474 20.4581L27.0719 20.3816C27.1214 21.0613 27.147 21.7543 27.147 22.4577C27.147 22.5398 27.1466 22.6214 27.1459 22.7024L29.5889 23.7911L30.3219 24.1177L30.62 24.8629L33.6873 32.5312L30.0152 34L27.246 27.0769L26.7298 26.8469C25.5612 32.2432 22.0701 33.8808 17.945 33.8808C13.8382 33.8808 10.3598 32.2577 9.17593 26.9185L8.82034 27.0769L6.05109 34L2.37897 32.5312L5.44629 24.8629L5.74435 24.1177L6.47743 23.7911L8.74487 22.7806C8.74366 22.6739 8.74305 22.5663 8.74305 22.4577C8.74305 21.7616 8.76804 21.0758 8.81654 20.4028L8.52606 20.4581L2.95395 24.5444L0.615112 21.355L6.54761 17.0045L6.908 16.7402L7.34701 16.6566L9.44264 16.2575C9.50917 15.9756 9.5801 15.6978 9.65528 15.4242L8.34123 15.0944L7.69155 14.9313L7.27471 14.407L4.20739 10.5487L7.30328 8.08749L9.95376 11.4215L11.0697 11.7016C11.3115 11.2239 11.5692 10.7716 11.8412 10.3473L9.80612 8.54706L9.13883 7.95679V7.06589Z"></path></svg>

After

Width:  |  Height:  |  Size: 1.6 KiB