Feature/Spider (open-source web scraper & crawler) (#2738)
* Add Spider Scraper & Crawler
* fix pnpm lint
* chore: Update metadata to be correct format
* fix pnpm lint
This commit is contained in:
parent efc6e02828
commit 656f6cad81
@@ -0,0 +1,25 @@
import { INodeParams, INodeCredential } from '../src/Interface'

class SpiderApiCredential implements INodeCredential {
    label: string
    name: string
    version: number
    description: string
    inputs: INodeParams[]

    constructor() {
        this.label = 'Spider API'
        this.name = 'spiderApi'
        this.version = 1.0
        this.description = 'Get your API key from the <a target="_blank" href="https://spider.cloud">Spider</a> dashboard.'
        this.inputs = [
            {
                label: 'Spider API Key',
                name: 'spiderApiKey',
                type: 'password'
            }
        ]
    }
}

module.exports = { credClass: SpiderApiCredential }
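The credential's `name` ('spiderApi') is what the document loader below points at via `credentialNames`, and `spiderApiKey` is the field the node reads back with `getCredentialParam`. A minimal sketch of that wiring, assuming the class is loaded the way Flowise loads credential files (the require path here is hypothetical):

    // Hypothetical standalone check of the credential shape; the path is illustrative only.
    const { credClass } = require('./SpiderApi.credential')

    const cred = new credClass()
    console.log(cred.name) // 'spiderApi' — matched by credentialNames: ['spiderApi'] in the Spider node
    console.log(cred.inputs.map((input: { name: string }) => input.name)) // ['spiderApiKey']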
@@ -0,0 +1,175 @@
import { TextSplitter } from 'langchain/text_splitter'
import { Document, DocumentInterface } from '@langchain/core/documents'
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
import { INode, INodeData, INodeParams, ICommonObject } from '../../../src/Interface'
import { getCredentialData, getCredentialParam } from '../../../src/utils'
import SpiderApp from './SpiderApp'

interface SpiderLoaderParameters {
    url: string
    apiKey?: string
    mode?: 'crawl' | 'scrape'
    params?: Record<string, unknown>
}

class SpiderLoader extends BaseDocumentLoader {
    private apiKey: string
    private url: string
    private mode: 'crawl' | 'scrape'
    private params?: Record<string, unknown>

    constructor(loaderParams: SpiderLoaderParameters) {
        super()
        const { apiKey, url, mode = 'crawl', params } = loaderParams
        if (!apiKey) {
            throw new Error('Spider API key not set. You can set it as SPIDER_API_KEY in your .env file, or pass it to Spider.')
        }

        this.apiKey = apiKey
        this.url = url
        this.mode = mode
        this.params = params
    }

    public async load(): Promise<DocumentInterface[]> {
        const app = new SpiderApp({ apiKey: this.apiKey })
        let spiderDocs: any[]

        if (this.mode === 'scrape') {
            const response = await app.scrapeUrl(this.url, this.params)
            if (!response.success) {
                throw new Error(`Spider: Failed to scrape URL. Error: ${response.error}`)
            }
            spiderDocs = [response.data]
        } else if (this.mode === 'crawl') {
            const response = await app.crawlUrl(this.url, this.params)
            if (!response.success) {
                throw new Error(`Spider: Failed to crawl URL. Error: ${response.error}`)
            }
            spiderDocs = response.data
        } else {
            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`)
        }

        return spiderDocs.map(
            (doc) =>
                new Document({
                    pageContent: doc.content || '',
                    metadata: { source: doc.url }
                })
        )
    }
}

class Spider_DocumentLoaders implements INode {
    label: string
    name: string
    description: string
    type: string
    icon: string
    version: number
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
    credential: INodeParams

    constructor() {
        this.label = 'Spider Document Loaders'
        this.name = 'spiderDocumentLoaders'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'spider.svg'
        this.category = 'Document Loaders'
        this.description = 'Scrape & Crawl the web with Spider'
        this.baseClasses = [this.type]
        this.inputs = [
            {
                label: 'Text Splitter',
                name: 'textSplitter',
                type: 'TextSplitter',
                optional: true
            },
            {
                label: 'Mode',
                name: 'mode',
                type: 'options',
                options: [
                    {
                        label: 'Scrape',
                        name: 'scrape',
                        description: 'Scrape a single page'
                    },
                    {
                        label: 'Crawl',
                        name: 'crawl',
                        description: 'Crawl a website and extract pages within the same domain'
                    }
                ],
                default: 'scrape'
            },
            {
                label: 'Web Page URL',
                name: 'url',
                type: 'string',
                placeholder: 'https://spider.cloud'
            },
            {
                label: 'Additional Parameters',
                name: 'params',
                description:
                    'Find all the available parameters in the <a target="_blank" href="https://spider.cloud/docs/api">Spider API documentation</a>',
                additionalParams: true,
                placeholder: '{ "anti_bot": true }',
                type: 'json',
                optional: true
            }
        ]
        this.credential = {
            label: 'Credential',
            name: 'credential',
            type: 'credential',
            credentialNames: ['spiderApi']
        }
    }

    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
        const url = nodeData.inputs?.url as string
        const mode = nodeData.inputs?.mode as 'crawl' | 'scrape'
        let params = nodeData.inputs?.params || {}
        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const spiderApiKey = getCredentialParam('spiderApiKey', credentialData, nodeData)

        if (typeof params === 'string') {
            try {
                params = JSON.parse(params)
            } catch (e) {
                throw new Error('Invalid JSON string provided for params')
            }
        }

        // Ensure return_format is set to markdown
        params.return_format = 'markdown'

        const input: SpiderLoaderParameters = {
            url,
            mode: mode as 'crawl' | 'scrape',
            apiKey: spiderApiKey,
            params: params as Record<string, unknown>
        }

        const loader = new SpiderLoader(input)

        let docs = []

        if (textSplitter) {
            docs = await loader.loadAndSplit(textSplitter)
        } else {
            docs = await loader.load()
        }

        return docs
    }
}

module.exports = { nodeClass: Spider_DocumentLoaders }
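For testing outside the Flowise UI, the loader above can be driven directly. This is only a sketch: `SpiderLoader` is not actually exported from this file (the named import below is an assumption), and the URL and `SPIDER_API_KEY` are placeholders supplied by the caller:

    import { SpiderLoader } from './Spider' // assumed export; the file above only exports the node class

    async function run() {
        const loader = new SpiderLoader({
            url: 'https://spider.cloud',           // placeholder target
            apiKey: process.env.SPIDER_API_KEY,    // same key the spiderApi credential stores
            mode: 'scrape',                        // or 'crawl' to walk pages within the same domain
            params: { return_format: 'markdown' }  // mirrors what init() forces before loading
        })

        const docs = await loader.load()
        console.log(docs.length, docs[0]?.metadata.source)
    }

    run().catch(console.error)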
@@ -0,0 +1,116 @@
import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios'

interface SpiderAppConfig {
    apiKey?: string | null
    apiUrl?: string | null
}

interface SpiderDocumentMetadata {
    title?: string
    description?: string
    language?: string
    [key: string]: any
}

interface SpiderDocument {
    id?: string
    url?: string
    content: string
    markdown?: string
    html?: string
    createdAt?: Date
    updatedAt?: Date
    type?: string
    metadata: SpiderDocumentMetadata
}

interface ScrapeResponse {
    success: boolean
    data?: SpiderDocument
    error?: string
}

interface CrawlResponse {
    success: boolean
    data?: SpiderDocument[]
    error?: string
}

interface Params {
    [key: string]: any
}

class SpiderApp {
    private apiKey: string
    private apiUrl: string

    constructor({ apiKey = null, apiUrl = null }: SpiderAppConfig) {
        this.apiKey = apiKey || ''
        this.apiUrl = apiUrl || 'https://api.spider.cloud/v1'
        if (!this.apiKey) {
            throw new Error('No API key provided')
        }
    }

    async scrapeUrl(url: string, params: Params | null = null): Promise<ScrapeResponse> {
        const headers = this.prepareHeaders()
        const jsonData: Params = { url, limit: 1, ...params }

        try {
            const response: AxiosResponse = await this.postRequest('crawl', jsonData, headers)
            if (response.status === 200) {
                const responseData = response.data
                if (responseData[0].status) {
                    return { success: true, data: responseData[0] }
                } else {
                    throw new Error(`Failed to scrape URL. Error: ${responseData.error}`)
                }
            } else {
                this.handleError(response, 'scrape URL')
            }
        } catch (error: any) {
            throw new Error(error.message)
        }
        return { success: false, error: 'Internal server error.' }
    }

    async crawlUrl(url: string, params: Params | null = null, idempotencyKey?: string): Promise<CrawlResponse | any> {
        const headers = this.prepareHeaders(idempotencyKey)
        const jsonData: Params = { url, ...params }

        try {
            const response: AxiosResponse = await this.postRequest('crawl', jsonData, headers)
            if (response.status === 200) {
                return { success: true, data: response.data }
            } else {
                this.handleError(response, 'start crawl job')
            }
        } catch (error: any) {
            throw new Error(error.message)
        }
        return { success: false, error: 'Internal server error.' }
    }

    private prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
        return {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${this.apiKey}`,
            ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {})
        } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }
    }

    private postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
        return axios.post(`${this.apiUrl}/${url}`, data, { headers })
    }

    private handleError(response: AxiosResponse, action: string): void {
        if ([402, 408, 409, 500].includes(response.status)) {
            const errorMessage: string = response.data.error || 'Unknown error occurred'
            throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`)
        } else {
            throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`)
        }
    }
}

export default SpiderApp
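As a quick sanity check, `SpiderApp` can also be exercised on its own; a minimal sketch assuming a valid key in `SPIDER_API_KEY` (both methods post to the same `/crawl` endpoint, with `scrapeUrl` pinning `limit: 1`):

    import SpiderApp from './SpiderApp'

    async function demo() {
        // In Flowise the key comes from the spiderApi credential; here it is read from the environment.
        const app = new SpiderApp({ apiKey: process.env.SPIDER_API_KEY })

        // Single page: scrapeUrl adds limit: 1 before posting to /crawl.
        const scraped = await app.scrapeUrl('https://spider.cloud', { return_format: 'markdown' })
        if (scraped.success) console.log(scraped.data?.content.slice(0, 200))

        // Multiple pages: crawlUrl forwards params as-is, so the caller controls the limit.
        const crawled = await app.crawlUrl('https://spider.cloud', { return_format: 'markdown', limit: 5 })
        if (crawled.success) console.log(`crawled ${crawled.data.length} pages`)
    }

    demo().catch(console.error)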
@@ -0,0 +1 @@
<svg height="30" width="30" viewBox="0 0 36 34" xml:space="preserve" xmlns="http://www.w3.org/2000/svg" class="fill-accent-foreground transition-all group-hover:scale-110"><title>Spider v1 Logo</title><path fill-rule="evenodd" clip-rule="evenodd" d="M9.13883 7.06589V0.164429L13.0938 0.164429V6.175L14.5178 7.4346C15.577 6.68656 16.7337 6.27495 17.945 6.27495C19.1731 6.27495 20.3451 6.69807 21.4163 7.46593L22.8757 6.175V0.164429L26.8307 0.164429V7.06589V7.95679L26.1634 8.54706L24.0775 10.3922C24.3436 10.8108 24.5958 11.2563 24.8327 11.7262L26.0467 11.4215L28.6971 8.08749L31.793 10.5487L28.7257 14.407L28.3089 14.9313L27.6592 15.0944L26.2418 15.4502C26.3124 15.7082 26.3793 15.9701 26.4422 16.2355L28.653 16.6566L29.092 16.7402L29.4524 17.0045L35.3849 21.355L33.0461 24.5444L27.474 20.4581L27.0719 20.3816C27.1214 21.0613 27.147 21.7543 27.147 22.4577C27.147 22.5398 27.1466 22.6214 27.1459 22.7024L29.5889 23.7911L30.3219 24.1177L30.62 24.8629L33.6873 32.5312L30.0152 34L27.246 27.0769L26.7298 26.8469C25.5612 32.2432 22.0701 33.8808 17.945 33.8808C13.8382 33.8808 10.3598 32.2577 9.17593 26.9185L8.82034 27.0769L6.05109 34L2.37897 32.5312L5.44629 24.8629L5.74435 24.1177L6.47743 23.7911L8.74487 22.7806C8.74366 22.6739 8.74305 22.5663 8.74305 22.4577C8.74305 21.7616 8.76804 21.0758 8.81654 20.4028L8.52606 20.4581L2.95395 24.5444L0.615112 21.355L6.54761 17.0045L6.908 16.7402L7.34701 16.6566L9.44264 16.2575C9.50917 15.9756 9.5801 15.6978 9.65528 15.4242L8.34123 15.0944L7.69155 14.9313L7.27471 14.407L4.20739 10.5487L7.30328 8.08749L9.95376 11.4215L11.0697 11.7016C11.3115 11.2239 11.5692 10.7716 11.8412 10.3473L9.80612 8.54706L9.13883 7.95679V7.06589Z"></path></svg>
After Width: | Height: | Size: 1.6 KiB |