diff --git a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts new file mode 100644 index 000000000..8be10f742 --- /dev/null +++ b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts @@ -0,0 +1,64 @@ +import { INode, INodeData, INodeParams } from '../../../src/Interface' +import { TextSplitter } from 'langchain/text_splitter' +import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio' + +class Cheerio_DocumentLoaders implements INode { + label: string + name: string + description: string + type: string + icon: string + category: string + baseClasses: string[] + inputs: INodeParams[] + + constructor() { + this.label = 'Cheerio Web Scraper' + this.name = 'cheerioWebScraper' + this.type = 'Document' + this.icon = 'cheerio.svg' + this.category = 'Document Loaders' + this.description = `Load data from webpages` + this.baseClasses = [this.type] + this.inputs = [ + { + label: 'URL', + name: 'url', + type: 'string' + }, + { + label: 'Text Splitter', + name: 'textSplitter', + type: 'TextSplitter', + optional: true + } + ] + } + + async init(nodeData: INodeData): Promise { + const textSplitter = nodeData.inputs?.textSplitter as TextSplitter + let url = nodeData.inputs?.url as string + + var urlPattern = new RegExp( + '^(https?:\\/\\/)?' + // validate protocol + '((([a-z\\d]([a-z\\d-]*[a-z\\d])*)\\.)+[a-z]{2,}|' + // validate domain name + '((\\d{1,3}\\.){3}\\d{1,3}))' + // validate OR ip (v4) address + '(\\:\\d+)?(\\/[-a-z\\d%_.~+]*)*' + // validate port and path + '(\\?[;&a-z\\d%_.~+=-]*)?' + // validate query string + '(\\#[-a-z\\d_]*)?$', + 'i' + ) // validate fragment locator + + const loader = new CheerioWebBaseLoader(urlPattern.test(url.trim()) ? url.trim() : '') + + if (textSplitter) { + const docs = await loader.loadAndSplit(textSplitter) + return docs + } else { + const docs = await loader.load() + return docs + } + } +} + +module.exports = { nodeClass: Cheerio_DocumentLoaders } diff --git a/packages/components/nodes/documentloaders/Cheerio/cheerio.svg b/packages/components/nodes/documentloaders/Cheerio/cheerio.svg new file mode 100644 index 000000000..8e3334b9f --- /dev/null +++ b/packages/components/nodes/documentloaders/Cheerio/cheerio.svg @@ -0,0 +1,5 @@ + + + + + \ No newline at end of file diff --git a/packages/components/package.json b/packages/components/package.json index d75cda9d5..1775a8cfd 100644 --- a/packages/components/package.json +++ b/packages/components/package.json @@ -20,6 +20,7 @@ "@huggingface/inference": "^1.6.3", "@pinecone-database/pinecone": "^0.0.12", "axios": "^0.27.2", + "cheerio": "^1.0.0-rc.12", "chromadb": "^1.3.1", "d3-dsv": "2", "dotenv": "^16.0.0",