diff --git a/packages/components/credentials/ApifyApi.credential.ts b/packages/components/credentials/ApifyApi.credential.ts
new file mode 100644
index 000000000..c961fd385
--- /dev/null
+++ b/packages/components/credentials/ApifyApi.credential.ts
@@ -0,0 +1,26 @@
+import { INodeParams, INodeCredential } from '../src/Interface'
+
+class ApifyApiCredential implements INodeCredential {
+    label: string
+    name: string
+    version: number
+    description: string
+    inputs: INodeParams[]
+
+    constructor() {
+        this.label = 'Apify API'
+        this.name = 'apifyApi'
+        this.version = 1.0
+        this.description =
+            'You can find the Apify API token on your Apify account page.'
+        this.inputs = [
+            {
+                label: 'Apify API',
+                name: 'apifyApiToken',
+                type: 'password'
+            }
+        ]
+    }
+}
+
+module.exports = { credClass: ApifyApiCredential }
diff --git a/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts
new file mode 100644
index 000000000..a5e6a6e03
--- /dev/null
+++ b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts
@@ -0,0 +1,139 @@
+import { INode, INodeData, INodeParams, ICommonObject } from '../../../src/Interface'
+import { getCredentialData, getCredentialParam } from '../../../src/utils'
+import { TextSplitter } from 'langchain/text_splitter'
+import { ApifyDatasetLoader } from 'langchain/document_loaders/web/apify_dataset'
+import { Document } from 'langchain/document'
+
+class ApifyWebsiteContentCrawler_DocumentLoaders implements INode {
+    label: string
+    name: string
+    description: string
+    type: string
+    icon: string
+    version: number
+    category: string
+    baseClasses: string[]
+    inputs: INodeParams[]
+    credential: INodeParams
+
+    constructor() {
+        this.label = 'Apify Website Content Crawler'
+        this.name = 'apifyWebsiteContentCrawler'
+        this.type = 'Document'
+        this.icon = 'apify-symbol-transparent.svg'
+        this.version = 1.0
+        this.category = 'Document Loaders'
+        this.description = 'Load data from Apify Website Content Crawler'
+        this.baseClasses = [this.type]
+        this.inputs = [
+            {
+                label: 'Start URLs',
+                name: 'urls',
+                type: 'string',
+                description: 'One or more URLs of pages where the crawler will start, separated by commas.',
+                placeholder: 'https://js.langchain.com/docs/'
+            },
+            {
+                label: 'Crawler type',
+                type: 'options',
+                name: 'crawlerType',
+                options: [
+                    {
+                        label: 'Headless web browser (Chrome+Playwright)',
+                        name: 'playwright:chrome'
+                    },
+                    {
+                        label: 'Stealthy web browser (Firefox+Playwright)',
+                        name: 'playwright:firefox'
+                    },
+                    {
+                        label: 'Raw HTTP client (Cheerio)',
+                        name: 'cheerio'
+                    },
+                    {
+                        label: 'Raw HTTP client with JavaScript execution (JSDOM) [experimental]',
+                        name: 'jsdom'
+                    }
+                ],
+                description:
+                    'Select the crawling engine, see documentation for additional information.',
+                default: 'playwright:firefox'
+            },
+            {
+                label: 'Max crawling depth',
+                name: 'maxCrawlDepth',
+                type: 'number',
+                optional: true,
+                default: 1
+            },
+            {
+                label: 'Max crawl pages',
+                name: 'maxCrawlPages',
+                type: 'number',
+                optional: true,
+                default: 3
+            },
+            {
+                label: 'Additional input',
+                name: 'additionalInput',
+                type: 'json',
+                default: JSON.stringify({}),
+                description:
+                    'For additional input options for the crawler see documentation.',
+                optional: true
+            },
+            {
+                label: 'Text Splitter',
+                name: 'textSplitter',
+                type: 'TextSplitter',
+                optional: true
+            }
+        ]
+        this.credential = {
+            label: 'Connect Apify API',
+            name: 'credential',
+            type: 'credential',
+            credentialNames: ['apifyApi']
+        }
+    }
+
+    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
+        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
+
+        // Get input options and merge with additional input
+        const urls = nodeData.inputs?.urls as string
+        const crawlerType = nodeData.inputs?.crawlerType as string
+        const maxCrawlDepth = nodeData.inputs?.maxCrawlDepth as string
+        const maxCrawlPages = nodeData.inputs?.maxCrawlPages as string
+        const additionalInput =
+            typeof nodeData.inputs?.additionalInput === 'object'
+                ? nodeData.inputs?.additionalInput
+                : JSON.parse(nodeData.inputs?.additionalInput as string)
+        const input = {
+            startUrls: urls.split(',').map((url) => ({ url: url.trim() })),
+            crawlerType,
+            maxCrawlDepth: parseInt(maxCrawlDepth, 10),
+            maxCrawlPages: parseInt(maxCrawlPages, 10),
+            ...additionalInput
+        }
+
+        // Get Apify API token from credential data
+        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
+        const apifyApiToken = getCredentialParam('apifyApiToken', credentialData, nodeData)
+
+        const loader = await ApifyDatasetLoader.fromActorCall('apify/website-content-crawler', input, {
+            datasetMappingFunction: (item) =>
+                new Document({
+                    pageContent: (item.text || '') as string,
+                    metadata: { source: item.url }
+                }),
+            clientOptions: {
+                token: apifyApiToken
+            }
+        })
+
+        return textSplitter ? loader.loadAndSplit(textSplitter) : loader.load()
+    }
+}
+
+module.exports = { nodeClass: ApifyWebsiteContentCrawler_DocumentLoaders }
diff --git a/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/apify-symbol-transparent.svg b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/apify-symbol-transparent.svg
new file mode 100644
index 000000000..423a3328d
--- /dev/null
+++ b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/apify-symbol-transparent.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/packages/components/package.json b/packages/components/package.json
index 7f55010d7..6a9fc437f 100644
--- a/packages/components/package.json
+++ b/packages/components/package.json
@@ -26,6 +26,7 @@
         "@qdrant/js-client-rest": "^1.2.2",
         "@supabase/supabase-js": "^2.29.0",
         "@types/js-yaml": "^4.0.5",
+        "apify-client": "^2.7.1",
         "@types/jsdom": "^21.1.1",
         "axios": "^0.27.2",
         "cheerio": "^1.0.0-rc.12",
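
For reviewers who want to exercise the loader outside of Flowise, below is a minimal standalone sketch of the same `ApifyDatasetLoader.fromActorCall` invocation that `init()` performs, using the node's default input values. The `APIFY_API_TOKEN` environment variable and the `console.log` at the end are illustrative assumptions, not part of this PR.

```ts
import { ApifyDatasetLoader } from 'langchain/document_loaders/web/apify_dataset'
import { Document } from 'langchain/document'

async function main() {
    // Call the same actor the node uses, with the node's default input values.
    const loader = await ApifyDatasetLoader.fromActorCall(
        'apify/website-content-crawler',
        {
            startUrls: [{ url: 'https://js.langchain.com/docs/' }],
            crawlerType: 'playwright:firefox',
            maxCrawlDepth: 1,
            maxCrawlPages: 3
        },
        {
            // Map each crawled dataset item to a LangChain Document, as the node does.
            datasetMappingFunction: (item) =>
                new Document({
                    pageContent: (item.text || '') as string,
                    metadata: { source: item.url }
                }),
            // Assumes APIFY_API_TOKEN holds a valid Apify API token.
            clientOptions: { token: process.env.APIFY_API_TOKEN }
        }
    )

    const docs = await loader.load()
    console.log(`Loaded ${docs.length} documents`)
}

main().catch(console.error)
```

Note also that because `additionalInput` is spread last when `init()` builds the actor input, any keys supplied there override the explicit UI fields (`crawlerType`, `maxCrawlDepth`, `maxCrawlPages`), effectively giving users access to the crawler's full input schema.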