From 2c0a8723f941fba6d956f5d5c4a5e89fd11db80b Mon Sep 17 00:00:00 2001 From: drobnikj Date: Fri, 23 Jun 2023 13:41:59 +0200 Subject: [PATCH 1/5] feat: add document loader for Apify Website Content Crawler --- .../ApifyWebsiteContentCrawler.ts | 68 +++++++++++++++++++ .../apify-symbol-transparent.svg | 1 + packages/components/package.json | 1 + 3 files changed, 70 insertions(+) create mode 100644 packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts create mode 100644 packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/apify-symbol-transparent.svg diff --git a/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts new file mode 100644 index 000000000..f292ada3c --- /dev/null +++ b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts @@ -0,0 +1,68 @@ +import { INode, INodeData, INodeParams } from '../../../src/Interface' +import { TextSplitter } from 'langchain/text_splitter' +import { ApifyDatasetLoader } from 'langchain/document_loaders/web/apify_dataset' +import { Document } from 'langchain/document' + +class ApifyWebsiteContentCrawler_DocumentLoaders implements INode { + label: string + name: string + description: string + type: string + icon: string + category: string + baseClasses: string[] + inputs: INodeParams[] + + constructor() { + this.label = 'Apify Website Content Crawler' + this.name = 'apifyWebsiteContentCrawler' + this.type = 'Document' + this.icon = 'apify-symbol-transparent.svg' + this.category = 'Document Loaders' + this.description = 'Load data from Apify Website Content Crawler' + this.baseClasses = [this.type] + this.inputs = [ + { + label: 'Apify API Token', + name: 'apifyApiToken', + type: 'password' + }, + { + label: 'Input', + name: 'input', + type: 'json', + default: JSON.stringify({ + startUrls: [{ url: 'https://js.langchain.com/docs/' }], + maxCrawlPages: 1 + }) + }, + { + label: 'Text Splitter', + name: 'textSplitter', + type: 'TextSplitter', + optional: true + } + ] + } + + async init(nodeData: INodeData): Promise { + const textSplitter = nodeData.inputs?.textSplitter as TextSplitter + const apifyApiToken = nodeData.inputs?.apifyApiToken as string + const input = typeof nodeData.inputs?.input === 'object' ? nodeData.inputs?.input : JSON.parse(nodeData.inputs?.input as string) + + const loader = await ApifyDatasetLoader.fromActorCall('apify/website-content-crawler', input, { + datasetMappingFunction: (item) => + new Document({ + pageContent: (item.text || '') as string, + metadata: { source: item.url } + }), + clientOptions: { + token: apifyApiToken + } + }) + + return textSplitter ? loader.loadAndSplit(textSplitter) : loader.load() + } +} + +module.exports = { nodeClass: ApifyWebsiteContentCrawler_DocumentLoaders } diff --git a/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/apify-symbol-transparent.svg b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/apify-symbol-transparent.svg new file mode 100644 index 000000000..423a3328d --- /dev/null +++ b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/apify-symbol-transparent.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/packages/components/package.json b/packages/components/package.json index e5e0ba008..bc55fb70b 100644 --- a/packages/components/package.json +++ b/packages/components/package.json @@ -22,6 +22,7 @@ "@pinecone-database/pinecone": "^0.0.12", "@supabase/supabase-js": "^2.21.0", "@types/js-yaml": "^4.0.5", + "apify-client": "^2.7.1", "axios": "^0.27.2", "cheerio": "^1.0.0-rc.12", "chromadb": "^1.4.2", From 3aa301119b578652f79c740dbadd39295800cea2 Mon Sep 17 00:00:00 2001 From: drobnikj Date: Tue, 1 Aug 2023 10:22:51 +0200 Subject: [PATCH 2/5] feat: add apifyApiToken credentials v1 --- packages/components/credentials/ApifyApi.ts | 26 +++++++++++++++++++ .../ApifyWebsiteContentCrawler.ts | 24 +++++++++++------ 2 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 packages/components/credentials/ApifyApi.ts diff --git a/packages/components/credentials/ApifyApi.ts b/packages/components/credentials/ApifyApi.ts new file mode 100644 index 000000000..d3e7e870a --- /dev/null +++ b/packages/components/credentials/ApifyApi.ts @@ -0,0 +1,26 @@ +import { INodeParams, INodeCredential } from '../src/Interface' + +class ApifyApi implements INodeCredential { + label: string + name: string + version: number + description: string + inputs: INodeParams[] + + constructor() { + this.label = 'Apify API' + this.name = 'apifyApi' + this.version = 1.0 + this.description = + 'You can find the Apify API token on your Apify account page.' + this.inputs = [ + { + label: 'Apify API', + name: 'apifyApiToken', + type: 'password' + } + ] + } +} + +module.exports = { credClass: ApifyApi } diff --git a/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts index f292ada3c..1cf826cf7 100644 --- a/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts +++ b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts @@ -1,4 +1,5 @@ -import { INode, INodeData, INodeParams } from '../../../src/Interface' +import { INode, INodeData, INodeParams, ICommonObject } from '../../../src/Interface' +import { getCredentialData, getCredentialParam } from '../../../src/utils' import { TextSplitter } from 'langchain/text_splitter' import { ApifyDatasetLoader } from 'langchain/document_loaders/web/apify_dataset' import { Document } from 'langchain/document' @@ -9,24 +10,22 @@ class ApifyWebsiteContentCrawler_DocumentLoaders implements INode { description: string type: string icon: string + version: number category: string baseClasses: string[] inputs: INodeParams[] + credential: INodeParams constructor() { this.label = 'Apify Website Content Crawler' this.name = 'apifyWebsiteContentCrawler' this.type = 'Document' this.icon = 'apify-symbol-transparent.svg' + this.version = 1.0 this.category = 'Document Loaders' this.description = 'Load data from Apify Website Content Crawler' this.baseClasses = [this.type] this.inputs = [ - { - label: 'Apify API Token', - name: 'apifyApiToken', - type: 'password' - }, { label: 'Input', name: 'input', @@ -43,13 +42,22 @@ class ApifyWebsiteContentCrawler_DocumentLoaders implements INode { optional: true } ] + this.credential = { + label: 'Connect Apify API', + name: 'credential', + type: 'credential', + credentialNames: ['apifyApi'] + } } - async init(nodeData: INodeData): Promise { + async init(nodeData: INodeData, _: string, options: ICommonObject): Promise { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter - const apifyApiToken = nodeData.inputs?.apifyApiToken as string const input = typeof nodeData.inputs?.input === 'object' ? nodeData.inputs?.input : JSON.parse(nodeData.inputs?.input as string) + // Get Apify API token from credential data + const credentialData = await getCredentialData(nodeData.credential ?? '', options) + const apifyApiToken = getCredentialParam('apifyApiToken', credentialData, nodeData) + const loader = await ApifyDatasetLoader.fromActorCall('apify/website-content-crawler', input, { datasetMappingFunction: (item) => new Document({ From 5146f6bde375814d54eb6cbaf9ede7cc080a2c6c Mon Sep 17 00:00:00 2001 From: drobnikj Date: Tue, 1 Aug 2023 11:19:57 +0200 Subject: [PATCH 3/5] feat: improve apify content crawler input --- .../ApifyWebsiteContentCrawler.ts | 77 +++++++++++++++++-- 1 file changed, 70 insertions(+), 7 deletions(-) diff --git a/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts index 1cf826cf7..9fd0764ca 100644 --- a/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts +++ b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts @@ -27,13 +27,60 @@ class ApifyWebsiteContentCrawler_DocumentLoaders implements INode { this.baseClasses = [this.type] this.inputs = [ { - label: 'Input', - name: 'input', + label: 'Start URLs', + name: 'urls', + type: 'string', + description: 'One or more URLs of pages where the crawler will start, separated by commas.', + placeholder: 'https://js.langchain.com/docs/' + }, + { + label: 'Crawler type', + type: 'options', + name: 'crawlerType', + options: [ + { + label: 'Headless web browser (Chrome+Playwright)', + name: 'playwright:chrome' + }, + { + label: 'Stealthy web browser (Firefox+Playwright)', + name: 'playwright:firefox' + }, + { + label: 'Raw HTTP client (Cheerio)', + name: 'cheerio' + }, + { + label: 'Raw HTTP client with JavaScript execution (JSDOM) [experimental]', + name: 'jsdom' + } + ], + description: + 'Select the crawling engine, see documentation for additional information.', + default: 'playwright:firefox' + }, + { + label: 'Max crawling depth', + name: 'maxCrawlDepth', + type: 'number', + optional: true, + default: 1 + }, + { + label: 'Max crawl pages', + name: 'maxCrawlPages', + type: 'number', + optional: true, + default: 3 + }, + { + label: 'Additional input', + name: 'additionalInput', type: 'json', - default: JSON.stringify({ - startUrls: [{ url: 'https://js.langchain.com/docs/' }], - maxCrawlPages: 1 - }) + default: JSON.stringify({}), + description: + 'For additional input options for the crawler see documentation.', + optional: true }, { label: 'Text Splitter', @@ -52,7 +99,23 @@ class ApifyWebsiteContentCrawler_DocumentLoaders implements INode { async init(nodeData: INodeData, _: string, options: ICommonObject): Promise { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter - const input = typeof nodeData.inputs?.input === 'object' ? nodeData.inputs?.input : JSON.parse(nodeData.inputs?.input as string) + + // Get input options and merge with additional input + const urls = nodeData.inputs?.urls as string + const crawlerType = nodeData.inputs?.crawlerType as string + const maxCrawlDepth = nodeData.inputs?.maxCrawlDepth as number + const maxCrawlPages = nodeData.inputs?.maxCrawlPages as number + const additionalInput = + typeof nodeData.inputs?.additionalInput === 'object' + ? nodeData.inputs?.additionalInput + : JSON.parse(nodeData.inputs?.additionalInput as string) + const input = { + startUrls: urls.split(',').map((url) => ({ url: url.trim() })), + crawlerType, + maxCrawlDepth, + maxCrawlPages, + ...additionalInput + } // Get Apify API token from credential data const credentialData = await getCredentialData(nodeData.credential ?? '', options) From e09bad04403441f272ed0780e839b2bc1922dc28 Mon Sep 17 00:00:00 2001 From: drobnikj Date: Mon, 7 Aug 2023 16:57:44 +0200 Subject: [PATCH 4/5] fix: account link --- packages/components/credentials/ApifyApi.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/components/credentials/ApifyApi.ts b/packages/components/credentials/ApifyApi.ts index d3e7e870a..c7e7322ad 100644 --- a/packages/components/credentials/ApifyApi.ts +++ b/packages/components/credentials/ApifyApi.ts @@ -12,7 +12,7 @@ class ApifyApi implements INodeCredential { this.name = 'apifyApi' this.version = 1.0 this.description = - 'You can find the Apify API token on your Apify account page.' + 'You can find the Apify API token on your Apify account page.' this.inputs = [ { label: 'Apify API', From 83d8e96f9c94f4374a6dcdc45a78e28183b64188 Mon Sep 17 00:00:00 2001 From: drobnikj Date: Wed, 9 Aug 2023 09:49:39 +0200 Subject: [PATCH 5/5] fix: fix credentials and parsing of numbers --- .../credentials/{ApifyApi.ts => ApifyApi.credential.ts} | 4 ++-- .../ApifyWebsiteContentCrawler.ts | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) rename packages/components/credentials/{ApifyApi.ts => ApifyApi.credential.ts} (85%) diff --git a/packages/components/credentials/ApifyApi.ts b/packages/components/credentials/ApifyApi.credential.ts similarity index 85% rename from packages/components/credentials/ApifyApi.ts rename to packages/components/credentials/ApifyApi.credential.ts index c7e7322ad..c961fd385 100644 --- a/packages/components/credentials/ApifyApi.ts +++ b/packages/components/credentials/ApifyApi.credential.ts @@ -1,6 +1,6 @@ import { INodeParams, INodeCredential } from '../src/Interface' -class ApifyApi implements INodeCredential { +class ApifyApiCredential implements INodeCredential { label: string name: string version: number @@ -23,4 +23,4 @@ class ApifyApi implements INodeCredential { } } -module.exports = { credClass: ApifyApi } +module.exports = { credClass: ApifyApiCredential } diff --git a/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts index 9fd0764ca..a5e6a6e03 100644 --- a/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts +++ b/packages/components/nodes/documentloaders/ApifyWebsiteContentCrawler/ApifyWebsiteContentCrawler.ts @@ -103,8 +103,8 @@ class ApifyWebsiteContentCrawler_DocumentLoaders implements INode { // Get input options and merge with additional input const urls = nodeData.inputs?.urls as string const crawlerType = nodeData.inputs?.crawlerType as string - const maxCrawlDepth = nodeData.inputs?.maxCrawlDepth as number - const maxCrawlPages = nodeData.inputs?.maxCrawlPages as number + const maxCrawlDepth = nodeData.inputs?.maxCrawlDepth as string + const maxCrawlPages = nodeData.inputs?.maxCrawlPages as string const additionalInput = typeof nodeData.inputs?.additionalInput === 'object' ? nodeData.inputs?.additionalInput @@ -112,8 +112,8 @@ class ApifyWebsiteContentCrawler_DocumentLoaders implements INode { const input = { startUrls: urls.split(',').map((url) => ({ url: url.trim() })), crawlerType, - maxCrawlDepth, - maxCrawlPages, + maxCrawlDepth: parseInt(maxCrawlDepth, 10), + maxCrawlPages: parseInt(maxCrawlPages, 10), ...additionalInput }