diff --git a/packages/components/nodes/documentloaders/Playwright/Playwright.ts b/packages/components/nodes/documentloaders/Playwright/Playwright.ts index 6b7790af1..6e22d55d3 100644 --- a/packages/components/nodes/documentloaders/Playwright/Playwright.ts +++ b/packages/components/nodes/documentloaders/Playwright/Playwright.ts @@ -2,7 +2,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface' import { TextSplitter } from 'langchain/text_splitter' import { PlaywrightWebBaseLoader } from 'langchain/document_loaders/web/playwright' import { test } from 'linkifyjs' -import { getAvailableURLs } from '../../../src' +import { webCrawl } from '../../../src' class Playwright_DocumentLoaders implements INode { label: string @@ -35,19 +35,20 @@ class Playwright_DocumentLoaders implements INode { optional: true }, { - label: 'Web Scrap for Relative Links', - name: 'webScrap', + label: 'Web Crawl for Relative Links', + name: 'boolWebCrawl', type: 'boolean', optional: true, additionalParams: true }, { - label: 'Web Scrap Links Limit', + label: 'Web Crawl Links Limit', name: 'limit', type: 'number', default: 10, optional: true, - additionalParams: true + additionalParams: true, + description: 'Set 0 to crawl all relative links' }, { label: 'Metadata', @@ -62,7 +63,7 @@ class Playwright_DocumentLoaders implements INode { async init(nodeData: INodeData): Promise<any> { const textSplitter = nodeData.inputs?.textSplitter as TextSplitter const metadata = nodeData.inputs?.metadata - const webScrap = nodeData.inputs?.webScrap as boolean + const boolWebCrawl = nodeData.inputs?.boolWebCrawl as boolean let limit = nodeData.inputs?.limit as string let url = nodeData.inputs?.url as string @@ -71,25 +72,32 @@ class Playwright_DocumentLoaders implements INode { throw new Error('Invalid URL') } - const playwrightLoader = async (url: string): Promise<any> => { - let docs = [] - const loader = new PlaywrightWebBaseLoader(url) - if (textSplitter) { - docs = await loader.loadAndSplit(textSplitter) - } else { - docs = await
loader.loadAndSplit(textSplitter) - } else { - docs = await loader.load() + async function playwrightLoader(url: string): Promise<any> { + try { + let docs = [] + const loader = new PlaywrightWebBaseLoader(url) + if (textSplitter) { + docs = await loader.loadAndSplit(textSplitter) + } else { + docs = await loader.load() + } + return docs + } catch (err) { + if (process.env.DEBUG === 'true') console.error(`error in PlaywrightWebBaseLoader: ${err.message}, on page: ${url}`); return [] } - return docs } - let availableUrls: string[] let docs = [] - if (webScrap) { - if (!limit) limit = '10' - availableUrls = await getAvailableURLs(url, parseInt(limit)) - for (let i = 0; i < availableUrls.length; i++) { - docs.push(...(await playwrightLoader(availableUrls[i]))) + if (boolWebCrawl) { + if (process.env.DEBUG === 'true') console.info('Start Web Crawl') + if (!limit) throw new Error('Please set a limit to crawl') + else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0') + const pages: string[] = await webCrawl(url, parseInt(limit)) + if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`) + for (const page of pages) { + docs.push(...(await playwrightLoader(page))) } + if (process.env.DEBUG === 'true') console.info('Finish Web Crawl') } else { docs = await playwrightLoader(url) }