diff --git a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts
index 20096ec69..bfb6abdbd 100644
--- a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts
+++ b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts
@@ -2,6 +2,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface'
 import { TextSplitter } from 'langchain/text_splitter'
 import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio'
 import { test } from 'linkifyjs'
+import { getAvailableURLs } from '../../../src'
 class Cheerio_DocumentLoaders implements INode {
     label: string
     name: string
@@ -38,6 +39,21 @@ class Cheerio_DocumentLoaders implements INode {
                 type: 'json',
                 optional: true,
                 additionalParams: true
+            },
+            {
+                label: 'Web Scrape for Relative Links',
+                name: 'webScrap',
+                type: 'boolean',
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: 'Web Scrape Links Limit',
+                name: 'limit',
+                type: 'number',
+                default: 10,
+                optional: true,
+                additionalParams: true
             }
         ]
     }
@@ -45,6 +61,8 @@ class Cheerio_DocumentLoaders implements INode {
     async init(nodeData: INodeData): Promise<any> {
         const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
         const metadata = nodeData.inputs?.metadata
+        const webScrap = nodeData.inputs?.webScrap as boolean
+        let limit = nodeData.inputs?.limit as string
         let url = nodeData.inputs?.url as string
         url = url.trim()

@@ -52,13 +70,27 @@ class Cheerio_DocumentLoaders implements INode {
             throw new Error('Invalid URL')
         }

-        const loader = new CheerioWebBaseLoader(url)
-        let docs = []
+        const cheerioLoader = async (url: string): Promise<any> => {
+            let docs = []
+            const loader = new CheerioWebBaseLoader(url)
+            if (textSplitter) {
+                docs = await loader.loadAndSplit(textSplitter)
+            } else {
+                docs = await loader.load()
+            }
+            return docs
+        }

-        if (textSplitter) {
-            docs = await loader.loadAndSplit(textSplitter)
+        let availableUrls: string[]
+        let docs = []
+        if (webScrap) {
+            if (!limit) limit = '10'
+            availableUrls = await getAvailableURLs(url, parseInt(limit))
+            for (let i = 0; i < availableUrls.length; i++) {
+                docs.push(...(await cheerioLoader(availableUrls[i])))
+            }
         } else {
-            docs = await loader.load()
+            docs = await cheerioLoader(url)
         }

         if (metadata) {
diff --git a/packages/components/src/utils.ts b/packages/components/src/utils.ts
index a2a415254..debb4903d 100644
--- a/packages/components/src/utils.ts
+++ b/packages/components/src/utils.ts
@@ -1,3 +1,5 @@
+import axios from 'axios'
+import { load } from 'cheerio'
 import * as fs from 'fs'
 import * as path from 'path'

@@ -173,3 +175,42 @@ export const getBlob = (fileBase64Str: string) => {
     }
     return bufferArray
 }
+
+export const getAvailableURLs = async (url: string, limit: number) => {
+    try {
+        const availableUrls: string[] = []
+
+        console.info(`Crawling: ${url}`)
+        availableUrls.push(url)
+
+        const response = await axios.get(url)
+        const $ = load(response.data)
+
+        const relativeLinks = $("a[href^='/']")
+        console.info(`Available Relative Links: ${relativeLinks.length}`)
+        if (relativeLinks.length === 0) return availableUrls
+
+        limit = Math.min(limit + 1, relativeLinks.length) // limit + 1 because index 0 is already occupied by the starting url
+        console.info(`True Limit: ${limit}`)
+
+        // availableUrls.length cannot exceed limit
+        for (let i = 0; availableUrls.length < limit; i++) {
+            if (i === limit) break // duplicate links are skipped, so fewer than limit unique links may be found
+            console.info(`index: ${i}`)
+            const element = relativeLinks[i]
+
+            const relativeUrl = $(element).attr('href')
+            if (!relativeUrl) continue
+
+            const absoluteUrl = new URL(relativeUrl, url).toString()
+            if (!availableUrls.includes(absoluteUrl)) {
+                availableUrls.push(absoluteUrl)
+                console.info(`Found unique relative link: ${absoluteUrl}`)
+            }
+        }
+
+        return availableUrls
+    } catch (err) {
+        throw new Error(`getAvailableURLs: ${err?.message}`)
+    }
+}
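
A minimal usage sketch of the new helper, assuming axios and cheerio are installed; the import path is illustrative and may differ in your checkout:

    import { getAvailableURLs } from './packages/components/src/utils'

    // Crawl the starting page and collect it plus up to 10 unique relative
    // links, each resolved to an absolute URL against the starting url.
    getAvailableURLs('https://example.com', 10)
        .then((urls) => console.log(urls))
        .catch((err) => console.error(err.message))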