Merge pull request #109 from FlowiseAI/feature/GetRelativeLinks

Ong Chung Yau 2023-05-16 06:38:34 +07:00 committed by GitHub
commit 1d9181f099
2 changed files with 78 additions and 5 deletions

View File

@@ -2,6 +2,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio'
import { test } from 'linkifyjs'
import { getAvailableURLs } from '../../../src'
class Cheerio_DocumentLoaders implements INode {
label: string
name: string
@@ -38,6 +39,21 @@ class Cheerio_DocumentLoaders implements INode {
type: 'json',
optional: true,
additionalParams: true
},
{
label: 'Web Scrape for Relative Links',
name: 'webScrap',
type: 'boolean',
optional: true,
additionalParams: true
},
{
label: 'Web Scrape Links Limit',
name: 'limit',
type: 'number',
default: 10,
optional: true,
additionalParams: true
}
]
}
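
The two new parameters reach init() through nodeData.inputs as a toggle and a cap. A minimal sketch of the payload the node would receive once a user sets both (the shape is illustrative; the `as string` cast and parseInt() in the next hunk suggest number inputs arrive as strings):

const nodeData = {
    inputs: {
        url: 'https://example.com',
        webScrap: true, // enables crawling of relative links found on the page
        limit: '25' // arrives as a string; init() parses it with parseInt()
    }
}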
@@ -45,6 +61,8 @@ class Cheerio_DocumentLoaders implements INode {
async init(nodeData: INodeData): Promise<any> {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const webScrap = nodeData.inputs?.webScrap as boolean
let limit = nodeData.inputs?.limit as string
let url = nodeData.inputs?.url as string
url = url.trim()
@@ -52,14 +70,28 @@ class Cheerio_DocumentLoaders implements INode {
throw new Error('Invalid URL')
}
const loader = new CheerioWebBaseLoader(url)
const cheerioLoader = async (url: string): Promise<any> => {
let docs = []
const loader = new CheerioWebBaseLoader(url)
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
}
return docs
}
let availableUrls: string[]
let docs = []
if (webScrap) {
if (!limit) limit = '10'
availableUrls = await getAvailableURLs(url, parseInt(limit))
for (let i = 0; i < availableUrls.length; i++) {
docs.push(...(await cheerioLoader(availableUrls[i])))
}
} else {
docs = await cheerioLoader(url)
}
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
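
The webScrap branch above composes the two pieces: discover URLs first, then run the Cheerio loader over each one. A condensed, hedged restatement of that flow outside the node wrapper (loadAllDocs is a hypothetical name; the imports mirror the ones in this file):

import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio'
import { getAvailableURLs } from '../../../src'

// Hypothetical driver mirroring the webScrap branch of init():
// crawl the root page for relative links, then load every page found.
const loadAllDocs = async (rootUrl: string, limit = 10) => {
    const docs = []
    for (const u of await getAvailableURLs(rootUrl, limit)) {
        const loader = new CheerioWebBaseLoader(u)
        docs.push(...(await loader.load()))
    }
    return docs
}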

View File

@@ -1,3 +1,5 @@
import axios from 'axios'
import { load } from 'cheerio'
import * as fs from 'fs'
import * as path from 'path'
@@ -173,3 +175,42 @@ export const getBlob = (fileBase64Str: string) => {
}
return bufferArray
}
export const getAvailableURLs = async (url: string, limit: number) => {
try {
const availableUrls: string[] = []
console.info(`Crawling: ${url}`)
availableUrls.push(url)
const response = await axios.get(url)
const $ = load(response.data)
const relativeLinks = $("a[href^='/']")
console.info(`Available Relative Links: ${relativeLinks.length}`)
if (relativeLinks.length === 0) return availableUrls
limit = Math.min(limit + 1, relativeLinks.length) // limit + 1 because availableUrls is already seeded with the root url at index 0
console.info(`True Limit: ${limit}`)
// availableUrls.length cannot exceed limit
for (let i = 0; availableUrls.length < limit; i++) {
if (i === limit) break // duplicate links are skipped, so availableUrls.length may never reach limit; stop once limit candidates have been scanned
console.info(`index: ${i}`)
const element = relativeLinks[i]
const relativeUrl = $(element).attr('href')
if (!relativeUrl) continue
const absoluteUrl = new URL(relativeUrl, url).toString()
if (!availableUrls.includes(absoluteUrl)) {
availableUrls.push(absoluteUrl)
console.info(`Found unique relative link: ${absoluteUrl}`)
}
}
return availableUrls
} catch (err) {
throw new Error(`getAvailableURLs: ${err?.message}`)
}
}
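
A short usage sketch of the new helper (the URL is a placeholder): the root url always comes back first, followed by up to limit unique relative links resolved against it.

;(async () => {
    // Placeholder URL; any page containing same-site relative links works.
    const urls = await getAvailableURLs('https://example.com', 10)
    // urls[0] is the root url; at most 10 unique relative links follow.
    console.info(urls)
})()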