Add web scraping for relative links
This commit is contained in:
parent
9f53111aac
commit
77d405e755
|
|
@ -2,6 +2,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface'
|
|||
import { TextSplitter } from 'langchain/text_splitter'
|
||||
import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio'
|
||||
import { test } from 'linkifyjs'
|
||||
import { getAvailableURLs } from '../../../src'
|
||||
class Cheerio_DocumentLoaders implements INode {
|
||||
label: string
|
||||
name: string
|
||||
|
|
@ -38,6 +39,21 @@ class Cheerio_DocumentLoaders implements INode {
|
|||
type: 'json',
|
||||
optional: true,
|
||||
additionalParams: true
|
||||
},
|
||||
{
|
||||
label: 'Web Scrap for Relative Links',
|
||||
name: 'webScrap',
|
||||
type: 'boolean',
|
||||
optional: true,
|
||||
additionalParams: true
|
||||
},
|
||||
{
|
||||
label: 'Web Scrap Links Limit',
|
||||
name: 'limit',
|
||||
type: 'number',
|
||||
default: 10,
|
||||
optional: true,
|
||||
additionalParams: true
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -45,6 +61,8 @@ class Cheerio_DocumentLoaders implements INode {
|
|||
async init(nodeData: INodeData): Promise<any> {
|
||||
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
|
||||
const metadata = nodeData.inputs?.metadata
|
||||
const webScrap = nodeData.inputs?.webScrap as boolean
|
||||
let limit = nodeData.inputs?.limit as string
|
||||
|
||||
let url = nodeData.inputs?.url as string
|
||||
url = url.trim()
|
||||
|
|
@ -52,13 +70,27 @@ class Cheerio_DocumentLoaders implements INode {
|
|||
throw new Error('Invalid URL')
|
||||
}
|
||||
|
||||
const loader = new CheerioWebBaseLoader(url)
|
||||
let docs = []
|
||||
const cheerioLoader = async (url: string): Promise<any> => {
|
||||
let docs = []
|
||||
const loader = new CheerioWebBaseLoader(url)
|
||||
if (textSplitter) {
|
||||
docs = await loader.loadAndSplit(textSplitter)
|
||||
} else {
|
||||
docs = await loader.load()
|
||||
}
|
||||
return docs
|
||||
}
|
||||
|
||||
if (textSplitter) {
|
||||
docs = await loader.loadAndSplit(textSplitter)
|
||||
let availableUrls: string[]
|
||||
let docs = []
|
||||
if (webScrap) {
|
||||
if (!limit) limit = "10"
|
||||
availableUrls = await getAvailableURLs(url, parseInt(limit))
|
||||
for (let i = 0; i < availableUrls.length; i++) {
|
||||
docs.push(...(await cheerioLoader(availableUrls[i])))
|
||||
}
|
||||
} else {
|
||||
docs = await loader.load()
|
||||
docs = await cheerioLoader(url)
|
||||
}
|
||||
|
||||
if (metadata) {
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
import axios from 'axios'
|
||||
import { load } from 'cheerio'
|
||||
import * as fs from 'fs'
|
||||
import * as path from 'path'
|
||||
|
||||
|
|
@ -173,3 +175,42 @@ export const getBlob = (fileBase64Str: string) => {
|
|||
}
|
||||
return bufferArray
|
||||
}
|
||||
|
||||
export const getAvailableURLs = async (url: string, limit: number) => {
|
||||
try {
|
||||
const availableUrls: string[] = []
|
||||
|
||||
console.info(`Crawling: ${url}`)
|
||||
availableUrls.push(url)
|
||||
|
||||
const response = await axios.get(url)
|
||||
const $ = load(response.data)
|
||||
|
||||
const relativeLinks = $("a[href^='/']")
|
||||
console.info(`Available Relative Links: ${relativeLinks.length}`)
|
||||
if (relativeLinks.length === 0) return availableUrls
|
||||
|
||||
limit = Math.min(limit + 1, relativeLinks.length) // limit + 1 is because index start from 0 and index 0 is occupy by url
|
||||
console.info(`True Limit: ${limit}`)
|
||||
|
||||
// availableUrls.length cannot exceed limit
|
||||
for (let i = 0; availableUrls.length < limit; i++) {
|
||||
if (i === limit) break // some links are repetitive so it won't added into the array which cause the length to be lesser
|
||||
console.info(`index: ${i}`)
|
||||
const element = relativeLinks[i]
|
||||
|
||||
const relativeUrl = $(element).attr('href')
|
||||
if (!relativeUrl) continue
|
||||
|
||||
const absoluteUrl = new URL(relativeUrl, url).toString()
|
||||
if (!availableUrls.includes(absoluteUrl)) {
|
||||
availableUrls.push(absoluteUrl)
|
||||
console.info(`Found unique relative link: ${absoluteUrl}`)
|
||||
}
|
||||
}
|
||||
|
||||
return availableUrls
|
||||
} catch (err) {
|
||||
throw new Error(`getAvailableURLs: ${err?.message}`)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue