add web scrape for relative links
This commit is contained in:
parent 9f53111aac
commit 77d405e755
@@ -2,6 +2,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface'
 import { TextSplitter } from 'langchain/text_splitter'
 import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio'
 import { test } from 'linkifyjs'
+import { getAvailableURLs } from '../../../src'

 class Cheerio_DocumentLoaders implements INode {
     label: string
     name: string
@@ -38,6 +39,21 @@ class Cheerio_DocumentLoaders implements INode {
                 type: 'json',
                 optional: true,
                 additionalParams: true
+            },
+            {
+                label: 'Web Scrap for Relative Links',
+                name: 'webScrap',
+                type: 'boolean',
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: 'Web Scrap Links Limit',
+                name: 'limit',
+                type: 'number',
+                default: 10,
+                optional: true,
+                additionalParams: true
             }
         ]
     }
@@ -45,6 +61,8 @@ class Cheerio_DocumentLoaders implements INode {
     async init(nodeData: INodeData): Promise<any> {
         const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
         const metadata = nodeData.inputs?.metadata
+        const webScrap = nodeData.inputs?.webScrap as boolean
+        let limit = nodeData.inputs?.limit as string

         let url = nodeData.inputs?.url as string
         url = url.trim()
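One subtlety worth flagging: the new param is declared type: 'number' in the UI, yet init() reads it as a string and only parseInt()s it further down. A self-contained sketch of that fallback behavior (resolveLimit is a hypothetical helper for illustration, not part of the commit):

// hypothetical standalone version of the limit handling in this commit
function resolveLimit(raw?: string): number {
    const limit = raw || '10' // the '10' fallback mirrors the UI default
    return parseInt(limit) // NaN only if the user typed a non-numeric value
}

console.info(resolveLimit()) // 10 (field left blank)
console.info(resolveLimit('5')) // 5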
@@ -52,14 +70,28 @@ class Cheerio_DocumentLoaders implements INode {
             throw new Error('Invalid URL')
         }

-        const loader = new CheerioWebBaseLoader(url)
+        const cheerioLoader = async (url: string): Promise<any> => {
             let docs = []
+            const loader = new CheerioWebBaseLoader(url)
             if (textSplitter) {
                 docs = await loader.loadAndSplit(textSplitter)
             } else {
                 docs = await loader.load()
             }
+            return docs
+        }
+
+        let availableUrls: string[]
+        let docs = []
+        if (webScrap) {
+            if (!limit) limit = "10"
+            availableUrls = await getAvailableURLs(url, parseInt(limit))
+            for (let i = 0; i < availableUrls.length; i++) {
+                docs.push(...(await cheerioLoader(availableUrls[i])))
+            }
+        } else {
+            docs = await cheerioLoader(url)
+        }

         if (metadata) {
             const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
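End to end, init() now either loads the single url or fans out over every URL the crawler returns. A minimal usage sketch under assumptions the diff does not confirm: Cheerio_DocumentLoaders being exported from its module, and a hand-built INodeData literal standing in for the real Flowise UI payload.

import { INodeData } from '../../../src/Interface'
import { Cheerio_DocumentLoaders } from './Cheerio' // hypothetical export path; the diff does not show how the class is exposed

async function main() {
    // hand-built node data; in Flowise this arrives from the canvas UI
    const nodeData = {
        inputs: {
            url: 'https://example.com',
            webScrap: true, // crawl relative links instead of loading only this page
            limit: '5' // cap the crawl at 5 links; falls back to '10' when empty
        }
    } as unknown as INodeData

    const node = new Cheerio_DocumentLoaders()
    const docs = await node.init(nodeData)
    console.info(`Loaded ${docs.length} documents across crawled pages`)
}

main()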
@@ -1,3 +1,5 @@
+import axios from 'axios'
+import { load } from 'cheerio'
 import * as fs from 'fs'
 import * as path from 'path'
@@ -173,3 +175,42 @@ export const getBlob = (fileBase64Str: string) => {
     }
     return bufferArray
 }
+
+export const getAvailableURLs = async (url: string, limit: number) => {
+    try {
+        const availableUrls: string[] = []
+
+        console.info(`Crawling: ${url}`)
+        availableUrls.push(url)
+
+        const response = await axios.get(url)
+        const $ = load(response.data)
+
+        const relativeLinks = $("a[href^='/']")
+        console.info(`Available Relative Links: ${relativeLinks.length}`)
+        if (relativeLinks.length === 0) return availableUrls
+
+        limit = Math.min(limit + 1, relativeLinks.length) // limit + 1 because index 0 is already occupied by the root url
+        console.info(`True Limit: ${limit}`)
+
+        // availableUrls.length cannot exceed limit
+        for (let i = 0; availableUrls.length < limit; i++) {
+            if (i === limit) break // duplicate links are skipped, which can leave the array shorter than the limit
+            console.info(`index: ${i}`)
+            const element = relativeLinks[i]
+
+            const relativeUrl = $(element).attr('href')
+            if (!relativeUrl) continue
+
+            const absoluteUrl = new URL(relativeUrl, url).toString()
+            if (!availableUrls.includes(absoluteUrl)) {
+                availableUrls.push(absoluteUrl)
+                console.info(`Found unique relative link: ${absoluteUrl}`)
+            }
+        }
+
+        return availableUrls
+    } catch (err) {
+        throw new Error(`getAvailableURLs: ${err?.message}`)
+    }
+}
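A quick usage sketch of the new helper, assuming it is re-exported from the package's src index as the Cheerio diff's import suggests (the URL is a placeholder): because of the limit + 1 adjustment, a limit of 10 yields the root page plus up to 10 unique relative links.

import { getAvailableURLs } from '../../../src'

async function crawl() {
    // at most 11 entries: the root url plus up to 10 unique relative links
    const urls = await getAvailableURLs('https://example.com', 10)
    for (const u of urls) console.info(u)
}

crawl()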