Merge pull request #948 from matthias/feature/CheerioSelector
Added CSS selector to Cheerio
This commit is contained in:
commit
c0d9484958
|
|
@ -1,8 +1,10 @@
|
|||
import { INode, INodeData, INodeParams } from '../../../src/Interface'
|
||||
import { TextSplitter } from 'langchain/text_splitter'
|
||||
import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio'
|
||||
import { CheerioWebBaseLoader, WebBaseLoaderParams } from 'langchain/document_loaders/web/cheerio'
|
||||
import { test } from 'linkifyjs'
|
||||
import { parse } from 'css-what'
|
||||
import { webCrawl, xmlScrape } from '../../../src'
|
||||
import { SelectorType } from 'cheerio'
|
||||
|
||||
class Cheerio_DocumentLoaders implements INode {
|
||||
label: string
|
||||
|
|
@ -18,7 +20,7 @@ class Cheerio_DocumentLoaders implements INode {
|
|||
constructor() {
|
||||
this.label = 'Cheerio Web Scraper'
|
||||
this.name = 'cheerioWebScraper'
|
||||
this.version = 1.0
|
||||
this.version = 1.1
|
||||
this.type = 'Document'
|
||||
this.icon = 'cheerio.svg'
|
||||
this.category = 'Document Loaders'
|
||||
|
|
@ -66,6 +68,14 @@ class Cheerio_DocumentLoaders implements INode {
|
|||
'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.',
|
||||
warning: `Retrieving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)`
|
||||
},
|
||||
{
|
||||
label: 'Selector (CSS)',
|
||||
name: 'selector',
|
||||
type: 'string',
|
||||
description: 'Specify a CSS selector to select the content to be extracted',
|
||||
optional: true,
|
||||
additionalParams: true
|
||||
},
|
||||
{
|
||||
label: 'Metadata',
|
||||
name: 'metadata',
|
||||
|
|
@ -88,10 +98,18 @@ class Cheerio_DocumentLoaders implements INode {
|
|||
throw new Error('Invalid URL')
|
||||
}
|
||||
|
||||
const selector: SelectorType = nodeData.inputs?.selector as SelectorType
|
||||
|
||||
let params: WebBaseLoaderParams = {}
|
||||
if (selector) {
|
||||
parse(selector) // `parse` is from css-what, which comes with cheerio; it throws if the selector is invalid
|
||||
params['selector'] = selector
|
||||
}
|
||||
|
||||
async function cheerioLoader(url: string): Promise<any> {
|
||||
try {
|
||||
let docs = []
|
||||
const loader = new CheerioWebBaseLoader(url)
|
||||
const loader = new CheerioWebBaseLoader(url, params)
|
||||
if (textSplitter) {
|
||||
docs = await loader.loadAndSplit(textSplitter)
|
||||
} else {
|
||||
|
|
|
|||
Loading…
Reference in New Issue