Merge pull request #948 from matthias/feature/CheerioSelector
Added an optional CSS selector input to the Cheerio Web Scraper document loader
This commit is contained in:
commit
c0d9484958
|
|
@ -1,8 +1,10 @@
|
||||||
import { INode, INodeData, INodeParams } from '../../../src/Interface'
|
import { INode, INodeData, INodeParams } from '../../../src/Interface'
|
||||||
import { TextSplitter } from 'langchain/text_splitter'
|
import { TextSplitter } from 'langchain/text_splitter'
|
||||||
import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio'
|
import { CheerioWebBaseLoader, WebBaseLoaderParams } from 'langchain/document_loaders/web/cheerio'
|
||||||
import { test } from 'linkifyjs'
|
import { test } from 'linkifyjs'
|
||||||
|
import { parse } from 'css-what'
|
||||||
import { webCrawl, xmlScrape } from '../../../src'
|
import { webCrawl, xmlScrape } from '../../../src'
|
||||||
|
import { SelectorType } from 'cheerio'
|
||||||
|
|
||||||
class Cheerio_DocumentLoaders implements INode {
|
class Cheerio_DocumentLoaders implements INode {
|
||||||
label: string
|
label: string
|
||||||
|
|
@ -18,7 +20,7 @@ class Cheerio_DocumentLoaders implements INode {
|
||||||
constructor() {
|
constructor() {
|
||||||
this.label = 'Cheerio Web Scraper'
|
this.label = 'Cheerio Web Scraper'
|
||||||
this.name = 'cheerioWebScraper'
|
this.name = 'cheerioWebScraper'
|
||||||
this.version = 1.0
|
this.version = 1.1
|
||||||
this.type = 'Document'
|
this.type = 'Document'
|
||||||
this.icon = 'cheerio.svg'
|
this.icon = 'cheerio.svg'
|
||||||
this.category = 'Document Loaders'
|
this.category = 'Document Loaders'
|
||||||
|
|
@ -66,6 +68,14 @@ class Cheerio_DocumentLoaders implements INode {
|
||||||
'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.',
|
'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.',
|
||||||
warning: `Retrieving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)`
|
warning: `Retrieving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)`
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
label: 'Selector (CSS)',
|
||||||
|
name: 'selector',
|
||||||
|
type: 'string',
|
||||||
|
description: 'Specify a CSS selector to select the content to be extracted',
|
||||||
|
optional: true,
|
||||||
|
additionalParams: true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
label: 'Metadata',
|
label: 'Metadata',
|
||||||
name: 'metadata',
|
name: 'metadata',
|
||||||
|
|
@ -88,10 +98,18 @@ class Cheerio_DocumentLoaders implements INode {
|
||||||
throw new Error('Invalid URL')
|
throw new Error('Invalid URL')
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const selector: SelectorType = nodeData.inputs?.selector as SelectorType
|
||||||
|
|
||||||
|
let params: WebBaseLoaderParams = {}
|
||||||
|
if (selector) {
|
||||||
|
parse(selector) // comes with cheerio - will throw error if invalid
|
||||||
|
params['selector'] = selector
|
||||||
|
}
|
||||||
|
|
||||||
async function cheerioLoader(url: string): Promise<any> {
|
async function cheerioLoader(url: string): Promise<any> {
|
||||||
try {
|
try {
|
||||||
let docs = []
|
let docs = []
|
||||||
const loader = new CheerioWebBaseLoader(url)
|
const loader = new CheerioWebBaseLoader(url, params)
|
||||||
if (textSplitter) {
|
if (textSplitter) {
|
||||||
docs = await loader.loadAndSplit(textSplitter)
|
docs = await loader.loadAndSplit(textSplitter)
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue