Replace regex-based URL validation with linkifyjs

This commit is contained in:
chungyau97 2023-05-15 11:43:04 +07:00
parent 2cf266d9f9
commit 7460dd3a72
2 changed files with 7 additions and 12 deletions

View File

@@ -1,7 +1,7 @@
import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio'
import { test } from 'linkifyjs'
class Cheerio_DocumentLoaders implements INode {
label: string
name: string
@@ -47,18 +47,12 @@ class Cheerio_DocumentLoaders implements INode {
const metadata = nodeData.inputs?.metadata
let url = nodeData.inputs?.url as string
url = url.trim()
if (!test(url)) {
throw new Error('Invalid URL')
}
var urlPattern = new RegExp(
'^(https?:\\/\\/)?' + // validate protocol
'((([a-z\\d]([a-z\\d-]*[a-z\\d])*)\\.)+[a-z]{2,}|' + // validate domain name
'((\\d{1,3}\\.){3}\\d{1,3}))' + // validate OR ip (v4) address
'(\\:\\d+)?(\\/[-a-z\\d%_.~+]*)*' + // validate port and path
'(\\?[;&a-z\\d%_.~+=-]*)?' + // validate query string
'(\\#[-a-z\\d_]*)?$',
'i'
) // validate fragment locator
const loader = new CheerioWebBaseLoader(urlPattern.test(url.trim()) ? url.trim() : '')
const loader = new CheerioWebBaseLoader(url)
let docs = []
if (textSplitter) {

View File

@@ -30,6 +30,7 @@
"form-data": "^4.0.0",
"graphql": "^16.6.0",
"langchain": "^0.0.73",
"linkifyjs": "^4.1.1",
"mammoth": "^1.5.1",
"moment": "^2.29.3",
"node-fetch": "2",