// packages/components/nodes/textsplitters/HtmlToMarkdownTextSplitter/HtmlToMarkdownTextSplitter.ts
//
// Companion changes carried by the same patch:
//   - icon asset `htmlToMarkdownTextSplitter.svg` added next to this file
//   - `"node-html-markdown": "^1.3.0"` added to the dependencies in packages/components/package.json

import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { getBaseClasses } from '../../../src/utils'
import { MarkdownTextSplitter, MarkdownTextSplitterParams } from 'langchain/text_splitter'
import { NodeHtmlMarkdown } from 'node-html-markdown'

/**
 * Text splitter that first translates HTML into Markdown with
 * `NodeHtmlMarkdown.translate` and then delegates the actual chunking to
 * `MarkdownTextSplitter.splitText`.
 */
class HtmlToMarkdownTextSplitter extends MarkdownTextSplitter implements MarkdownTextSplitterParams {
    /**
     * Converts `text` from HTML to Markdown and splits the Markdown.
     *
     * @param text - HTML source to convert and split.
     * @returns The chunks produced by the base Markdown splitter.
     * @throws Rejects with whatever error the HTML translation or the base
     *         splitter raises. Declaring the method `async` (instead of wrapping
     *         the call in a hand-built Promise that only chained `.then`) makes
     *         sure such failures reach the caller rather than leaving the
     *         returned promise pending forever.
     */
    async splitText(text: string): Promise<string[]> {
        // Only the HTML is supplied; options and custom translators are left at
        // the library defaults, as before (`{}`, `undefined`, `undefined`).
        const markdown = NodeHtmlMarkdown.translate(text)
        return super.splitText(markdown)
    }
}

/**
 * Node definition that exposes {@link HtmlToMarkdownTextSplitter} with two
 * optional numeric inputs: `chunkSize` and `chunkOverlap`.
 */
class HtmlToMarkdownTextSplitter_TextSplitters implements INode {
    label: string
    name: string
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    inputs: INodeParams[]

    constructor() {
        this.label = 'HtmlToMarkdown Text Splitter'
        this.name = 'htmlToMarkdownTextSplitter'
        this.type = 'HtmlToMarkdownTextSplitter'
        this.icon = 'htmlToMarkdownTextSplitter.svg'
        this.category = 'Text Splitters'
        this.description = `Converts Html to Markdown and then split your content into documents based on the Markdown headers`
        this.baseClasses = [this.type, ...getBaseClasses(HtmlToMarkdownTextSplitter)]
        this.inputs = [
            {
                label: 'Chunk Size',
                name: 'chunkSize',
                type: 'number',
                default: 1000,
                optional: true
            },
            {
                label: 'Chunk Overlap',
                name: 'chunkOverlap',
                type: 'number',
                optional: true
            }
        ]
    }

    /**
     * Builds a splitter from the node's inputs.
     *
     * @param nodeData - Node data whose `inputs` may carry `chunkSize` and
     *                   `chunkOverlap`.
     * @returns A configured {@link HtmlToMarkdownTextSplitter}.
     */
    async init(nodeData: INodeData): Promise<HtmlToMarkdownTextSplitter> {
        const chunkSize = nodeData.inputs?.chunkSize as string
        const chunkOverlap = nodeData.inputs?.chunkOverlap as string

        // Declared as Partial (the type the splitter constructor accepts) rather
        // than asserting an empty literal to the full params type.
        const params: Partial<MarkdownTextSplitterParams> = {}

        // Falsy (unset or empty) inputs are skipped, leaving the corresponding
        // setting for the base splitter to decide.
        if (chunkSize) params.chunkSize = parseInt(chunkSize, 10)
        if (chunkOverlap) params.chunkOverlap = parseInt(chunkOverlap, 10)

        return new HtmlToMarkdownTextSplitter(params)
    }
}

module.exports = { nodeClass: HtmlToMarkdownTextSplitter_TextSplitters }