From d584c0b7002b65dcea3c069570e320f048a0259f Mon Sep 17 00:00:00 2001 From: Amr Aly Date: Fri, 18 Jul 2025 14:45:41 +0300 Subject: [PATCH] feat: Add header-based splitting to MarkdownTextSplitter (#4861) * feat: Add header-based splitting to MarkdownTextSplitter - Add dropdown for header level selection (H1-H6) - Implement hierarchical splitting (H2 includes H1 headers) - Headers preserved with content sections - Prioritize semantic boundaries over chunk size * Update MarkdownTextSplitter.ts * Update MarkdownTextSplitter.ts --------- Co-authored-by: Henry Heng --- .../MarkdownTextSplitter.ts | 116 +++++++++++++++++- 1 file changed, 115 insertions(+), 1 deletion(-) diff --git a/packages/components/nodes/textsplitters/MarkdownTextSplitter/MarkdownTextSplitter.ts b/packages/components/nodes/textsplitters/MarkdownTextSplitter/MarkdownTextSplitter.ts index 82d9f1e22..4170972ad 100644 --- a/packages/components/nodes/textsplitters/MarkdownTextSplitter/MarkdownTextSplitter.ts +++ b/packages/components/nodes/textsplitters/MarkdownTextSplitter/MarkdownTextSplitter.ts @@ -16,7 +16,7 @@ class MarkdownTextSplitter_TextSplitters implements INode { constructor() { this.label = 'Markdown Text Splitter' this.name = 'markdownTextSplitter' - this.version = 1.0 + this.version = 1.1 this.type = 'MarkdownTextSplitter' this.icon = 'markdownTextSplitter.svg' this.category = 'Text Splitters' @@ -38,6 +38,44 @@ class MarkdownTextSplitter_TextSplitters implements INode { description: 'Number of characters to overlap between chunks. Default is 200.', default: 200, optional: true + }, + { + label: 'Split by Headers', + name: 'splitByHeaders', + type: 'options', + description: 'Split documents at specified header levels. 
Headers will be included with their content.', + default: 'disabled', + options: [ + { + label: 'Disabled', + name: 'disabled' + }, + { + label: '# Headers (H1)', + name: 'h1' + }, + { + label: '## Headers (H2)', + name: 'h2' + }, + { + label: '### Headers (H3)', + name: 'h3' + }, + { + label: '#### Headers (H4)', + name: 'h4' + }, + { + label: '##### Headers (H5)', + name: 'h5' + }, + { + label: '###### Headers (H6)', + name: 'h6' + } + ], + optional: true } ] } @@ -45,6 +83,7 @@ class MarkdownTextSplitter_TextSplitters implements INode { async init(nodeData: INodeData): Promise<any> { const chunkSize = nodeData.inputs?.chunkSize as string const chunkOverlap = nodeData.inputs?.chunkOverlap as string + const splitByHeaders = nodeData.inputs?.splitByHeaders as string const obj = {} as MarkdownTextSplitterParams @@ -53,8 +92,83 @@ class MarkdownTextSplitter_TextSplitters implements INode { const splitter = new MarkdownTextSplitter(obj) + if (splitByHeaders && splitByHeaders !== 'disabled') { + return { + splitDocuments: async (documents: any[]) => { + const results = [] + + for (const doc of documents) { + const chunks = await this.splitByHeaders(doc.pageContent, splitByHeaders, splitter) + for (const chunk of chunks) { + results.push({ + pageContent: chunk, + metadata: { ...doc.metadata } + }) + } + } + + return results + }, + splitText: async (text: string) => { + return await this.splitByHeaders(text, splitByHeaders, splitter) + } + } + } + return splitter } + + private async splitByHeaders(text: string, headerLevel: string, fallbackSplitter: any): Promise<string[]> { + const maxLevel = this.getHeaderLevel(headerLevel) + if (maxLevel === 0) return await fallbackSplitter.splitText(text) + + const lines = text.split('\n') + const sections: string[] = [] + let currentSection: string[] = [] + + for (const line of lines) { + const isHeader = line.startsWith('#') && line.match(/^#{1,6}\s/) + const headerDepth = isHeader ? 
line.match(/^(#+)/)?.[1]?.length || 0 : 0 + + if (isHeader && headerDepth <= maxLevel) { + // Save previous section + if (currentSection.length > 0) { + sections.push(currentSection.join('\n').trim()) + } + // Start new section + currentSection = [line] + } else { + // Add line to current section + currentSection.push(line) + } + } + + // Add final section + if (currentSection.length > 0) { + sections.push(currentSection.join('\n').trim()) + } + + return sections + } + + private getHeaderLevel(headerLevel: string): number { + switch (headerLevel) { + case 'h1': + return 1 + case 'h2': + return 2 + case 'h3': + return 3 + case 'h4': + return 4 + case 'h5': + return 5 + case 'h6': + return 6 + default: + return 0 + } + } } module.exports = { nodeClass: MarkdownTextSplitter_TextSplitters }