feat: Add header-based splitting to MarkdownTextSplitter (#4861)

* feat: Add header-based splitting to MarkdownTextSplitter - Add dropdown for header level selection (H1-H6) - Implement hierarchical splitting (H2 includes H1 headers) - Headers preserved with content sections - Prioritize semantic boundaries over chunk size * Update MarkdownTextSplitter.ts * Update MarkdownTextSplitter.ts --------- Co-authored-by: Henry Heng <henryheng@flowiseai.com>
2025-07-18 14:45:41 +03:00 · 2025-07-18 14:45:41 +03:00 · d584c0b700
parent ebf222731e
commit d584c0b700
1 changed files with 115 additions and 1 deletions
--- a/packages/components/nodes/textsplitters/MarkdownTextSplitter/MarkdownTextSplitter.ts
+++ b/packages/components/nodes/textsplitters/MarkdownTextSplitter/MarkdownTextSplitter.ts
@ -16,7 +16,7 @@ class MarkdownTextSplitter_TextSplitters implements INode {
    constructor() {
        this.label = 'Markdown Text Splitter'
        this.name = 'markdownTextSplitter'
-        this.version = 1.0
+        this.version = 1.1
        this.type = 'MarkdownTextSplitter'
        this.icon = 'markdownTextSplitter.svg'
        this.category = 'Text Splitters'
@ -38,6 +38,44 @@ class MarkdownTextSplitter_TextSplitters implements INode {
                description: 'Number of characters to overlap between chunks. Default is 200.',
                default: 200,
                optional: true
+            },
+            {
+                label: 'Split by Headers',
+                name: 'splitByHeaders',
+                type: 'options',
+                description: 'Split documents at specified header levels. Headers will be included with their content.',
+                default: 'disabled',
+                options: [
+                    {
+                        label: 'Disabled',
+                        name: 'disabled'
+                    },
+                    {
+                        label: '# Headers (H1)',
+                        name: 'h1'
+                    },
+                    {
+                        label: '## Headers (H2)',
+                        name: 'h2'
+                    },
+                    {
+                        label: '### Headers (H3)',
+                        name: 'h3'
+                    },
+                    {
+                        label: '#### Headers (H4)',
+                        name: 'h4'
+                    },
+                    {
+                        label: '##### Headers (H5)',
+                        name: 'h5'
+                    },
+                    {
+                        label: '###### Headers (H6)',
+                        name: 'h6'
+                    }
+                ],
+                optional: true
            }
        ]
    }
@ -45,6 +83,7 @@ class MarkdownTextSplitter_TextSplitters implements INode {
    async init(nodeData: INodeData): Promise<any> {
        const chunkSize = nodeData.inputs?.chunkSize as string
        const chunkOverlap = nodeData.inputs?.chunkOverlap as string
+        const splitByHeaders = nodeData.inputs?.splitByHeaders as string

        const obj = {} as MarkdownTextSplitterParams

@ -53,8 +92,83 @@ class MarkdownTextSplitter_TextSplitters implements INode {

        const splitter = new MarkdownTextSplitter(obj)

+        if (splitByHeaders && splitByHeaders !== 'disabled') {
+            return {
+                splitDocuments: async (documents: any[]) => {
+                    const results = []
+
+                    for (const doc of documents) {
+                        const chunks = await this.splitByHeaders(doc.pageContent, splitByHeaders, splitter)
+                        for (const chunk of chunks) {
+                            results.push({
+                                pageContent: chunk,
+                                metadata: { ...doc.metadata }
+                            })
+                        }
+                    }
+
+                    return results
+                },
+                splitText: async (text: string) => {
+                    return await this.splitByHeaders(text, splitByHeaders, splitter)
+                }
+            }
+        }
+
        return splitter
    }
+
+    private async splitByHeaders(text: string, headerLevel: string, fallbackSplitter: any): Promise<string[]> {
+        const maxLevel = this.getHeaderLevel(headerLevel)
+        if (maxLevel === 0) return await fallbackSplitter.splitText(text)
+
+        const lines = text.split('\n')
+        const sections: string[] = []
+        let currentSection: string[] = []
+
+        for (const line of lines) {
+            const isHeader = line.startsWith('#') && line.match(/^#{1,6}\s/)
+            const headerDepth = isHeader ? line.match(/^(#+)/)?.[1]?.length || 0 : 0
+
+            if (isHeader && headerDepth <= maxLevel) {
+                // Save previous section
+                if (currentSection.length > 0) {
+                    sections.push(currentSection.join('\n').trim())
+                }
+                // Start new section
+                currentSection = [line]
+            } else {
+                // Add line to current section
+                currentSection.push(line)
+            }
+        }
+
+        // Add final section
+        if (currentSection.length > 0) {
+            sections.push(currentSection.join('\n').trim())
+        }
+
+        return sections
+    }
+
+    private getHeaderLevel(headerLevel: string): number {
+        switch (headerLevel) {
+            case 'h1':
+                return 1
+            case 'h2':
+                return 2
+            case 'h3':
+                return 3
+            case 'h4':
+                return 4
+            case 'h5':
+                return 5
+            case 'h6':
+                return 6
+            default:
+                return 0
+        }
+    }
 }

 module.exports = { nodeClass: MarkdownTextSplitter_TextSplitters }