feat: Add header-based splitting to MarkdownTextSplitter (#4861)

* feat: Add header-based splitting to MarkdownTextSplitter

  - Add dropdown for header level selection (H1-H6)
  - Implement hierarchical splitting (selecting a level also splits at all higher-level headers, e.g. H2 splits at both H1 and H2)
  - Headers preserved with content sections
  - Prioritize semantic boundaries over chunk size

* Update MarkdownTextSplitter.ts

* Update MarkdownTextSplitter.ts

---------

Co-authored-by: Henry Heng <henryheng@flowiseai.com>
This commit is contained in:
Amr Aly 2025-07-18 14:45:41 +03:00 committed by GitHub
parent ebf222731e
commit d584c0b700
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 115 additions and 1 deletion

View File

@ -16,7 +16,7 @@ class MarkdownTextSplitter_TextSplitters implements INode {
constructor() {
this.label = 'Markdown Text Splitter'
this.name = 'markdownTextSplitter'
this.version = 1.0
this.version = 1.1
this.type = 'MarkdownTextSplitter'
this.icon = 'markdownTextSplitter.svg'
this.category = 'Text Splitters'
@ -38,6 +38,44 @@ class MarkdownTextSplitter_TextSplitters implements INode {
description: 'Number of characters to overlap between chunks. Default is 200.',
default: 200,
optional: true
},
{
label: 'Split by Headers',
name: 'splitByHeaders',
type: 'options',
description: 'Split documents at specified header levels. Headers will be included with their content.',
default: 'disabled',
options: [
{
label: 'Disabled',
name: 'disabled'
},
{
label: '# Headers (H1)',
name: 'h1'
},
{
label: '## Headers (H2)',
name: 'h2'
},
{
label: '### Headers (H3)',
name: 'h3'
},
{
label: '#### Headers (H4)',
name: 'h4'
},
{
label: '##### Headers (H5)',
name: 'h5'
},
{
label: '###### Headers (H6)',
name: 'h6'
}
],
optional: true
}
]
}
@ -45,6 +83,7 @@ class MarkdownTextSplitter_TextSplitters implements INode {
async init(nodeData: INodeData): Promise<any> {
const chunkSize = nodeData.inputs?.chunkSize as string
const chunkOverlap = nodeData.inputs?.chunkOverlap as string
const splitByHeaders = nodeData.inputs?.splitByHeaders as string
const obj = {} as MarkdownTextSplitterParams
@ -53,8 +92,83 @@ class MarkdownTextSplitter_TextSplitters implements INode {
const splitter = new MarkdownTextSplitter(obj)
if (splitByHeaders && splitByHeaders !== 'disabled') {
return {
splitDocuments: async (documents: any[]) => {
const results = []
for (const doc of documents) {
const chunks = await this.splitByHeaders(doc.pageContent, splitByHeaders, splitter)
for (const chunk of chunks) {
results.push({
pageContent: chunk,
metadata: { ...doc.metadata }
})
}
}
return results
},
splitText: async (text: string) => {
return await this.splitByHeaders(text, splitByHeaders, splitter)
}
}
}
return splitter
}
/**
 * Split markdown text into sections, starting a new section at every ATX header
 * (`#` … `######`) whose depth is less than or equal to the selected level.
 *
 * Each header line is kept as the first line of its section. Text that appears
 * before the first matching header becomes its own section. Sections are not
 * re-split by size: `fallbackSplitter` is only used when `headerLevel` does not
 * name a known level.
 *
 * Lines inside fenced code blocks (``` or ~~~) are never treated as headers, so
 * `# comment` lines in shell/Python samples do not cut a code block in half.
 * Sections that are empty after trimming are dropped.
 *
 * @param text - Markdown source to split.
 * @param headerLevel - Option name 'h1'…'h6'; any other value delegates to `fallbackSplitter`.
 * @param fallbackSplitter - Object exposing `splitText(text)`, used when no header level applies.
 * @returns The trimmed, non-empty sections in document order.
 */
private async splitByHeaders(text: string, headerLevel: string, fallbackSplitter: any): Promise<string[]> {
    const maxLevel = this.getHeaderLevel(headerLevel)
    if (maxLevel === 0) return await fallbackSplitter.splitText(text)

    // Hoisted so the patterns are built once rather than per line.
    const headerPattern = /^(#{1,6})\s/
    const fencePattern = /^ {0,3}(`{3,}|~{3,})/

    const sections: string[] = []
    let currentSection: string[] = []
    // Marker (e.g. "```") of the fenced code block we are currently inside, if any.
    let openFence: string | null = null

    const flushSection = (): void => {
        const section = currentSection.join('\n').trim()
        // Skip blank sections (empty input, leading blank lines) so no empty chunks are emitted.
        if (section.length > 0) sections.push(section)
    }

    for (const line of text.split('\n')) {
        const fence = fencePattern.exec(line)?.[1]

        if (openFence !== null) {
            // A fence closes only on a line holding nothing but a marker of the same
            // character that is at least as long as the opening marker.
            const closesFence =
                fence !== undefined && fence[0] === openFence[0] && fence.length >= openFence.length && line.trim() === fence
            if (closesFence) openFence = null
            currentSection.push(line)
            continue
        }

        if (fence !== undefined) {
            openFence = fence
            currentSection.push(line)
            continue
        }

        const headerDepth = headerPattern.exec(line)?.[1]?.length ?? 0
        if (headerDepth > 0 && headerDepth <= maxLevel) {
            // Close the section collected so far and start a new one at this header.
            flushSection()
            currentSection = [line]
        } else {
            currentSection.push(line)
        }
    }

    flushSection()
    return sections
}
/**
 * Convert a header option name into its numeric depth.
 *
 * @param headerLevel - Option name; 'h1' through 'h6' are recognised.
 * @returns 1–6 for 'h1'–'h6', or 0 for any other value.
 */
private getHeaderLevel(headerLevel: string): number {
    // Position in this ordered list is depth - 1; indexOf yields -1 for unknown names,
    // which the + 1 turns into the "no level" result of 0.
    const knownLevels = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    return knownLevels.indexOf(headerLevel) + 1
}
}
// Expose the node class under the `nodeClass` key (presumably the key the node loader looks up — confirm against the loader).
module.exports = { nodeClass: MarkdownTextSplitter_TextSplitters }