feat: Add header-based splitting to MarkdownTextSplitter (#4861)
* feat: Add header-based splitting to MarkdownTextSplitter
  - Add dropdown for header level selection (H1-H6)
  - Implement hierarchical splitting (selecting H2 also splits at H1 headers)
  - Headers are preserved with their content sections
  - Prioritize semantic boundaries over chunk size
* Update MarkdownTextSplitter.ts
* Update MarkdownTextSplitter.ts

---------

Co-authored-by: Henry Heng <henryheng@flowiseai.com>
This commit is contained in:
parent
ebf222731e
commit
d584c0b700
|
|
@ -16,7 +16,7 @@ class MarkdownTextSplitter_TextSplitters implements INode {
|
||||||
constructor() {
|
constructor() {
|
||||||
this.label = 'Markdown Text Splitter'
|
this.label = 'Markdown Text Splitter'
|
||||||
this.name = 'markdownTextSplitter'
|
this.name = 'markdownTextSplitter'
|
||||||
this.version = 1.0
|
this.version = 1.1
|
||||||
this.type = 'MarkdownTextSplitter'
|
this.type = 'MarkdownTextSplitter'
|
||||||
this.icon = 'markdownTextSplitter.svg'
|
this.icon = 'markdownTextSplitter.svg'
|
||||||
this.category = 'Text Splitters'
|
this.category = 'Text Splitters'
|
||||||
|
|
@ -38,6 +38,44 @@ class MarkdownTextSplitter_TextSplitters implements INode {
|
||||||
description: 'Number of characters to overlap between chunks. Default is 200.',
|
description: 'Number of characters to overlap between chunks. Default is 200.',
|
||||||
default: 200,
|
default: 200,
|
||||||
optional: true
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: 'Split by Headers',
|
||||||
|
name: 'splitByHeaders',
|
||||||
|
type: 'options',
|
||||||
|
description: 'Split documents at specified header levels. Headers will be included with their content.',
|
||||||
|
default: 'disabled',
|
||||||
|
options: [
|
||||||
|
{
|
||||||
|
label: 'Disabled',
|
||||||
|
name: 'disabled'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: '# Headers (H1)',
|
||||||
|
name: 'h1'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: '## Headers (H2)',
|
||||||
|
name: 'h2'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: '### Headers (H3)',
|
||||||
|
name: 'h3'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: '#### Headers (H4)',
|
||||||
|
name: 'h4'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: '##### Headers (H5)',
|
||||||
|
name: 'h5'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: '###### Headers (H6)',
|
||||||
|
name: 'h6'
|
||||||
|
}
|
||||||
|
],
|
||||||
|
optional: true
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
@ -45,6 +83,7 @@ class MarkdownTextSplitter_TextSplitters implements INode {
|
||||||
async init(nodeData: INodeData): Promise<any> {
|
async init(nodeData: INodeData): Promise<any> {
|
||||||
const chunkSize = nodeData.inputs?.chunkSize as string
|
const chunkSize = nodeData.inputs?.chunkSize as string
|
||||||
const chunkOverlap = nodeData.inputs?.chunkOverlap as string
|
const chunkOverlap = nodeData.inputs?.chunkOverlap as string
|
||||||
|
const splitByHeaders = nodeData.inputs?.splitByHeaders as string
|
||||||
|
|
||||||
const obj = {} as MarkdownTextSplitterParams
|
const obj = {} as MarkdownTextSplitterParams
|
||||||
|
|
||||||
|
|
@ -53,8 +92,83 @@ class MarkdownTextSplitter_TextSplitters implements INode {
|
||||||
|
|
||||||
const splitter = new MarkdownTextSplitter(obj)
|
const splitter = new MarkdownTextSplitter(obj)
|
||||||
|
|
||||||
|
if (splitByHeaders && splitByHeaders !== 'disabled') {
|
||||||
|
return {
|
||||||
|
splitDocuments: async (documents: any[]) => {
|
||||||
|
const results = []
|
||||||
|
|
||||||
|
for (const doc of documents) {
|
||||||
|
const chunks = await this.splitByHeaders(doc.pageContent, splitByHeaders, splitter)
|
||||||
|
for (const chunk of chunks) {
|
||||||
|
results.push({
|
||||||
|
pageContent: chunk,
|
||||||
|
metadata: { ...doc.metadata }
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return results
|
||||||
|
},
|
||||||
|
splitText: async (text: string) => {
|
||||||
|
return await this.splitByHeaders(text, splitByHeaders, splitter)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return splitter
|
return splitter
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Splits markdown text into sections, starting a new section at every header
 * whose depth is less than or equal to the selected level (so selecting 'h2'
 * splits at both `#` and `##` headers). The header line is kept as the first
 * line of its section.
 *
 * Lines inside fenced code blocks (``` or ~~~) are never treated as headers,
 * so `# comment` lines in code samples do not cut a section in half.
 * Sections that are empty after trimming are dropped.
 *
 * @param text - Markdown source to split.
 * @param headerLevel - Option name ('h1' … 'h6') giving the deepest header level to split at.
 * @param fallbackSplitter - Splitter whose `splitText` is used when `headerLevel` is not recognised.
 * @returns The trimmed sections in document order.
 */
private async splitByHeaders(text: string, headerLevel: string, fallbackSplitter: any): Promise<string[]> {
    const maxLevel = this.getHeaderLevel(headerLevel)
    // An unrecognised level maps to 0; defer entirely to the regular splitter.
    if (maxLevel === 0) return await fallbackSplitter.splitText(text)

    // Hoisted so the patterns are not rebuilt for every line.
    const headerPattern = /^(#{1,6})\s/
    const fencePattern = /^ {0,3}(`{3,}|~{3,})/

    const sections: string[] = []
    let currentSection: string[] = []
    // Marker that opened the fenced code block we are currently inside, or null when outside one.
    let openFence: string | null = null

    // Emits the accumulated lines as one section, skipping whitespace-only content.
    const flushSection = (): void => {
        const section = currentSection.join('\n').trim()
        if (section.length > 0) sections.push(section)
    }

    for (const line of text.split('\n')) {
        const fenceMatch = line.match(fencePattern)
        if (fenceMatch) {
            const marker = fenceMatch[1] ?? ''
            const rest = line.slice(fenceMatch[0].length)
            if (openFence === null) {
                // A backtick fence whose info string contains a backtick is inline code, not a fence.
                const isInlineCode = marker.charAt(0) === '`' && rest.includes('`')
                if (!isInlineCode) openFence = marker
            } else if (marker.charAt(0) === openFence.charAt(0) && marker.length >= openFence.length && rest.trim() === '') {
                // A closing fence uses the same character, is at least as long, and carries no info string.
                openFence = null
            }
            currentSection.push(line)
            continue
        }

        const headerMatch = openFence === null ? line.match(headerPattern) : null
        const headerDepth = headerMatch?.[1]?.length ?? 0

        if (headerDepth > 0 && headerDepth <= maxLevel) {
            // Close the section gathered so far and start a new one at this header.
            flushSection()
            currentSection = [line]
        } else {
            currentSection.push(line)
        }
    }

    // Emit whatever follows the last matching header.
    flushSection()

    return sections
}
|
||||||
|
|
||||||
|
/**
 * Translates a header option name into its numeric depth.
 *
 * @param headerLevel - Option name; the recognised values are 'h1' through 'h6'.
 * @returns 1–6 for a recognised option, or 0 for any other value.
 */
private getHeaderLevel(headerLevel: string): number {
    const optionNames = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    // indexOf yields -1 for an unknown name, which the +1 turns into the 0 sentinel.
    return optionNames.indexOf(headerLevel) + 1
}
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = { nodeClass: MarkdownTextSplitter_TextSplitters }
|
module.exports = { nodeClass: MarkdownTextSplitter_TextSplitters }
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue