import { Document } from '@langchain/core/documents'
import { BufferLoader } from 'langchain/document_loaders/fs/buffer'
import { parseOfficeAsync } from 'officeparser'
/**
 * Document loader that uses officeparser to extract text from Word documents.
 *
 * The extracted text is split heuristically into sections, and one Document
 * is produced per section with metadata including the document type and the
 * section's sequential page number.
 */
|
|
export class WordLoader extends BufferLoader {
|
|
attributes: { name: string; description: string; type: string }[] = []
|
|
|
|
constructor(filePathOrBlob: string | Blob) {
|
|
super(filePathOrBlob)
|
|
this.attributes = []
|
|
}
|
|
|
|
/**
|
|
* Parse Word document
|
|
*
|
|
* @param raw Raw data Buffer
|
|
* @param metadata Document metadata
|
|
* @returns Array of Documents
|
|
*/
|
|
async parse(raw: Buffer, metadata: Document['metadata']): Promise<Document[]> {
|
|
const result: Document[] = []
|
|
|
|
this.attributes = [
|
|
{ name: 'documentType', description: 'Type of document', type: 'string' },
|
|
{ name: 'pageCount', description: 'Number of pages/sections', type: 'number' }
|
|
]
|
|
|
|
try {
|
|
// Use officeparser to extract text from Word document
|
|
const data = await parseOfficeAsync(raw)
|
|
|
|
if (typeof data === 'string' && data.trim()) {
|
|
// Split content by common page/section separators
|
|
const sections = this.splitIntoSections(data)
|
|
|
|
sections.forEach((sectionContent, index) => {
|
|
if (sectionContent.trim()) {
|
|
result.push({
|
|
pageContent: sectionContent.trim(),
|
|
metadata: {
|
|
documentType: 'word',
|
|
pageNumber: index + 1,
|
|
...metadata
|
|
}
|
|
})
|
|
}
|
|
})
|
|
}
|
|
} catch (error) {
|
|
console.error('Error parsing Word file:', error)
|
|
throw new Error(`Failed to parse Word file: ${error instanceof Error ? error.message : 'Unknown error'}`)
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
/**
|
|
* Split content into sections based on common patterns
|
|
* This is a heuristic approach since officeparser returns plain text
|
|
*/
|
|
private splitIntoSections(content: string): string[] {
|
|
// Try to split by common section patterns
|
|
const sectionPatterns = [
|
|
/\n\s*Page\s+\d+/gi,
|
|
/\n\s*Section\s+\d+/gi,
|
|
/\n\s*Chapter\s+\d+/gi,
|
|
/\n\s*\d+\.\s+/gi, // Numbered sections like "1. ", "2. "
|
|
/\n\s*[A-Z][A-Z\s]{2,}\n/g, // ALL CAPS headings
|
|
/\n\s*_{5,}/g, // Long underscores as separators
|
|
/\n\s*-{5,}/g // Long dashes as separators
|
|
]
|
|
|
|
let sections: string[] = []
|
|
|
|
// Try each pattern and use the one that creates the most reasonable splits
|
|
for (const pattern of sectionPatterns) {
|
|
const potentialSections = content.split(pattern)
|
|
if (potentialSections.length > 1 && potentialSections.length < 50) {
|
|
// Reasonable number of sections
|
|
sections = potentialSections
|
|
break
|
|
}
|
|
}
|
|
|
|
// If no good pattern found, split by multiple newlines as a fallback
|
|
if (sections.length === 0) {
|
|
sections = content.split(/\n\s*\n\s*\n\s*\n/)
|
|
}
|
|
|
|
// If still no good split, split by double newlines
|
|
if (sections.length === 0 || sections.every((section) => section.trim().length < 20)) {
|
|
sections = content.split(/\n\s*\n\s*\n/)
|
|
}
|
|
|
|
// If still no good split, treat entire content as one section
|
|
if (sections.length === 0 || sections.every((section) => section.trim().length < 10)) {
|
|
sections = [content]
|
|
}
|
|
|
|
return sections.filter((section) => section.trim().length > 0)
|
|
}
|
|
}
|