Flowise/packages/components/nodes/documentloaders/MicrosoftPowerpoint/PowerpointLoader.ts

import { Document } from '@langchain/core/documents'
import { BufferLoader } from 'langchain/document_loaders/fs/buffer'
import { parseOfficeAsync } from 'officeparser'

/**
 * Document loader that uses officeparser to load PowerPoint documents.
 *
 * Each slide is parsed into a separate Document with metadata including
 * slide number and extracted text content.
 */
export class PowerpointLoader extends BufferLoader {
    attributes: { name: string; description: string; type: string }[] = []

    constructor(filePathOrBlob: string | Blob) {
        super(filePathOrBlob)
        this.attributes = []
    }

    /**
     * Parse PowerPoint document
     *
     * @param raw Raw data Buffer
     * @param metadata Document metadata
     * @returns Array of Documents
     */
    async parse(raw: Buffer, metadata: Document['metadata']): Promise<Document[]> {
        const result: Document[] = []

        this.attributes = [
            { name: 'slideNumber', description: 'Slide number', type: 'number' },
            { name: 'documentType', description: 'Type of document', type: 'string' }
        ]

        try {
            // Use officeparser to extract text from PowerPoint
            const data = await parseOfficeAsync(raw)

            if (typeof data === 'string' && data.trim()) {
                // Split content by common slide separators or use the entire content as one document
                const slides = this.splitIntoSlides(data)

                slides.forEach((slideContent, index) => {
                    if (slideContent.trim()) {
                        result.push({
                            pageContent: slideContent.trim(),
                            metadata: {
                                slideNumber: index + 1,
                                documentType: 'powerpoint',
                                ...metadata
                            }
                        })
                    }
                })
            }
        } catch (error) {
            console.error('Error parsing PowerPoint file:', error)
            throw new Error(`Failed to parse PowerPoint file: ${error instanceof Error ? error.message : 'Unknown error'}`)
        }

        return result
    }

    /**
     * Split content into slides based on common patterns
     * This is a heuristic approach since officeparser returns plain text
     */
    private splitIntoSlides(content: string): string[] {
        // Try to split by common slide patterns
        const slidePatterns = [
            /\n\s*Slide\s+\d+/gi,
            /\n\s*Page\s+\d+/gi,
            /\n\s*\d+\s*\/\s*\d+/gi,
            /\n\s*_{3,}/g, // Underscores as separators
            /\n\s*-{3,}/g // Dashes as separators
        ]

        let slides: string[] = []

        // Try each pattern and use the one that creates the most reasonable splits
        for (const pattern of slidePatterns) {
            const potentialSlides = content.split(pattern)
            if (potentialSlides.length > 1 && potentialSlides.length < 100) {
                // Reasonable number of slides
                slides = potentialSlides
                break
            }
        }

        // If no good pattern found, split by double newlines as a fallback
        if (slides.length === 0) {
            slides = content.split(/\n\s*\n\s*\n/)
        }

        // If still no good split, treat entire content as one slide
        if (slides.length === 0 || slides.every((slide) => slide.trim().length < 10)) {
            slides = [content]
        }

        return slides.filter((slide) => slide.trim().length > 0)
    }
}