* feat: add EPUB support
* Add dependencies
This commit is contained in:
parent 26b78ad55a
commit bfd677059e
@@ -0,0 +1,202 @@
import { omit } from 'lodash'
import { IDocument, ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { getFileFromStorage, handleEscapeCharacters, INodeOutputsValue } from '../../../src'
import { EPubLoader } from '@langchain/community/document_loaders/fs/epub'

import * as fs from 'fs'
import * as path from 'path'
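
// Document Loaders node: turns uploaded EPUB files into LangChain documents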
class Epub_DocumentLoaders implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
    outputs: INodeOutputsValue[]

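    // Node identity plus the inputs/outputs rendered in the canvas UI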
    constructor() {
        this.label = 'Epub File'
        this.name = 'epubFile'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'epub.svg'
        this.category = 'Document Loaders'
        this.description = 'Load data from EPUB files'
        this.baseClasses = [this.type]

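        // Inputs: the .epub upload, an optional text splitter, chapter/file granularity, and metadata controls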
        this.inputs = [
            {
                label: 'Epub File',
                name: 'epubFile',
                type: 'file',
                fileType: '.epub'
            },
            {
                label: 'Text Splitter',
                name: 'textSplitter',
                type: 'TextSplitter',
                optional: true
            },
            {
                label: 'Usage',
                name: 'usage',
                type: 'options',
                options: [
                    {
                        label: 'One document per chapter',
                        name: 'perChapter'
                    },
                    {
                        label: 'One document per file',
                        name: 'perFile'
                    }
                ],
                default: 'perChapter'
            },
            {
                label: 'Additional Metadata',
                name: 'metadata',
                type: 'json',
                description: 'Additional metadata to be added to the extracted documents',
                optional: true,
                additionalParams: true
            },
            {
                label: 'Omit Metadata Keys',
                name: 'omitMetadataKeys',
                type: 'string',
                rows: 4,
                description: 'Metadata keys to omit, comma-separated',
                placeholder: 'key1, key2, key3',
                optional: true,
                additionalParams: true
            }
        ]

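        // Outputs: the raw document array, or all page contents concatenated into one string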
        this.outputs = [
            {
                label: 'Document',
                name: 'document',
                description: 'Array of document objects',
                baseClasses: [...this.baseClasses, 'json']
            },
            {
                label: 'Text',
                name: 'text',
                description: 'Concatenated text from documents',
                baseClasses: ['string', 'json']
            }
        ]
    }

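    // Loads the uploaded EPUB(s), applies the optional splitter and metadata rules, then returns the chosen output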
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
        const epubFileBase64 = nodeData.inputs?.epubFile as string
        const usage = nodeData.inputs?.usage as string
        const metadata = nodeData.inputs?.metadata
        const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
        const output = nodeData.outputs?.output as string

        let omitMetadataKeys: string[] = []
        if (_omitMetadataKeys) {
            omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
        }

        let docs: IDocument[] = []
        let files: string[] = []

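        // EPubLoader reads from disk, so incoming file data is staged in a temp directory (removed in the finally block)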
        const tempDir = path.join(process.cwd(), 'temp_epub_files')
        fs.mkdirSync(tempDir, { recursive: true })

        try {
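            // Input arrives either as a 'FILE-STORAGE::' reference (fetched via getFileFromStorage) or as an inline base64 data URI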
            if (epubFileBase64.startsWith('FILE-STORAGE::')) {
                const fileName = epubFileBase64.replace('FILE-STORAGE::', '')
                files = fileName.startsWith('[') && fileName.endsWith(']') ? JSON.parse(fileName) : [fileName]

                const chatflowid = options.chatflowid

                for (const file of files) {
                    if (!file) continue
                    const fileData = await getFileFromStorage(file, chatflowid)
                    const tempFilePath = path.join(tempDir, `${Date.now()}_${file}`)
                    fs.writeFileSync(tempFilePath, fileData)
                    await this.extractDocs(usage, tempFilePath, textSplitter, docs)
                }
            } else {
                files = epubFileBase64.startsWith('[') && epubFileBase64.endsWith(']') ? JSON.parse(epubFileBase64) : [epubFileBase64]

                for (const file of files) {
                    if (!file) continue
                    const splitDataURI = file.split(',')
                    splitDataURI.pop()
                    const fileBuffer = Buffer.from(splitDataURI.pop() || '', 'base64')
                    const tempFilePath = path.join(tempDir, `${Date.now()}_epub_file.epub`)
                    fs.writeFileSync(tempFilePath, fileBuffer)
                    await this.extractDocs(usage, tempFilePath, textSplitter, docs)
                }
            }

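            // Apply metadata rules: merge any user-supplied metadata; '*' drops all original keys, a comma-separated list drops just those keys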
            if (metadata) {
                const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
                docs = docs.map((doc) => ({
                    ...doc,
                    metadata:
                        _omitMetadataKeys === '*'
                            ? {
                                  ...parsedMetadata
                              }
                            : omit(
                                  {
                                      ...doc.metadata,
                                      ...parsedMetadata
                                  },
                                  omitMetadataKeys
                              )
                }))
            } else {
                docs = docs.map((doc) => ({
                    ...doc,
                    metadata:
                        _omitMetadataKeys === '*'
                            ? {}
                            : omit(
                                  {
                                      ...doc.metadata
                                  },
                                  omitMetadataKeys
                              )
                }))
            }

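            // Return shape depends on which output anchor is connected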
            if (output === 'document') {
                return docs
            } else {
                let finaltext = ''
                for (const doc of docs) {
                    finaltext += `${doc.pageContent}\n`
                }
                return handleEscapeCharacters(finaltext, false)
            }
        } catch (error) {
            console.error('Error processing EPUB files:', error)
            throw error
        } finally {
            fs.rmSync(tempDir, { recursive: true, force: true })
        }
    }

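    // Loads a single EPUB from disk (one document per chapter or per file), optionally splits it, and appends the results to docs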
    private async extractDocs(usage: string, filePath: string, textSplitter: TextSplitter, docs: IDocument[]) {
        const loader = new EPubLoader(filePath, { splitChapters: usage === 'perChapter' })
        const loadedDocs = await loader.load()

        const processedDocs = textSplitter ? await textSplitter.splitDocuments(loadedDocs) : loadedDocs

        docs.push(...processedDocs)
    }
}

module.exports = { nodeClass: Epub_DocumentLoaders }
@@ -0,0 +1,4 @@
<svg width="800px" height="800px" viewBox="-10 -5 1034 1034" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">
    <path fill="#000000"
        d="M500 227q-19 0 -32 13l-355 354q-13 14 -13 33t13 32l355 354q13 14 32 14t32 -14l355 -354q13 -13 13 -32t-13 -33l-52 -51l-335 335l-251 -251l251 -252l84 84l-168 168l84 83l251 -251l-219 -219q-13 -13 -32 -13z" />
</svg>
@@ -79,6 +79,7 @@
        "css-what": "^6.1.0",
        "d3-dsv": "2",
        "dotenv": "^16.0.0",
        "epub2": "^3.0.2",
        "exa-js": "^1.0.12",
        "express": "^4.17.3",
        "faiss-node": "^0.5.1",
pnpm-lock.yaml
File diff suppressed because it is too large