import { Document } from '@langchain/core/documents' import { BufferLoader } from 'langchain/document_loaders/fs/buffer' import { read, utils } from 'xlsx' /** * Document loader that uses SheetJS to load documents. * * Each worksheet is parsed into an array of row objects using the SheetJS * `sheet_to_json` method and projected to a `Document`. Metadata includes * original sheet name, row data, and row index */ export class LoadOfSheet extends BufferLoader { attributes: { name: string; description: string; type: string }[] = [] constructor(filePathOrBlob: string | Blob) { super(filePathOrBlob) this.attributes = [] } /** * Parse document * * NOTE: column labels in multiple sheets are not disambiguated! * * @param raw Raw data Buffer * @param metadata Document metadata * @returns Array of Documents */ async parse(raw: Buffer, metadata: Document['metadata']): Promise { const result: Document[] = [] this.attributes = [ { name: 'worksheet', description: 'Sheet or Worksheet Name', type: 'string' }, { name: 'rowNum', description: 'Row index', type: 'number' } ] const wb = read(raw, { type: 'buffer' }) for (let name of wb.SheetNames) { const fields: Record> = {} const ws = wb.Sheets[name] if (!ws) continue const aoo = utils.sheet_to_json(ws) as Record[] aoo.forEach((row) => { result.push({ pageContent: Object.entries(row) .map((kv) => `- ${kv[0]}: ${kv[1]}`) .join('\n') + '\n', metadata: { worksheet: name, rowNum: row['__rowNum__'], ...metadata, ...row } }) Object.entries(row).forEach(([k, v]) => { if (v != null) (fields[k] || (fields[k] = {}))[v instanceof Date ? 'date' : typeof v] = true }) }) Object.entries(fields).forEach(([k, v]) => this.attributes.push({ name: k, description: k, type: Object.keys(v).join(' or ') }) ) } return result } }