From 6af00db86cf06c1b25099ad39bfd49ab09a87281 Mon Sep 17 00:00:00 2001 From: chungyau97 Date: Mon, 24 Apr 2023 22:16:44 +0700 Subject: [PATCH 1/2] add Docx loader --- .../nodes/documentloaders/Docx/Docx.png | Bin 0 -> 8259 bytes .../nodes/documentloaders/Docx/Docx.ts | 59 ++++++++++++++++++ packages/components/package.json | 1 + 3 files changed, 60 insertions(+) create mode 100644 packages/components/nodes/documentloaders/Docx/Docx.png create mode 100644 packages/components/nodes/documentloaders/Docx/Docx.ts diff --git a/packages/components/nodes/documentloaders/Docx/Docx.png b/packages/components/nodes/documentloaders/Docx/Docx.png new file mode 100644 index 0000000000000000000000000000000000000000..6d527bd2dd4289623b925b0ab76ded6352b0641f GIT binary patch literal 8259 zcmeHsWmJ^izxEzlX^?J^l#)hTq#L9=q>&f|Bu5dE1_41rKuYNz8hk)Hq@}wXB&6nL zcs{(J&N}~(XRUMYwPs(l_x{E8yY{}<+;i_RO?3s_`;_+q0Jutu&$IzRx&4X)uuyNI zE$?C1Eucc5Dm}--!kS*xSh(%pb5k^g0)VrN{6Joe0j1kP3Qsu$PaRiVPj5?)7r@)w zo7>@~6V%$$?FF~1hh6HPIOS~|psD&?4uL@2+W+76e+vBXQ~;X^zIaMovLVMNLCXN6)~>^pKf_m5rU_5hoWn5AS0>egQ!t zVG+?MV&W2#QqnR{WuM8(D<~={tEj4}YiMd|>pa)h(>E|QGBz*4ONu+`Rm6-wO(hic3n%$}1|Xs%zl2b@dI6P0cN>KmKWJ|Jl*m)!ozE*FP}$ zYiM|6bZmU$_vF;{%9DSJyY&Sd)SP zP!%XWlX>nlwVVDLrang1xnGq0la%ih4x?d32%D4ECrzDGt!^Ub&7|k!EQ!MfG>>zh z1_s9A5HYe_IV-)O8vV^@oos2Fe4PF9EWK{9ZoTfVoDH(Ri7jz zUqFJWUnhjR1`vLI0D1`yuY5If&xPe1u^n@s0@Y`=ukuiRzoE7cV!`J;rLlFF%xQj% zXW-SE5dvSP@peyXLqS0PMw9qLD{2!qLFk|yk(i8^94sC0YmN>+dN!YDRycN*pxrakbnW>!bL|0 z$b*1th($J%TbU2iT@-#-!Dcm*8d`@Z0A-n6LwDgTG?CGEWbd+Y-en=X%>vsQfq>Qm z_3=AKc%LwDPBV%- zmpeu~kUJL_WT-A-8UUQfv+my!s`+{c`j$vAU_>%H{g)R$vig@|chzTeBg;PZlSB5L z{T|42oCk~>?ykh?gQ!FMv;uB!_eR=!hfANAH>#GDAZ{7(14D8_;%SRf*4-~)}=VCL(x zJ{oKg0yD1K2?u2lK&>1PGT#Qyts%WV39>4#)7z~<30weDsNFBtNY$$Nu#t)EjZVIl z7EP_j22QxU5&=yGi37C%$v+jElrU}6_o~U|&Xe5yW&-e=={5 z3VYDAmMLPV0sgMUC-94M1G{YV4JoljTkl7I7kNAXp`*?~!E7-1^TiM)MtF6*l~7NO zj=wnw)d;a(9U66OST^^?ZRCg_PpA>T5#9;aSRKBhat^r8c(3qcWYg>PXUm$>xsGId zk%Rt~%HDh77o&S0Pur#bIGt;&2hccTgZM=y!;?=mhpq{j3Oxml@m!s{i0{rh| z_LRgFw@Y!-;3b$k=&c1iwFMm&eo7PvwHkqjN(7B`d-3LH#$bWswl84~fx#Cg|DM#K z1Jc%jBd0h0seugLRS*8V1*OJVNJBXFq|YP9#M++BYL{+}5i$%t9@`RbIX*nozuBvu zY#iM!l=Kr`95UP8(&r=!)OFs?+g;kDdg3rtLahr#nw;d8+RHU0pOm@6#ioT^pXVEe zXlrGqkZR>QHW^RNPwx+D@ieE`E`6&sKkb{yP9+(?=FsBmD1Ne?FxeI%4}T=HS5h+W zejbTJqBpERGWEj#0BwCsHV}h8pmW$^`Ep*Xq4u{z$Bd-asgPaZgBA@(-49L;Zrfp@ z&TwC^O@g?sNZpH|k6W(|y-4a^{yH;d#|)DxQ2%)Z0>1ili0Z-jA3dsYiIQF0@nUSq zeCvw%6B(ta!P4>BO*-3wHrHznN*QQjaq9Chz~ya_Ii%PhaX=J2K2>vk^^0lAolWwp zOieEyGv#N?Ml34AY&PwFXFwy4g(}^-lyq4STD4 zVm&{IETdR*N0pBwQP96gNu)-afc}UwOq@^_9{58m0WZ z^Ow;0i9RLKjMLSTa}qI*M<7MdJp`X;iIvmpEQ?19K}pK_>EH`_x(wWTwYP81N!1Kg zWo8+&$hu$Y^~#54@L!(N(OCU7oA9q3KiSON{$^~iX69iU-3Lr+gP%#o7|*Cq;Z%%e^r9}hgl_g2G|cp^tZPZaS6E!^E1>ne3ElQC)ihpGk3_lKV`xZ zh5UMJe&T&4SPU&G>rd2+*#E@N+lWY%T+C11lekh=_X*woNC(5Vt~8PAM7+(GyM**& z8w=?k$cB!zV9Xj0zwr!ukB$P28h>51F5VYP@qf;FV@_7?ycv-GTJ_@l5o|y$=VRrU z+!0h%;POqHvm$lU(c_cimA-<3aL!q3V7xO;9#nB!$7UPf00uIn9^=VN`Eij>+BhFI zC~kZ8YzMdvyt27exF7vAT+3xU-J1?DmlOu6G&EfN*`KSCDf&zy)^_p7m0W|PU10OH zeFdGr`N@_xghLCtmuGc5|tv%OBtj7L zp2(w>4%VbC>XMVrqs2sogA8?3O^Cm|-1_!!W^G*T{UlUa#1x^cNo}<8S6@zqM1#@U z*Qzi65xR9@RwI$7WA=DaH%ebT|KBkN?`TLP0SZLVV%SHum>4tu^o0Id%hz|4MHKeYdeS6fF&%WGOHVQT+}qb~fCC$z(Qh zj*RBm*%nKW>+y8!9t8*e(x>DEXx_x+awaBm*}wa!x^$|jr!#-gNyrd^)>TuzOiF}WM99V zu;%*QzBBY>xV_~%!u4CZRY4Ib^Yf_*-7tNe`f^^?-ZDct^U~xjJTC3g{bFmc@2QK_ zy7+W-{@fS~o;1mNg`CLL1PC<+*&=`J~2&rj>C~aB9=VehM^*#%m zUTiSk)?1dWRd%gce-iLEdalEe^4GR%)bh+CmTmTzoFehlGlV(s$;?MIP?HvKHCbhG z9NN_pg8$*kMc~xefqP2%DsC+4&cy+46#|wl9J{59_(PXYwuF>{j0Bxro0B zW!d<@UQ1*Jk=jc{G&dPh^yxJzQ1~t`1(6ZM8)ZF!w#e*}-LY`qwpo%ge9^EF&amxO ze>id4k&&Kl%WlQpyebx+hoguA>=>Ro#kK4**^K0?iCmyHv1!s5b0-GMRa<#QM2?0= z?oRC2CtT_H0h*d8H%GhSp+W2k)Qf{oFwn z_KYK$WUrC~X;P1lO?;IQHqCj$!`nfpof+59uE|N)H(cM81aRLzfPJP-lGvL)(y3U4 z=3S*xu!qk;rUbWX01P+JOc?qHaYKTA`xw|#;X8+m8H?+kdJN?`3YHI77x!{wyxCz( zJpL|aKAeb-2L4#7B%cvSn+&?z3=c*;Tc@>Ox&7A#c3$ugu~&)l0)ww%*A+Tlw)a?O z)6`1#-@P7`il>-na2Nn?H5FkSkv4B2t=~1v_4%4U-(Xh;=Jkpd6KublYo{I-KU_%Y z$07ja=^oeb?fC2Sh9Q+N^YRl|eS|I}orT8Pfk>|@iWBokl!M6MA9+9D*Bq`&lA`C5 z+?kA=3q2{e2F_SSk!snEFo=by!@N^2Q(v=aT~yA`y)XS;{vluMq9LuRWQWRf7t!cA zpe63K^vpi7grX=Z0xm$3g-(v$ClQEox@N+keR^FpF^{^s|BKHTd;}aaz zV+w3GKp88%P``;2ciu#`YU07Q0E^XA!#LrCkB!%VUppLL)0ukk@(!HT@ z?$On@VQQM`d2Zpa5l6eP1aAD$@qQQ)D1sd#3pJ`HaW%W4hr*IZS=~V9F}<)~)x{TQ!5>xe z8nGFxh63jTlLfOR)F>$f_n;EMaFNle(ph+tW1wSIsMN%dPxKMr`h7eI&t^v>fLNeA zNOn{jQqD37?XK!~4;~eect@oxDW2j&cpQ9TfcL$?el>(ArexgnpQ6-FE)28KFrg|Y zrrsNd1!mK9LUK?=-`}z`R?^gU)s*ZQAKz$Z+~ylkYAp5u;i?*9_e>hNy${@YO-C)0 zkEX`Axi(^>S9=y{^8~y2z))d{M+B`g=fE5l0{fYBV!z*r?Mv>KuOu}{qFVi1!_W4M zR48or+s}{cZ-6wkoEuoz*+i9G^oi%Rge|q2Z>98`EvB^M`fX=S`sl#vhdRRAJKt8ouODn! z-rS}$-KtTs@gYAmwyQU%w>L@|7R(jie)OBf=)!IFx?m+WI(Ue8(UoLC9NHSgD#cvT zy6}>`Q`G!7RgfbV`SpbVx$PGBTj5?DX;QnHLjIa;I}}w_bm}~8ym{Ts!>7^E@cnWU z;5ck~Fppq43l~VDfSFNGKY3Lf{R=Ap^PTWEj*vB9u(yRt6TeW0RPdA3(~O5d=|aZ) zm{Bc=Iz366RHf7{7iHc?--K{0=}JyK;J#RIm~Og%z7O(=Uy+gUzd(n18^F%qyiTeL z-s5|R^$B~esAl8jp|{7iwRt-5AaQFx^l?bb0d6tab?QNvG!kSv)eol1Iz_#BRnHD` zr+seN1*SK3TMLr#b2p@C<>DT&Lw;+A*nRG*zYK?V1BjGsWM=7piQmts&7tyBP)Dq~ z`{(1ULQ=89Lu&HC*3F4wEbppEOmxO4?nYBoJzTfqW}`3DNLE2iC@i)RW|nU_yuCQD zh}k*`@o!sk*b7UU3=Cf9<8`f07=UdE9SmMm*vYCZ?iOTVqumsCEd0VpMz>e}66{>h8KAqb1=%r5k zc+@BwU0>}ytut%+LH#~FNbnI|#`HL5peaV{>A9wC;hEv)sJ!}E(AJ*?^HV~epv(eT zq4LS~2k|yxeg_|}C`p?o+8>eR9nHD=>R!q~x~DUE8!O@IOZx-saN*jUsdem~6r-_1 zUPMYm9qB~qhq}Unpij1|4S)Iuc+8xS2YJLhRtPkW9jx@eruTvm)BEL;NpTe~R;ML6 zK1_1jRc-G2IAq6QIY{k$ba?LVB^6MZYOaybuNYXypeGd!UG2$a&!=7Z{_`Q!7hbf0 z?qBeoA=7L(wYq9GWDjNGAM7H}^!I8h)%My=p5G|q2|J7>$ikjK=h4e=%Uthh7jBVX znQ-#FdAZl0LStm*i1n?ZM^*}@q~D7IG_xDzV63sFMp0&n`4y$mczqG(M;qnX?stLG zLWpngd(EtZrRvTOh@Wn$@Z-fQDG^P(V;k6f4Iz~gUCoJ;z<{U z4U35>Puo;&NovJ!D;o&aDEng>oMX9Dr71|qi+_jIPK+A*8<$r6wBLPB{3{XexGT8G zKySy97@x(fVgBzN{bIyQ6CYhz7?i{>YjL@*e>m6hIGT)AbC zCBcV6kQm$TuRRCNBJJyVdBE;h`8~1pBKSAKFZ#)hW2_BEI-n)1^LcQ}WKh5I2dc)o z;w&e2z>l-`_b|#J$#=(CznVYdzb8`OgS%-XKiVwtz>NoO%+dNuu^ ziaW;omiUMe$P|+rXcWv&3U*a3dhj1}PKt{>9EQ8jbXj9{ZMFs4TV$MJC39i}hGG^v zn;N#shbR+j)n<#VyNL!FYxPj{_b{W#MLqoirp$JC3h-fvcWfXY{;?q z^_%IrooUvqFU+q|WTkiIWS1KiS}owH1kimdDi^7M%fara=rsjVbC%gde`1!~&CxB@ z=Kox|NUFgjNt20I#ZjKtToA+Tj3)@_C=|x`MFXR)L%AMG1rU2bC)Y-FtJP5T8;N`k zaZf69D=dG+hprD1+;xXEG%83x3PL$ZB0eoiMT~z7q%od-UMjzuOe7~h-xas&DBwUf zv0v_9s&4Gg8zWj2$*dtTV$6gUzR7F3>X2Ww#?mrTs#_%KuP+=yhZs6TpWieLb+~t; zDKbGfaLfD5v}MLC#q_l)1kelL`|n-~{U`HSKpJ^|NdREaQ7t5A zSXpg+gDVr_`VFb1tcvZ&{y9E zJoV&ypXyNcjhTUgu~@aK@5^(LeC_QSMr-xV+qXwbB#2Fa(=%qShm4#zI$yCX^?InKO|EiLU(6)K zM|rX14r3}P>-nL@WKWmrPCT6UmNKc)I$y8ic(|TA_F8|48F?~3-2b!!=M+VVRoI|sDoRb~`_zaaQuNB~&pNF}6!VIqI Yk{xn44VSupAgfi9Q-4`16q*SW&i*H literal 0 HcmV?d00001 diff --git a/packages/components/nodes/documentloaders/Docx/Docx.ts b/packages/components/nodes/documentloaders/Docx/Docx.ts new file mode 100644 index 000000000..627103ab9 --- /dev/null +++ b/packages/components/nodes/documentloaders/Docx/Docx.ts @@ -0,0 +1,59 @@ +import { INode, INodeData, INodeParams } from '../../../src/Interface' +import { TextSplitter } from 'langchain/text_splitter' +import { DocxLoader } from 'langchain/document_loaders/fs/docx' + +class Docx_DocumentLoaders implements INode { + label: string + name: string + description: string + type: string + icon: string + category: string + baseClasses: string[] + inputs: INodeParams[] + + constructor() { + this.label = 'Docx File' + this.name = 'docxFile' + this.type = 'Document' + this.icon = 'Docx.png' + this.category = 'Document Loaders' + this.description = `Load data from Docx files` + this.baseClasses = [this.type] + this.inputs = [ + { + label: 'Docx File', + name: 'docxFile', + type: 'file', + fileType: '.docx' + }, + { + label: 'Text Splitter', + name: 'textSplitter', + type: 'TextSplitter', + optional: true + } + ] + } + + async init(nodeData: INodeData): Promise { + const textSplitter = nodeData.inputs?.textSplitter as TextSplitter + const docxFileBase64 = nodeData.inputs?.docxFile as string + const splitDataURI = docxFileBase64.split(',') + splitDataURI.pop() + const bf = Buffer.from(splitDataURI.pop() || '', 'base64') + + const blob = new Blob([bf]) + const loader = new DocxLoader(blob) + + if (textSplitter) { + const docs = await loader.loadAndSplit(textSplitter) + return docs + } else { + const docs = await loader.load() + return docs + } + } +} + +module.exports = { nodeClass: Docx_DocumentLoaders } diff --git a/packages/components/package.json b/packages/components/package.json index 793356063..1d8af269e 100644 --- a/packages/components/package.json +++ b/packages/components/package.json @@ -28,6 +28,7 @@ "express": "^4.17.3", "form-data": "^4.0.0", "langchain": "^0.0.63", + "mammoth": "^1.5.1", "moment": "^2.29.3", "node-fetch": "2", "pdf-parse": "^1.1.1", From 8a3d47b92c51ede57b799e13f493d4d1e2359415 Mon Sep 17 00:00:00 2001 From: chungyau97 Date: Mon, 24 Apr 2023 23:58:30 +0700 Subject: [PATCH 2/2] docx node code cleanup --- packages/components/nodes/documentloaders/Docx/Docx.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/components/nodes/documentloaders/Docx/Docx.ts b/packages/components/nodes/documentloaders/Docx/Docx.ts index 627103ab9..bfc859b96 100644 --- a/packages/components/nodes/documentloaders/Docx/Docx.ts +++ b/packages/components/nodes/documentloaders/Docx/Docx.ts @@ -18,7 +18,7 @@ class Docx_DocumentLoaders implements INode { this.type = 'Document' this.icon = 'Docx.png' this.category = 'Document Loaders' - this.description = `Load data from Docx files` + this.description = `Load data from DOCX files` this.baseClasses = [this.type] this.inputs = [ {