From 61a0cee55103fe990cee9b761a38b151ad0421e1 Mon Sep 17 00:00:00 2001
From: chungyau97
Date: Sat, 6 May 2023 11:46:50 +0700
Subject: [PATCH] add token text splitter

---
Note: the tiktoken.svg hunk below carries seven empty added lines; the SVG
markup itself is not present in this copy of the patch.

 .../TokenTextSplitter/TokenTextSplitter.ts    | 109 ++++++++++++++++++
 .../TokenTextSplitter/tiktoken.svg            |   7 ++
 packages/components/package.json              |   2 +-
 3 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 packages/components/nodes/textsplitters/TokenTextSplitter/TokenTextSplitter.ts
 create mode 100644 packages/components/nodes/textsplitters/TokenTextSplitter/tiktoken.svg

diff --git a/packages/components/nodes/textsplitters/TokenTextSplitter/TokenTextSplitter.ts b/packages/components/nodes/textsplitters/TokenTextSplitter/TokenTextSplitter.ts
new file mode 100644
index 000000000..8c8d6abea
--- /dev/null
+++ b/packages/components/nodes/textsplitters/TokenTextSplitter/TokenTextSplitter.ts
@@ -0,0 +1,109 @@
+import { INode, INodeData, INodeParams } from '../../../src/Interface'
+import { getBaseClasses } from '../../../src/utils'
+import { TokenTextSplitter, TokenTextSplitterParams } from 'langchain/text_splitter'
+import { TiktokenEncoding } from '@dqbd/tiktoken'
+
+/**
+ * Parses an optional integer node input.
+ *
+ * Accepts a number or a numeric string. Returns undefined when the value is
+ * missing, blank, or not parseable as an integer, so the caller can leave the
+ * option unset instead of handing NaN to the splitter.
+ */
+const parseOptionalInt = (value: unknown): number | undefined => {
+    if (value === undefined || value === null || value === '') return undefined
+    const parsed = parseInt(String(value), 10)
+    return Number.isNaN(parsed) ? undefined : parsed
+}
+
+/**
+ * Node definition exposing LangChain's TokenTextSplitter with a selectable
+ * tiktoken encoding, chunk size and chunk overlap.
+ */
+class TokenTextSplitter_TextSplitters implements INode {
+    label: string
+    name: string
+    description: string
+    type: string
+    icon: string
+    category: string
+    baseClasses: string[]
+    inputs: INodeParams[]
+
+    constructor() {
+        this.label = 'Token Text Splitter'
+        this.name = 'tokenTextSplitter'
+        this.type = 'TokenTextSplitter'
+        this.icon = 'tiktoken.svg'
+        this.category = 'Text Splitters'
+        this.description = `Splits a raw text string by first converting the text into BPE tokens, then split these tokens into chunks and convert the tokens within a single chunk back into text.`
+        this.baseClasses = [this.type, ...getBaseClasses(TokenTextSplitter)]
+        this.inputs = [
+            {
+                label: 'Encoding Name',
+                name: 'encodingName',
+                type: 'options',
+                options: [
+                    {
+                        label: 'gpt2',
+                        name: 'gpt2'
+                    },
+                    {
+                        label: 'r50k_base',
+                        name: 'r50k_base'
+                    },
+                    {
+                        label: 'p50k_base',
+                        name: 'p50k_base'
+                    },
+                    {
+                        label: 'p50k_edit',
+                        name: 'p50k_edit'
+                    },
+                    {
+                        label: 'cl100k_base',
+                        name: 'cl100k_base'
+                    }
+                ],
+                default: 'gpt2'
+            },
+            {
+                label: 'Chunk Size',
+                name: 'chunkSize',
+                type: 'number',
+                default: 1000,
+                optional: true
+            },
+            {
+                label: 'Chunk Overlap',
+                name: 'chunkOverlap',
+                type: 'number',
+                optional: true
+            }
+        ]
+    }
+
+    /**
+     * Builds a TokenTextSplitter from the node's configured inputs.
+     *
+     * @param nodeData - node data whose `inputs` may carry encodingName, chunkSize and chunkOverlap
+     * @returns the configured splitter
+     */
+    async init(nodeData: INodeData): Promise<TokenTextSplitter> {
+        const params: Partial<TokenTextSplitterParams> = {
+            encodingName: nodeData.inputs?.encodingName as TiktokenEncoding
+        }
+
+        // Set only the values that were supplied so omitted options are left unset;
+        // an explicit 0 is passed through rather than being treated as missing.
+        const chunkSize = parseOptionalInt(nodeData.inputs?.chunkSize)
+        if (chunkSize !== undefined) params.chunkSize = chunkSize
+
+        const chunkOverlap = parseOptionalInt(nodeData.inputs?.chunkOverlap)
+        if (chunkOverlap !== undefined) params.chunkOverlap = chunkOverlap
+
+        return new TokenTextSplitter(params)
+    }
+}
+
+module.exports = { nodeClass: TokenTextSplitter_TextSplitters }
diff --git a/packages/components/nodes/textsplitters/TokenTextSplitter/tiktoken.svg b/packages/components/nodes/textsplitters/TokenTextSplitter/tiktoken.svg
new file mode 100644
index 000000000..833cfae10
--- /dev/null
+++ b/packages/components/nodes/textsplitters/TokenTextSplitter/tiktoken.svg
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/packages/components/package.json b/packages/components/package.json
index 884d02e6c..f31ebf689 100644
--- a/packages/components/package.json
+++ b/packages/components/package.json
@@ -16,7 +16,7 @@
     },
     "license": "SEE LICENSE IN LICENSE.md",
     "dependencies": {
-        "@dqbd/tiktoken": "^1.0.4",
+        "@dqbd/tiktoken": "^1.0.7",
         "@huggingface/inference": "^1.6.3",
         "@pinecone-database/pinecone": "^0.0.12",
         "@supabase/supabase-js": "^2.21.0",