diff --git a/packages/components/nodes/chains/ConversationChain/ConversationChain.ts b/packages/components/nodes/chains/ConversationChain/ConversationChain.ts
index fcd9921e5..0bba9b3c2 100644
--- a/packages/components/nodes/chains/ConversationChain/ConversationChain.ts
+++ b/packages/components/nodes/chains/ConversationChain/ConversationChain.ts
@@ -1,4 +1,4 @@
-import { FlowiseMemory, ICommonObject, IMessage, INode, INodeData, INodeParams } from '../../../src/Interface'
+import { FlowiseMemory, ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
 import { ConversationChain } from 'langchain/chains'
 import { getBaseClasses } from '../../../src/utils'
 import { ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder, SystemMessagePromptTemplate } from 'langchain/prompts'
@@ -8,6 +8,8 @@ import { flatten } from 'lodash'
 import { Document } from 'langchain/document'
 import { RunnableSequence } from 'langchain/schema/runnable'
 import { StringOutputParser } from 'langchain/schema/output_parser'
+import { addImagesToMessages, processSpeechToText } from '../../../src/MultiModalUtils'
+import { HumanMessage } from 'langchain/schema'

 let systemMessage = `The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.`
 const inputKey = 'input'
@@ -67,13 +69,15 @@ class ConversationChain_Chains implements INode {
     }

     async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
-        const chain = prepareChain(nodeData, this.sessionId, options.chatHistory)
+        const chain = prepareChain(nodeData, options, this.sessionId)
         return chain
     }

     async run(nodeData: INodeData, input: string, options: ICommonObject): Promise<string> {
         const memory = nodeData.inputs?.memory
-        const chain = prepareChain(nodeData, this.sessionId, options.chatHistory)
+        input = await processSpeechToText(nodeData, input, options)
+
+        const chain = prepareChain(nodeData, options, this.sessionId)
         const loggerHandler = new ConsoleCallbackHandler(options.logger)
         const callbacks = await additionalCallbacks(nodeData, options)
@@ -105,7 +109,7 @@ class ConversationChain_Chains implements INode {
     }
 }

-const prepareChatPrompt = (nodeData: INodeData) => {
+const prepareChatPrompt = (nodeData: INodeData, options: ICommonObject) => {
     const memory = nodeData.inputs?.memory as FlowiseMemory
     const prompt = nodeData.inputs?.systemMessagePrompt as string
     const docs = nodeData.inputs?.document as Document[]
@@ -128,16 +132,31 @@ const prepareChatPrompt = (nodeData: INodeData) => {

     if (finalText) systemMessage = `${systemMessage}\nThe AI has the following context:\n${finalText}`

-    const chatPrompt = ChatPromptTemplate.fromMessages([
+    // TODO: add audio uploads
+    // if (options.uploads.length > 0) {
+    //     const audioUploads = getAudioUploads(options.uploads)
+    //     for (const upload of audioUploads) {
+    //         await this.processAudioWithWhisper(upload, chatMessages)
+    //     }
+    // }
+    const imageContent = addImagesToMessages(nodeData, options)
+
+    // TODO: this should not be any[]; work out the correct prompt message interface
+    let promptMessages: any[] = [
         SystemMessagePromptTemplate.fromTemplate(prompt ? `${prompt}\n${systemMessage}` : systemMessage),
         new MessagesPlaceholder(memory.memoryKey ?? 'chat_history'),
         HumanMessagePromptTemplate.fromTemplate(`{${inputKey}}`)
-    ])
+    ]
+    if (imageContent.length > 0) {
+        promptMessages.push(new HumanMessage({ content: imageContent }))
+    }
+    const chatPrompt = ChatPromptTemplate.fromMessages(promptMessages)
     return chatPrompt
 }

-const prepareChain = (nodeData: INodeData, sessionId?: string, chatHistory: IMessage[] = []) => {
+const prepareChain = (nodeData: INodeData, options: ICommonObject, sessionId?: string) => {
+    const chatHistory = options.chatHistory
     const model = nodeData.inputs?.model as BaseChatModel
     const memory = nodeData.inputs?.memory as FlowiseMemory
     const memoryKey = memory.memoryKey ?? 'chat_history'
@@ -150,7 +169,7 @@ const prepareChain = (nodeData: INodeData, sessionId?: string, chatHistory: IMes
                 return history
             }
         },
-        prepareChatPrompt(nodeData),
+        prepareChatPrompt(nodeData, options),
         model,
         new StringOutputParser()
     ])
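For context, here is a minimal, self-contained sketch of what `prepareChatPrompt` assembles once an image upload is present. It is not part of the diff; the system prompt is invented for the example and the base64 payload is a truncated placeholder.

```ts
import { ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder, SystemMessagePromptTemplate } from 'langchain/prompts'
import { HumanMessage } from 'langchain/schema'

// One uploaded image, already converted to a data URL the way addImagesToMessages does it
const imageContent = [{ type: 'image_url', image_url: { url: 'data:image/png;base64,iVBORw0K...', detail: 'low' } }]

const chatPrompt = ChatPromptTemplate.fromMessages([
    SystemMessagePromptTemplate.fromTemplate('You are a helpful assistant.'),
    new MessagesPlaceholder('chat_history'),
    HumanMessagePromptTemplate.fromTemplate('{input}'),
    // the image rides along as an extra, fully formed human message after the text input
    new HumanMessage({ content: imageContent })
])

const main = async () => {
    const messages = await chatPrompt.formatMessages({ chat_history: [], input: 'What is in this picture?' })
    console.log(messages.length) // 3 rendered messages: system, human text, human image
}
main()
```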
diff --git a/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts b/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts
index 493261637..bc5814d02 100644
--- a/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts
+++ b/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts
@@ -19,7 +19,7 @@ class ChatOpenAI_ChatModels implements INode {
     constructor() {
         this.label = 'ChatOpenAI'
         this.name = 'chatOpenAI'
-        this.version = 2.0
+        this.version = 3.0
         this.type = 'ChatOpenAI'
         this.icon = 'openai.svg'
         this.category = 'Chat Models'
@@ -152,6 +152,73 @@ class ChatOpenAI_ChatModels implements INode {
                 type: 'json',
                 optional: true,
                 additionalParams: true
+            },
+            {
+                label: 'Allow Image Uploads',
+                name: 'allowImageUploads',
+                type: 'boolean',
+                default: false,
+                optional: true
+            },
+            {
+                label: 'Allow Audio Uploads',
+                name: 'allowAudioUploads',
+                type: 'boolean',
+                default: false,
+                optional: true
+            },
+            {
+                label: 'Allow Speech to Text',
+                name: 'allowSpeechToText',
+                type: 'boolean',
+                default: false,
+                optional: true
+            },
+            // TODO: only show this option when allowSpeechToText is true
+            {
+                label: 'Speech to Text Method',
+                description: 'How to turn audio into text',
+                name: 'speechToTextMode',
+                type: 'options',
+                options: [
+                    {
+                        label: 'Transcriptions',
+                        name: 'transcriptions',
+                        description:
+                            'Transcribe the audio in its original language. This is the default method when Speech to Text is turned on.'
+                    },
+                    {
+                        label: 'Translations',
+                        name: 'translations',
+                        description: 'Translate the audio and transcribe it into English.'
+                    }
+                ],
+                optional: false,
+                default: 'transcriptions',
+                additionalParams: true
+            },
+            {
+                label: 'Image Resolution',
+                description: 'Controls the resolution at which the model views the image.',
+                name: 'imageResolution',
+                type: 'options',
+                options: [
+                    {
+                        label: 'Low',
+                        name: 'low'
+                    },
+                    {
+                        label: 'High',
+                        name: 'high'
+                    },
+                    {
+                        label: 'Auto',
+                        name: 'auto'
+                    }
+                ],
+                default: 'low',
+                optional: false,
+                additionalParams: true
             }
         ]
     }
@@ -168,6 +235,12 @@ class ChatOpenAI_ChatModels implements INode {
         const basePath = nodeData.inputs?.basepath as string
         const baseOptions = nodeData.inputs?.baseOptions

+        const allowImageUploads = nodeData.inputs?.allowImageUploads as boolean
+        const allowAudioUploads = nodeData.inputs?.allowAudioUploads as boolean
+        const allowSpeechToText = nodeData.inputs?.allowSpeechToText as boolean
+        const speechToTextMode = nodeData.inputs?.speechToTextMode as string
+        const imageResolution = nodeData.inputs?.imageResolution as string
+
         const credentialData = await getCredentialData(nodeData.credential ?? '', options)
         const openAIApiKey = getCredentialParam('openAIApiKey', credentialData, nodeData)
@@ -200,6 +273,20 @@ class ChatOpenAI_ChatModels implements INode {
             basePath,
             baseOptions: parsedBaseOptions
         })
+
+        const multiModal = {
+            allowImageUploads: allowImageUploads ?? false,
+            allowAudioUploads: allowAudioUploads ?? false,
+            allowSpeechToText: allowSpeechToText ?? false,
+            imageResolution,
+            speechToTextMode
+        }
+        Object.defineProperty(model, 'multiModal', {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: multiModal
+        })
         return model
     }
 }
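The `multiModal` bag attached above is read back by the helpers in `MultiModalUtils.ts`. Below is a sketch of the consumer side; the `MultiModalOption` interface is illustrative only, the diff itself works with an untyped object literal.

```ts
import { ChatOpenAI } from 'langchain/chat_models/openai'

// Illustrative shape of the object attached via Object.defineProperty above
interface MultiModalOption {
    allowImageUploads: boolean
    allowAudioUploads: boolean
    allowSpeechToText: boolean
    imageResolution?: string
    speechToTextMode?: string
}

// ChatOpenAI's type does not declare the extra property, hence the cast
const getMultiModalConfig = (model: ChatOpenAI): MultiModalOption | undefined => (model as any).multiModal

// usage: const cfg = getMultiModalConfig(someModel); if (cfg?.allowImageUploads) { ... }
```

A plain `(model as any).multiModal = multiModal` assignment would behave the same here; `Object.defineProperty` only makes the enumerable/configurable/writable flags explicit.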
diff --git a/packages/components/src/MultiModalUtils.ts b/packages/components/src/MultiModalUtils.ts
new file mode 100644
index 000000000..513915a57
--- /dev/null
+++ b/packages/components/src/MultiModalUtils.ts
@@ -0,0 +1,87 @@
+import { ICommonObject, INodeData } from './Interface'
+import { BaseChatModel } from 'langchain/chat_models/base'
+import { type ClientOptions, OpenAIClient } from '@langchain/openai'
+import { ChatOpenAI } from 'langchain/chat_models/openai'
+import path from 'path'
+import { getUserHome } from './utils'
+import fs from 'fs'
+import { MessageContent } from '@langchain/core/messages'
+
+export const processSpeechToText = async (nodeData: INodeData, input: string, options: ICommonObject) => {
+    const MODEL_NAME = 'whisper-1'
+
+    const model = nodeData.inputs?.model as BaseChatModel
+    if (model instanceof ChatOpenAI && (model as any).multiModal) {
+        const multiModalConfig = (model as any).multiModal
+        if (options?.uploads) {
+            if (options.uploads.length === 1 && input.length === 0 && options.uploads[0].mime === 'audio/webm') {
+                const upload = options.uploads[0]
+                // special case: the text input is empty, but there is an upload (recorded audio)
+                if (multiModalConfig.allowSpeechToText) {
+                    const openAIClientOptions: ClientOptions = {
+                        apiKey: model.openAIApiKey,
+                        organization: model.organization
+                    }
+                    const openAIClient = new OpenAIClient(openAIClientOptions)
+                    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
+
+                    // the recorded audio is stored on the server; stream it from disk for the Whisper request
+                    const audio_file = fs.createReadStream(filePath)
+
+                    if (multiModalConfig.speechToTextMode === 'transcriptions') {
+                        const transcription = await openAIClient.audio.transcriptions.create({
+                            file: audio_file,
+                            model: MODEL_NAME
+                        })
+                        return transcription.text
+                    } else if (multiModalConfig.speechToTextMode === 'translations') {
+                        const translation = await openAIClient.audio.translations.create({
+                            file: audio_file,
+                            model: MODEL_NAME
+                        })
+                        return translation.text
+                    }
+                } else {
+                    throw new Error('Speech to Text is not enabled, but a recorded audio file was uploaded. Please fix the chain.')
+                }
+            }
+        }
+    }
+    return input
+}
+
+export const addImagesToMessages = (nodeData: INodeData, options: ICommonObject): MessageContent => {
+    const imageContent: MessageContent = []
+    const model = nodeData.inputs?.model as BaseChatModel
+    if (model instanceof ChatOpenAI && (model as any).multiModal) {
+        if (options?.uploads && options?.uploads.length > 0) {
+            const imageUploads = getImageUploads(options.uploads)
+            for (const upload of imageUploads) {
+                let bf = upload.data
+                if (upload.type === 'stored-file') {
+                    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
+
+                    // the image is stored on the server; read the file and convert it to a base64 data URL
+                    const contents = fs.readFileSync(filePath)
+                    bf = 'data:' + upload.mime + ';base64,' + contents.toString('base64')
+                }
+                imageContent.push({
+                    type: 'image_url',
+                    image_url: {
+                        url: bf,
+                        detail: 'low' // TODO: honour the model's imageResolution setting instead of hardcoding 'low'
+                    }
+                })
+            }
+        }
+    }
+    return imageContent
+}
+
+export const getAudioUploads = (uploads: any[]) => {
+    return uploads.filter((upload: any) => upload.mime.startsWith('audio/'))
+}
+
+export const getImageUploads = (uploads: any[]) => {
+    return uploads.filter((upload: any) => upload.mime.startsWith('image/'))
+}
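The upload objects these helpers receive are left untyped in the diff (`uploads: any[]`). Based on the fields the code reads (`type`, `name`, `mime`, `data`), here is a sketch of how the filters behave at runtime; the sample entries are invented.

```ts
import { getAudioUploads, getImageUploads } from './MultiModalUtils'

// Entries shaped the way processSpeechToText / addImagesToMessages expect them:
// `data` holds the storage folder (the chat id) and `name` the stored file name
const uploads = [
    { type: 'stored-file', name: 'photo.png', mime: 'image/png', data: 'chatId-123' },
    { type: 'stored-file', name: 'recording.webm', mime: 'audio/webm', data: 'chatId-123' }
]

console.log(getImageUploads(uploads).map((u) => u.name)) // ['photo.png']
console.log(getAudioUploads(uploads).map((u) => u.name)) // ['recording.webm']
```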
diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts
index 7f1b94141..da1057a9d 100644
--- a/packages/server/src/index.ts
+++ b/packages/server/src/index.ts
@@ -467,40 +467,45 @@ export class App {
             })
             if (!chatflow) return res.status(404).send(`Chatflow ${req.params.id} not found`)

-            const uploadAllowedNodes = ['OpenAIMultiModalChain', 'OpenAIWhisper']
+            const uploadAllowedCategoryNodes = ['Chat Models']
             try {
                 const flowObj = JSON.parse(chatflow.flowData)
-                let isUploadAllowed = false
                 const allowances: IUploadFileSizeAndTypes[] = []
-
+                let allowSpeechToText = false
+                let allowImageUploads = false
+                let allowAudioUploads = false
                 flowObj.nodes.forEach((node: IReactFlowNode) => {
-                    if (uploadAllowedNodes.indexOf(node.data.type) > -1) {
+                    if (uploadAllowedCategoryNodes.indexOf(node.data.category) > -1) {
                         logger.debug(`[server]: Found Eligible Node ${node.data.type}, Allowing Uploads.`)
-                        isUploadAllowed = true
-
-                        const allowance: IUploadFileSizeAndTypes = {
-                            fileTypes: [],
-                            maxUploadSize: 0
-                        }
+                        // multiple nodes could allow uploads, so only add each allowance once
+                        // TODO: the maxUploadSize is hardcoded to 5MB for now; it should become a node property
                         node.data.inputParams.map((param: INodeParams) => {
-                            if (param.name === 'allowedUploadTypes') {
-                                allowance.fileTypes = (param.default as string).split(';')
+                            if (param.name === 'allowImageUploads' && node.data.inputs?.['allowImageUploads'] && !allowImageUploads) {
+                                allowances.push({
+                                    fileTypes: 'image/gif;image/jpeg;image/png;image/webp'.split(';'),
+                                    maxUploadSize: 5
+                                })
+                                allowImageUploads = true
                             }
-                            if (param.name === 'maxUploadSize') {
-                                allowance.maxUploadSize = parseInt(param.default ? (param.default as string) : '0')
+                            if (param.name === 'allowAudioUploads' && node.data.inputs?.['allowAudioUploads'] && !allowAudioUploads) {
+                                allowances.push({
+                                    fileTypes: 'audio/mpeg;audio/x-wav;audio/mp4'.split(';'),
+                                    maxUploadSize: 5
+                                })
+                                allowAudioUploads = true
+                            }
+                            if (param.name === 'allowSpeechToText' && node.data.inputs?.['allowSpeechToText']) {
+                                allowSpeechToText = true
                             }
                         })
-
-                        if (allowance.fileTypes && allowance.maxUploadSize) {
-                            allowances.push(allowance)
-                        }
                     }
                 })

                 return res.json({
-                    isUploadAllowed,
+                    allowSpeechToText,
+                    isUploadAllowed: allowances.length > 0,
                     uploadFileSizeAndTypes: allowances
                 })
             } catch (e) {
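Put together, the endpoint now responds with a shape like the following. The `UploadsConfigResponse` interface is illustrative only; the diff types the allowances as `IUploadFileSizeAndTypes[]`.

```ts
interface UploadsConfigResponse {
    allowSpeechToText: boolean
    isUploadAllowed: boolean
    uploadFileSizeAndTypes: { fileTypes: string[]; maxUploadSize: number }[]
}

// Example payload for a flow containing one ChatOpenAI node with
// image uploads and speech to text enabled
const example: UploadsConfigResponse = {
    allowSpeechToText: true,
    isUploadAllowed: true,
    uploadFileSizeAndTypes: [{ fileTypes: ['image/gif', 'image/jpeg', 'image/png', 'image/webp'], maxUploadSize: 5 }]
}
```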
diff --git a/packages/ui/src/views/chatmessage/ChatMessage.js b/packages/ui/src/views/chatmessage/ChatMessage.js
index 82b17ded6..155b3e990 100644
--- a/packages/ui/src/views/chatmessage/ChatMessage.js
+++ b/packages/ui/src/views/chatmessage/ChatMessage.js
@@ -74,6 +74,7 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
     ])
     const [socketIOClientId, setSocketIOClientId] = useState('')
     const [isChatFlowAvailableToStream, setIsChatFlowAvailableToStream] = useState(false)
+    const [isChatFlowAvailableForSpeech, setIsChatFlowAvailableForSpeech] = useState(false)
     const [sourceDialogOpen, setSourceDialogOpen] = useState(false)
     const [sourceDialogProps, setSourceDialogProps] = useState({})
     const [chatId, setChatId] = useState(undefined)
@@ -513,6 +514,7 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
     useEffect(() => {
         if (getAllowChatFlowUploads.data) {
             setIsChatFlowAvailableForUploads(getAllowChatFlowUploads.data?.isUploadAllowed ?? false)
+            setIsChatFlowAvailableForSpeech(getAllowChatFlowUploads.data?.allowSpeechToText ?? false)
         }
         // eslint-disable-next-line react-hooks/exhaustive-deps
     }, [getAllowChatFlowUploads.data])
@@ -922,7 +924,7 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
                                     }
                                     endAdornment={
                                         <>
-                                            {isChatFlowAvailableForUploads && (
+                                            {isChatFlowAvailableForSpeech && (
                                                 <IconButton onClick={() => onMicrophonePressed()}>
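On the client, the two flags now gate different controls: uploads keep gating file attachments, while the new flag gates only the microphone button. A simplified sketch of the wiring, assuming the response shape shown earlier (the real component reads these values from `getAllowChatFlowUploads.data` via React state):

```ts
// Hypothetical response from the uploads config endpoint
const data = {
    isUploadAllowed: true,
    allowSpeechToText: true,
    uploadFileSizeAndTypes: [{ fileTypes: ['image/png'], maxUploadSize: 5 }]
}

const isChatFlowAvailableForUploads = data.isUploadAllowed ?? false // gates drag and drop / attachments
const isChatFlowAvailableForSpeech = data.allowSpeechToText ?? false // gates the microphone button

// In JSX, only render the microphone control when speech to text is configured:
// {isChatFlowAvailableForSpeech && <IconButton onClick={() => onMicrophonePressed()}>...</IconButton>}
```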