GPT Vision: Added multi-modal capabilities to ChatOpenAI and ConversationChain.

vinodkiran 2024-01-19 18:02:05 +05:30
parent f87d84997c
commit e774bd3c12
5 changed files with 229 additions and 29 deletions
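For orientation (not part of the commit): the changes below ultimately build OpenAI vision-style chat messages, where a HumanMessage carries an array of content blocks instead of a plain string. A minimal sketch of that shape, assuming a vision-capable model; the model name, prompt text, and data URL are illustrative placeholders:

// Sketch only: the message shape the changes below produce for GPT-4 Vision.
// 'gpt-4-vision-preview', the prompt text, and the base64 data URL are placeholders.
import { ChatOpenAI } from 'langchain/chat_models/openai'
import { HumanMessage } from 'langchain/schema'

const model = new ChatOpenAI({ modelName: 'gpt-4-vision-preview', maxTokens: 1024 })

const message = new HumanMessage({
    content: [
        { type: 'text', text: 'What is shown in this image?' },
        {
            type: 'image_url',
            image_url: {
                // either a public URL or a data URL such as data:image/png;base64,...
                url: 'data:image/png;base64,<BASE64_IMAGE>',
                // corresponds to the new Image Resolution option added to ChatOpenAI below
                detail: 'low'
            }
        }
    ]
})

// const response = await model.invoke([message])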

View File

@@ -1,4 +1,4 @@
-import { FlowiseMemory, ICommonObject, IMessage, INode, INodeData, INodeParams } from '../../../src/Interface'
+import { FlowiseMemory, ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
 import { ConversationChain } from 'langchain/chains'
 import { getBaseClasses } from '../../../src/utils'
 import { ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder, SystemMessagePromptTemplate } from 'langchain/prompts'
@@ -8,6 +8,8 @@ import { flatten } from 'lodash'
 import { Document } from 'langchain/document'
 import { RunnableSequence } from 'langchain/schema/runnable'
 import { StringOutputParser } from 'langchain/schema/output_parser'
+import { addImagesToMessages, processSpeechToText } from '../../../src/MultiModalUtils'
+import { HumanMessage } from 'langchain/schema'
 
 let systemMessage = `The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.`
 const inputKey = 'input'
@@ -67,13 +69,15 @@ class ConversationChain_Chains implements INode {
     }
 
     async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
-        const chain = prepareChain(nodeData, this.sessionId, options.chatHistory)
+        const chain = prepareChain(nodeData, options, this.sessionId)
         return chain
     }
 
     async run(nodeData: INodeData, input: string, options: ICommonObject): Promise<string> {
         const memory = nodeData.inputs?.memory
-        const chain = prepareChain(nodeData, this.sessionId, options.chatHistory)
+        input = await processSpeechToText(nodeData, input, options)
+        const chain = prepareChain(nodeData, options, this.sessionId)
 
         const loggerHandler = new ConsoleCallbackHandler(options.logger)
         const callbacks = await additionalCallbacks(nodeData, options)
@@ -105,7 +109,7 @@ class ConversationChain_Chains implements INode {
     }
 }
 
-const prepareChatPrompt = (nodeData: INodeData) => {
+const prepareChatPrompt = (nodeData: INodeData, options: ICommonObject) => {
     const memory = nodeData.inputs?.memory as FlowiseMemory
     const prompt = nodeData.inputs?.systemMessagePrompt as string
     const docs = nodeData.inputs?.document as Document[]
@@ -128,16 +132,31 @@ const prepareChatPrompt = (nodeData: INodeData) => {
     if (finalText) systemMessage = `${systemMessage}\nThe AI has the following context:\n${finalText}`
 
-    const chatPrompt = ChatPromptTemplate.fromMessages([
+    // TODO: add audio uploads
+    // if (options.uploads.length > 0) {
+    //     const audioUploads = getAudioUploads(options.uploads)
+    //     for (const upload of audioUploads) {
+    //         await this.processAudioWithWhisper(upload, chatMessages)
+    //     }
+    // }
+
+    const imageContent = addImagesToMessages(nodeData, options)
+
+    //TODO, this should not be any[], what interface should it be?
+    let promptMessages: any[] = [
         SystemMessagePromptTemplate.fromTemplate(prompt ? `${prompt}\n${systemMessage}` : systemMessage),
         new MessagesPlaceholder(memory.memoryKey ?? 'chat_history'),
         HumanMessagePromptTemplate.fromTemplate(`{${inputKey}}`)
-    ])
+    ]
+
+    if (imageContent.length > 0) {
+        promptMessages.push(new HumanMessage({ content: imageContent }))
+    }
+
+    const chatPrompt = ChatPromptTemplate.fromMessages(promptMessages)
     return chatPrompt
 }
 
-const prepareChain = (nodeData: INodeData, sessionId?: string, chatHistory: IMessage[] = []) => {
+const prepareChain = (nodeData: INodeData, options: ICommonObject, sessionId?: string) => {
+    const chatHistory = options.chatHistory
     const model = nodeData.inputs?.model as BaseChatModel
     const memory = nodeData.inputs?.memory as FlowiseMemory
     const memoryKey = memory.memoryKey ?? 'chat_history'
@@ -150,7 +169,7 @@ const prepareChain = (nodeData: INodeData, sessionId?: string, chatHistory: IMes
                 return history
             }
         },
-        prepareChatPrompt(nodeData),
+        prepareChatPrompt(nodeData, options),
         model,
         new StringOutputParser()
     ])

View File

@@ -19,7 +19,7 @@
     constructor() {
         this.label = 'ChatOpenAI'
         this.name = 'chatOpenAI'
-        this.version = 2.0
+        this.version = 3.0
         this.type = 'ChatOpenAI'
         this.icon = 'openai.svg'
         this.category = 'Chat Models'
@@ -152,6 +152,73 @@
                 type: 'json',
                 optional: true,
                 additionalParams: true
+            },
+            {
+                label: 'Allow Image Uploads',
+                name: 'allowImageUploads',
+                type: 'boolean',
+                default: false,
+                optional: true
+            },
+            {
+                label: 'Allow Audio Uploads',
+                name: 'allowAudioUploads',
+                type: 'boolean',
+                default: false,
+                optional: true
+            },
+            {
+                label: 'Allow Speech to Text',
+                name: 'allowSpeechToText',
+                type: 'boolean',
+                default: false,
+                optional: true
+            },
+            // TODO: only show when speechToText is true
+            {
+                label: 'Speech to Text Method',
+                description: 'How to turn audio into text',
+                name: 'speechToTextMode',
+                type: 'options',
+                options: [
+                    {
+                        label: 'Transcriptions',
+                        name: 'transcriptions',
+                        description:
+                            'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.'
+                    },
+                    {
+                        label: 'Translations',
+                        name: 'translations',
+                        description: 'Translate and transcribe the audio into english.'
+                    }
+                ],
+                optional: false,
+                default: 'transcriptions',
+                additionalParams: true
+            },
+            {
+                label: 'Image Resolution',
+                description: 'This parameter controls the resolution in which the model views the image.',
+                name: 'imageResolution',
+                type: 'options',
+                options: [
+                    {
+                        label: 'Low',
+                        name: 'low'
+                    },
+                    {
+                        label: 'High',
+                        name: 'high'
+                    },
+                    {
+                        label: 'Auto',
+                        name: 'auto'
+                    }
+                ],
+                default: 'low',
+                optional: false,
+                additionalParams: true
             }
         ]
     }
@@ -168,6 +235,12 @@
         const basePath = nodeData.inputs?.basepath as string
         const baseOptions = nodeData.inputs?.baseOptions
 
+        const allowImageUploads = nodeData.inputs?.allowImageUploads as boolean
+        const allowAudioUploads = nodeData.inputs?.allowAudioUploads as boolean
+        const allowSpeechToText = nodeData.inputs?.allowSpeechToText as boolean
+        const speechToTextMode = nodeData.inputs?.speechToTextMode as string
+        const imageResolution = nodeData.inputs?.imageResolution as string
+
         const credentialData = await getCredentialData(nodeData.credential ?? '', options)
         const openAIApiKey = getCredentialParam('openAIApiKey', credentialData, nodeData)
@@ -200,6 +273,20 @@
             basePath,
             baseOptions: parsedBaseOptions
         })
+        const multiModal = {
+            allowImageUploads: allowImageUploads ?? false,
+            allowAudioUploads: allowAudioUploads ?? false,
+            allowSpeechToText: allowSpeechToText ?? false,
+            imageResolution,
+            speechToTextMode
+        }
+
+        Object.defineProperty(model, 'multiModal', {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: multiModal
+        })
         return model
     }
 }
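The multiModal settings are attached to the model instance as an untyped property; the helpers in MultiModalUtils (next file) read them back via `(model as any).multiModal`. A possible interface for that object, purely illustrative since the commit does not define one:

// Hypothetical shape for the config attached above; the commit itself leaves it untyped.
interface MultiModalOptions {
    allowImageUploads: boolean
    allowAudioUploads: boolean
    allowSpeechToText: boolean
    imageResolution: 'low' | 'high' | 'auto'
    speechToTextMode: 'transcriptions' | 'translations'
}

// Downstream helpers then read it back off the model instance, e.g.:
// const multiModalConfig = (model as any).multiModal as MultiModalOptions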

View File

@@ -0,0 +1,87 @@
+import { ICommonObject, INodeData } from './Interface'
+import { BaseChatModel } from 'langchain/chat_models/base'
+import { type ClientOptions, OpenAIClient } from '@langchain/openai'
+import { ChatOpenAI } from 'langchain/chat_models/openai'
+import path from 'path'
+import { getUserHome } from './utils'
+import fs from 'fs'
+import { MessageContent } from '@langchain/core/dist/messages'
+
+export const processSpeechToText = async (nodeData: INodeData, input: string, options: ICommonObject) => {
+    const MODEL_NAME = 'whisper-1'
+    let model = nodeData.inputs?.model as BaseChatModel
+    if (model instanceof ChatOpenAI && (model as any).multiModal) {
+        const multiModalConfig = (model as any).multiModal
+        if (options?.uploads) {
+            if (options.uploads.length === 1 && input.length === 0 && options.uploads[0].mime === 'audio/webm') {
+                const upload = options.uploads[0]
+                // special case: text input is empty, but we have an upload (recorded audio)
+                if (multiModalConfig.allowSpeechToText) {
+                    const openAIClientOptions: ClientOptions = {
+                        apiKey: model.openAIApiKey,
+                        organization: model.organization
+                    }
+                    const openAIClient = new OpenAIClient(openAIClientOptions)
+                    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
+                    // the audio is stored on the server, so stream the file from disk for the Whisper API call
+                    const audio_file = fs.createReadStream(filePath)
+                    if (multiModalConfig.speechToTextMode === 'transcriptions') {
+                        const transcription = await openAIClient.audio.transcriptions.create({
+                            file: audio_file,
+                            model: MODEL_NAME
+                        })
+                        return transcription.text
+                    } else if (multiModalConfig.speechToTextMode === 'translations') {
+                        const translation = await openAIClient.audio.translations.create({
+                            file: audio_file,
+                            model: MODEL_NAME
+                        })
+                        return translation.text
+                    }
+                } else {
+                    throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
+                }
+            }
+        }
+    }
+    return input
+}
+
+export const addImagesToMessages = (nodeData: INodeData, options: ICommonObject): MessageContent => {
+    const imageContent: MessageContent = []
+    let model = nodeData.inputs?.model as BaseChatModel
+    if (model instanceof ChatOpenAI && (model as any).multiModal) {
+        if (options?.uploads && options?.uploads.length > 0) {
+            const imageUploads = getImageUploads(options.uploads)
+            for (const upload of imageUploads) {
+                let bf = upload.data
+                if (upload.type == 'stored-file') {
+                    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
+                    // as the image is stored on the server, read the file and convert it to base64
+                    const contents = fs.readFileSync(filePath)
+                    bf = 'data:' + upload.mime + ';base64,' + contents.toString('base64')
+                }
+                imageContent.push({
+                    type: 'image_url',
+                    image_url: {
+                        url: bf,
+                        detail: 'low'
+                    }
+                })
+            }
+        }
+    }
+    return imageContent
+}
+
+export const getAudioUploads = (uploads: any[]) => {
+    return uploads.filter((url: any) => url.mime.startsWith('audio/'))
+}
+
+export const getImageUploads = (uploads: any[]) => {
+    return uploads.filter((url: any) => url.mime.startsWith('image/'))
+}
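A sketch of the upload records these helpers expect, inferred from the field accesses above (`type`, `data`, `name`, `mime`); the values are invented for illustration:

// Illustrative only: field names come from the code above, values are made up.
const uploads = [
    {
        type: 'stored-file', // addImagesToMessages reads the file from ~/.flowise/gptvision/<data>/<name>
        data: 'chat-abc123', // directory segment on disk when type is 'stored-file'
        name: 'diagram.png',
        mime: 'image/png' // picked up by getImageUploads()
    },
    {
        type: 'stored-file',
        data: 'chat-abc123',
        name: 'question.webm',
        mime: 'audio/webm' // triggers processSpeechToText when it is the only upload and the text input is empty
    }
]

// getAudioUploads(uploads) and getImageUploads(uploads) simply filter on the mime prefix.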

View File

@@ -467,40 +467,45 @@ export class App {
             })
             if (!chatflow) return res.status(404).send(`Chatflow ${req.params.id} not found`)
-            const uploadAllowedNodes = ['OpenAIMultiModalChain', 'OpenAIWhisper']
+            const uploadAllowedCategoryNodes = ['Chat Models']
             try {
                 const flowObj = JSON.parse(chatflow.flowData)
-                let isUploadAllowed = false
                 const allowances: IUploadFileSizeAndTypes[] = []
+                let allowSpeechToText = false
+                let allowImageUploads = false
+                let allowAudioUploads = false
                 flowObj.nodes.forEach((node: IReactFlowNode) => {
-                    if (uploadAllowedNodes.indexOf(node.data.type) > -1) {
+                    if (uploadAllowedCategoryNodes.indexOf(node.data.category) > -1) {
                         logger.debug(`[server]: Found Eligible Node ${node.data.type}, Allowing Uploads.`)
-                        isUploadAllowed = true
-                        const allowance: IUploadFileSizeAndTypes = {
-                            fileTypes: [],
-                            maxUploadSize: 0
-                        }
+                        // there could be multiple components allowing uploads, so we check if it's already added
+                        // TODO: for now the maxUploadSize is hardcoded to 5MB, we need to add it to the node properties
                        node.data.inputParams.map((param: INodeParams) => {
-                            if (param.name === 'allowedUploadTypes') {
-                                allowance.fileTypes = (param.default as string).split(';')
+                            if (param.name === 'allowImageUploads' && node.data.inputs?.['allowImageUploads'] && !allowImageUploads) {
+                                allowances.push({
+                                    fileTypes: 'image/gif;image/jpeg;image/png;image/webp'.split(';'),
+                                    maxUploadSize: 5
+                                })
+                                allowImageUploads = true
                             }
-                            if (param.name === 'maxUploadSize') {
-                                allowance.maxUploadSize = parseInt(param.default ? (param.default as string) : '0')
-                            }
+                            if (param.name === 'allowAudioUploads' && node.data.inputs?.['allowAudioUploads'] && !allowAudioUploads) {
+                                allowances.push({
+                                    fileTypes: 'audio/mpeg;audio/x-wav;audio/mp4'.split(';'),
+                                    maxUploadSize: 5
+                                })
+                                allowAudioUploads = true
+                            }
+                            if (param.name === 'allowSpeechToText' && node.data.inputs?.['allowSpeechToText']) {
+                                allowSpeechToText = true
+                            }
                         })
-                        if (allowance.fileTypes && allowance.maxUploadSize) {
-                            allowances.push(allowance)
-                        }
                     }
                 })
                 return res.json({
-                    isUploadAllowed,
+                    allowSpeechToText: allowSpeechToText,
+                    isUploadAllowed: allowances.length > 0,
                     uploadFileSizeAndTypes: allowances
                 })
             } catch (e) {
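For reference (not part of the diff): with a chat-model node that has image uploads, audio uploads, and speech-to-text all enabled, the handler above would respond roughly as follows; values are illustrative, the shape comes from the code:

// Sketch of the JSON response from the uploads endpoint after this change.
const exampleResponse = {
    allowSpeechToText: true,
    isUploadAllowed: true, // allowances.length > 0
    uploadFileSizeAndTypes: [
        { fileTypes: ['image/gif', 'image/jpeg', 'image/png', 'image/webp'], maxUploadSize: 5 },
        { fileTypes: ['audio/mpeg', 'audio/x-wav', 'audio/mp4'], maxUploadSize: 5 }
    ]
}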

View File

@@ -74,6 +74,7 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
     ])
     const [socketIOClientId, setSocketIOClientId] = useState('')
     const [isChatFlowAvailableToStream, setIsChatFlowAvailableToStream] = useState(false)
+    const [isChatFlowAvailableForSpeech, setIsChatFlowAvailableForSpeech] = useState(false)
     const [sourceDialogOpen, setSourceDialogOpen] = useState(false)
     const [sourceDialogProps, setSourceDialogProps] = useState({})
     const [chatId, setChatId] = useState(undefined)
@@ -513,6 +514,7 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
     useEffect(() => {
         if (getAllowChatFlowUploads.data) {
             setIsChatFlowAvailableForUploads(getAllowChatFlowUploads.data?.isUploadAllowed ?? false)
+            setIsChatFlowAvailableForSpeech(getAllowChatFlowUploads.data?.allowSpeechToText ?? false)
         }
         // eslint-disable-next-line react-hooks/exhaustive-deps
     }, [getAllowChatFlowUploads.data])
@@ -922,7 +924,7 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
                        }
                        endAdornment={
                            <>
-                                {isChatFlowAvailableForUploads && (
+                                {isChatFlowAvailableForSpeech && (
                                    <InputAdornment position='end'>
                                        <IconButton
                                            onClick={() => onMicrophonePressed()}