GPT Vision: Updated behaviour to submit a voice recording directly when recording stops, without requiring a second submit step.
Commit 9222aafc6f (parent 188311187a)
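
At a high level the change has two halves: the chat UI now submits as soon as a recording stops (instead of waiting for a second click), and the server-side chain transcribes a lone audio upload into the text input rather than forwarding the audio file to the model. A minimal sketch of the server-side half, assuming hypothetical transcribe/runChain helpers in place of the real processAudioWithWisper and chain call:

// Illustrative sketch only; Upload, transcribe and runChain are hypothetical
// stand-ins for the Flowise types and helpers touched by this commit.
interface Upload { data: string; name: string; mime: string; type: string }
declare function transcribe(upload: Upload): Promise<string>
declare function runChain(input: string, uploads: Upload[]): Promise<string>

async function predict(input: string, uploads: Upload[], speechToText: boolean): Promise<string> {
    if (speechToText && uploads.length === 1 && input.length === 0) {
        // a recorded clip with no typed text: use its transcription as the prompt
        input = await transcribe(uploads[0])
        uploads = [] // do not send the audio file itself to the model
    }
    return runChain(input, uploads)
}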
@@ -1,15 +1,9 @@
-import {
-    ICommonObject,
-    INode,
-    INodeData,
-    INodeOutputsValue,
-    INodeParams
-} from "../../../src/Interface";
+import { ICommonObject, INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface'
 import { getBaseClasses, getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils'
-import { OpenAIMultiModalChainInput, VLLMChain } from "./VLLMChain";
+import { OpenAIMultiModalChainInput, VLLMChain } from './VLLMChain'
 import { ConsoleCallbackHandler, CustomChainHandler, additionalCallbacks } from '../../../src/handler'
 import { formatResponse } from '../../outputparsers/OutputParserHelpers'
-import { checkInputs, Moderation, streamResponse } from "../../moderation/Moderation";
+import { checkInputs, Moderation, streamResponse } from '../../moderation/Moderation'
 
 class OpenAIMultiModalChain_Chains implements INode {
     label: string
@@ -72,7 +66,7 @@ class OpenAIMultiModalChain_Chains implements INode {
                 label: 'Speech to Text',
                 name: 'speechToText',
                 type: 'boolean',
-                optional: true,
+                optional: true
             },
             // TODO: only show when speechToText is true
             {
@@ -84,7 +78,8 @@ class OpenAIMultiModalChain_Chains implements INode {
                     {
                         label: 'Transcriptions',
                         name: 'transcriptions',
-                        description: 'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.'
+                        description:
+                            'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.'
                     },
                     {
                         label: 'Translations',
@@ -186,7 +181,6 @@ class OpenAIMultiModalChain_Chains implements INode {
         const topP = nodeData.inputs?.topP as string
         const speechToText = nodeData.inputs?.speechToText as boolean
 
-
         const fields: OpenAIMultiModalChainInput = {
             openAIApiKey: openAIApiKey,
             imageResolution: imageResolution,
@@ -256,6 +250,22 @@ const runPrediction = async (
     const socketIO = isStreaming ? options.socketIO : undefined
     const socketIOClientId = isStreaming ? options.socketIOClientId : ''
     const moderations = nodeData.inputs?.inputModeration as Moderation[]
+    const speechToText = nodeData.inputs?.speechToText as boolean
+
+    if (options?.uploads) {
+        if (options.uploads.length === 1 && input.length === 0) {
+            if (speechToText) {
+                // special case: the text input is empty, but we have an upload (recorded audio)
+                const convertedText = await chain.processAudioWithWisper(options.uploads[0], undefined)
+                // so we use the transcription as the input
+                input = convertedText
+            }
+            // do not send the audio file to the model
+        } else {
+            chain.uploads = options.uploads
+        }
+    }
+
     if (moderations && moderations.length > 0) {
         try {
             // Use the output of the moderation chain as input for the LLM chain
@@ -273,9 +283,6 @@ const runPrediction = async (
      * TO: { "value": "hello i am ben\n\n\thow are you?" }
      */
     const promptValues = handleEscapeCharacters(promptValuesRaw, true)
-    if (options?.uploads) {
-        chain.uploads = options.uploads
-    }
     if (promptValues && inputVariables.length > 0) {
         let seen: string[] = []
 
@@ -101,42 +101,20 @@ export class VLLMChain extends BaseChain implements OpenAIMultiModalChainInput {
         })
         if (this.speechToTextMode && this.uploads && this.uploads.length > 0) {
             const audioUploads = this.getAudioUploads(this.uploads)
-            for (const url of audioUploads) {
-                const filePath = path.join(getUserHome(), '.flowise', 'gptvision', url.data, url.name)
-
-                // as the image is stored in the server, read the file and convert it to base64
-                const audio_file = fs.createReadStream(filePath)
-                if (this.speechToTextMode.purpose === 'transcriptions') {
-                    const transcription = await this.client.audio.transcriptions.create({
-                        file: audio_file,
-                        model: 'whisper-1'
-                    })
-                    chatMessages.push({
-                        type: 'text',
-                        text: transcription.text
-                    })
-                } else if (this.speechToTextMode.purpose === 'translations') {
-                    const translation = await this.client.audio.translations.create({
-                        file: audio_file,
-                        model: 'whisper-1'
-                    })
-                    chatMessages.push({
-                        type: 'text',
-                        text: translation.text
-                    })
-                }
+            for (const upload of audioUploads) {
+                await this.processAudioWithWisper(upload, chatMessages)
             }
         }
         if (this.uploads && this.uploads.length > 0) {
             const imageUploads = this.getImageUploads(this.uploads)
-            for (const url of imageUploads) {
-                let bf = url.data
-                if (url.type == 'stored-file') {
-                    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', url.data, url.name)
+            for (const upload of imageUploads) {
+                let bf = upload.data
+                if (upload.type == 'stored-file') {
+                    const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
 
                     // as the image is stored in the server, read the file and convert it to base64
                     const contents = fs.readFileSync(filePath)
-                    bf = 'data:' + url.mime + ';base64,' + contents.toString('base64')
+                    bf = 'data:' + upload.mime + ';base64,' + contents.toString('base64')
                 }
                 chatMessages.push({
                     type: 'image_url',
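
The two loops above rely on splitting this.uploads by MIME type; getAudioUploads appears as context at the end of the next hunk. A small self-contained illustration of that split (the sample values are made up, and getImageUploads filtering on the 'image/' prefix is an assumption, not part of this diff):

interface Upload { data: string; name: string; mime: string; type: string }

// Mirrors VLLMChain.getAudioUploads; the image variant is assumed to filter the same way.
const getAudioUploads = (uploads: Upload[]) => uploads.filter((u) => u.mime.startsWith('audio/'))
const getImageUploads = (uploads: Upload[]) => uploads.filter((u) => u.mime.startsWith('image/'))

const uploads: Upload[] = [
    { data: 'chatId', name: 'recording.webm', mime: 'audio/webm', type: 'stored-file' },
    { data: 'chatId', name: 'photo.png', mime: 'image/png', type: 'stored-file' }
]
console.log(getAudioUploads(uploads).map((u) => u.name)) // [ 'recording.webm' ]
console.log(getImageUploads(uploads).map((u) => u.name)) // [ 'photo.png' ]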
@@ -182,6 +160,40 @@ export class VLLMChain extends BaseChain implements OpenAIMultiModalChainInput {
         }
     }
 
+    public async processAudioWithWisper(upload: IFileUpload, chatMessages: ChatCompletionContentPart[] | undefined): Promise<string> {
+        const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name)
+
+        // the audio file is stored on the server, so read it from disk
+        const audio_file = fs.createReadStream(filePath)
+        if (this.speechToTextMode === 'transcriptions') {
+            const transcription = await this.client.audio.transcriptions.create({
+                file: audio_file,
+                model: 'whisper-1'
+            })
+            if (chatMessages) {
+                chatMessages.push({
+                    type: 'text',
+                    text: transcription.text
+                })
+            }
+            return transcription.text
+        } else if (this.speechToTextMode === 'translations') {
+            const translation = await this.client.audio.translations.create({
+                file: audio_file,
+                model: 'whisper-1'
+            })
+            if (chatMessages) {
+                chatMessages.push({
+                    type: 'text',
+                    text: translation.text
+                })
+            }
+            return translation.text
+        }
+        // should never get here
+        return ''
+    }
+
     getAudioUploads = (urls: any[]) => {
         return urls.filter((url: any) => url.mime.startsWith('audio/'))
     }
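
For reference, processAudioWithWisper is a thin wrapper around the two OpenAI audio endpoints. The calls below are the same ones used in the method, shown standalone with the openai Node SDK (the file path and the helper name speechToText are illustrative):

import fs from 'fs'
import OpenAI from 'openai'

const client = new OpenAI() // reads OPENAI_API_KEY from the environment

async function speechToText(filePath: string, mode: 'transcriptions' | 'translations'): Promise<string> {
    const audioFile = fs.createReadStream(filePath)
    if (mode === 'transcriptions') {
        // transcribe in whatever language is spoken in the recording
        const transcription = await client.audio.transcriptions.create({ file: audioFile, model: 'whisper-1' })
        return transcription.text
    }
    // translate the speech into English text
    const translation = await client.audio.translations.create({ file: audioFile, model: 'whisper-1' })
    return translation.text
}

// e.g. speechToText('/tmp/recording.webm', 'transcriptions').then(console.log)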
@@ -304,10 +304,11 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
         setRecordingNotSupported(false)
     }
 
-    const onRecordingStopped = () => {
+    const onRecordingStopped = async () => {
         stopAudioRecording(addRecordingToPreviews)
         setIsRecording(false)
         setRecordingNotSupported(false)
+        handlePromptClick('')
     }
 
     const onSourceDialogClick = (data, title) => {
@@ -366,8 +367,10 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
         if (e) e.preventDefault()
 
         if (!promptStarterInput && userInput.trim() === '') {
+            if (!(previews.length === 1 && previews[0].type === 'audio')) {
                 return
             }
+        }
 
         let input = userInput
 
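
The guard added above is what lets the recording-only submission through when onRecordingStopped calls handlePromptClick('') with an empty text field. The same condition, pulled out as a standalone predicate for clarity (canSubmit and the Preview shape are illustrative, not part of the Flowise code):

interface Preview { type: string }

function canSubmit(userInput: string, previews: Preview[]): boolean {
    if (userInput.trim() !== '') return true
    // an empty text field is only acceptable when the sole attachment is the recorded audio clip
    return previews.length === 1 && previews[0].type === 'audio'
}

console.log(canSubmit('', [{ type: 'audio' }])) // true: a recording alone can be sent
console.log(canSubmit('', []))                  // false: nothing to send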
@@ -626,7 +629,7 @@ export const ChatMessage = ({ open, chatflowid, isDialog }) => {
                     </div>
                     <i className='stop-recording-button'>
                         <Button variant='outlined' color='primary' onClick={onRecordingStopped}>
-                            Save
+                            Send
                         </Button>
                     </i>
                 </div>