Add option to auto-play TTS audio after prediction completes

This commit is contained in:
Ilango Rajagopal 2025-08-14 18:06:57 +05:30
parent 1902701e64
commit ef1b0dc856
8 changed files with 239 additions and 4 deletions

View File

@ -441,6 +441,7 @@ export interface IServerSideEventStreamer {
streamAbortEvent(chatId: string): void
streamEndEvent(chatId: string): void
streamUsageMetadataEvent(chatId: string, data: any): void
streamAudioEvent(chatId: string, audioData: string): void
}
export enum FollowUpPromptProvider {

View File

@ -64,6 +64,7 @@ export interface IChatFlow {
apikeyid?: string
analytic?: string
speechToText?: string
textToSpeech?: string
chatbotConfig?: string
followUpPrompts?: string
apiConfig?: string

View File

@ -393,6 +393,21 @@ export class RedisEventPublisher implements IServerSideEventStreamer {
}
}
streamAudioEvent(chatId: string, audioData: string): void {
    // Publish a base64-encoded audio payload on the chat's Redis channel.
    // Failures are logged and swallowed so streaming the reply is never interrupted.
    try {
        const payload = JSON.stringify({ chatId, eventType: 'audio', data: audioData })
        this.redisPublisher.publish(chatId, payload)
    } catch (error) {
        console.error('Error streaming audio event:', error)
    }
}
async disconnect() {
if (this.redisPublisher) {
await this.redisPublisher.quit()

View File

@ -257,4 +257,15 @@ export class SSEStreamer implements IServerSideEventStreamer {
client.response.write('message:\ndata:' + JSON.stringify(clientResponse) + '\n\n')
}
}
streamAudioEvent(chatId: string, audioData: string): void {
const client = this.clients[chatId]
if (client) {
const clientResponse = {
event: 'audio',
data: audioData
}
client.response.write('message:\ndata:' + JSON.stringify(clientResponse) + '\n\n')
}
}
}

View File

@ -11,7 +11,8 @@ import {
IMessage,
IServerSideEventStreamer,
convertChatHistoryToText,
generateFollowUpPrompts
generateFollowUpPrompts,
convertTextToSpeech
} from 'flowise-components'
import {
IncomingAgentflowInput,
@ -135,6 +136,59 @@ interface IExecuteNodeParams {
subscriptionId: string
}
// Returns true when the chatflow's text-to-speech configuration contains at
// least one provider that is both enabled (status === true) and configured to
// auto-play (autoPlay === true). Unparseable or empty config yields false.
const shouldAutoPlayTTS = (textToSpeechConfig: string | undefined | null): boolean => {
    if (!textToSpeechConfig) return false
    try {
        const parsed = typeof textToSpeechConfig === 'string' ? JSON.parse(textToSpeechConfig) : textToSpeechConfig
        return Object.values(parsed as Record<string, any>).some(
            (provider: any) => provider?.status === true && provider?.autoPlay === true
        )
    } catch (error) {
        // Malformed JSON: treat as "auto-play disabled" rather than failing the request
        return false
    }
}
// Converts the response text to speech using the first enabled provider found
// in the chatflow's TTS configuration. Returns the generated audio as a
// Buffer, or null when no provider is active, the config is missing/invalid,
// or conversion fails (errors are logged, never thrown to the caller).
const generateTTSForResponse = async (
    responseText: string,
    textToSpeechConfig: string | undefined,
    options: ICommonObject
): Promise<Buffer | null> => {
    try {
        if (!textToSpeechConfig) return null
        const config = typeof textToSpeechConfig === 'string' ? JSON.parse(textToSpeechConfig) : textToSpeechConfig
        // Pick the first provider entry whose status flag is enabled
        const activeEntry = Object.entries(config as Record<string, any>).find(
            ([, provider]: [string, any]) => provider?.status === true
        )
        if (!activeEntry) return null
        const [name, provider] = activeEntry
        return await convertTextToSpeech(
            responseText,
            { name, credentialId: provider.credentialId, voice: provider.voice, model: provider.model },
            options
        )
    } catch (error) {
        logger.error(`[server]: TTS generation failed: ${getErrorMessage(error)}`)
        return null
    }
}
interface IExecuteAgentFlowParams extends Omit<IExecuteFlowParams, 'incomingInput'> {
incomingInput: IncomingAgentflowInput
}
@ -2038,5 +2092,26 @@ export const executeAgentFlow = async ({
if (sessionId) result.sessionId = sessionId
/*** Auto-play TTS Logic ***/
if (shouldAutoPlayTTS(chatflow.textToSpeech) && result.text) {
const options = {
orgId,
chatflowid,
chatId,
appDataSource,
databaseEntities
}
const audioBuffer = await generateTTSForResponse(result.text, chatflow.textToSpeech, options)
if (audioBuffer) {
const audioBase64 = audioBuffer.toString('base64')
// Agent flows are always streamed, so send audio via SSE
if (sseStreamer) {
sseStreamer.streamAudioEvent(chatId, audioBase64)
}
}
}
return result
}

View File

@ -6,6 +6,7 @@ import { omit } from 'lodash'
import {
IFileUpload,
convertSpeechToText,
convertTextToSpeech,
ICommonObject,
addSingleFileToStorage,
generateFollowUpPrompts,
@ -70,9 +71,59 @@ import { executeAgentFlow } from './buildAgentflow'
import { Workspace } from '../enterprise/database/entities/workspace.entity'
import { Organization } from '../enterprise/database/entities/organization.entity'
/*
* Initialize the ending node to be executed
*/
// Returns true when the chatflow's text-to-speech configuration contains at
// least one provider that is both enabled (status === true) and configured to
// auto-play (autoPlay === true). Unparseable or empty config yields false.
const shouldAutoPlayTTS = (textToSpeechConfig: string | undefined | null): boolean => {
    if (!textToSpeechConfig) return false
    try {
        const parsed = typeof textToSpeechConfig === 'string' ? JSON.parse(textToSpeechConfig) : textToSpeechConfig
        return Object.values(parsed as Record<string, any>).some(
            (provider: any) => provider?.status === true && provider?.autoPlay === true
        )
    } catch (error) {
        // Malformed JSON: treat as "auto-play disabled" rather than failing the request
        return false
    }
}
// Converts the response text to speech using the first enabled provider found
// in the chatflow's TTS configuration. Returns the generated audio as a
// Buffer, or null when no provider is active, the config is missing/invalid,
// or conversion fails (errors are logged, never thrown to the caller).
const generateTTSForResponse = async (
    responseText: string,
    textToSpeechConfig: string | undefined,
    options: ICommonObject
): Promise<Buffer | null> => {
    try {
        if (!textToSpeechConfig) return null
        const config = typeof textToSpeechConfig === 'string' ? JSON.parse(textToSpeechConfig) : textToSpeechConfig
        // Pick the first provider entry whose status flag is enabled
        const activeEntry = Object.entries(config as Record<string, any>).find(
            ([, provider]: [string, any]) => provider?.status === true
        )
        if (!activeEntry) return null
        const [name, provider] = activeEntry
        return await convertTextToSpeech(
            responseText,
            { name, credentialId: provider.credentialId, voice: provider.voice, model: provider.model },
            options
        )
    } catch (error) {
        logger.error(`[server]: TTS generation failed: ${getErrorMessage(error)}`)
        return null
    }
}
const initEndingNode = async ({
endingNodeIds,
componentNodes,
@ -828,6 +879,29 @@ export const executeFlow = async ({
if (memoryType) result.memoryType = memoryType
if (Object.keys(setVariableNodesOutput).length) result.flowVariables = setVariableNodesOutput
if (shouldAutoPlayTTS(chatflow.textToSpeech) && result.text) {
logger.info('[server]: Generating TTS for response')
logger.info(`[server/executeFlow]: TTS config: ${JSON.stringify(chatflow.textToSpeech)}`)
const options = {
orgId,
chatflowid,
chatId,
appDataSource,
databaseEntities
}
const audioBuffer = await generateTTSForResponse(result.text, chatflow.textToSpeech, options)
if (audioBuffer) {
const audioBase64 = audioBuffer.toString('base64')
if (streaming && sseStreamer) {
sseStreamer.streamAudioEvent(chatId, audioBase64)
} else {
result.audioData = audioBase64
}
}
}
return result
}
}

View File

@ -455,6 +455,24 @@ const TextToSpeech = ({ dialogProps }) => {
)}
</Box>
))}
{/* Auto-play Toggle */}
<Box sx={{ p: 2 }}>
<div style={{ display: 'flex', flexDirection: 'row', alignItems: 'center' }}>
<Typography>
Automatically play audio
<TooltipWithParser
style={{ marginLeft: 10 }}
title='When enabled, bot responses will be automatically converted to speech and played'
/>
</Typography>
</div>
<SwitchInput
onChange={(newValue) => setValue(newValue, selectedProvider, 'autoPlay')}
value={textToSpeech[selectedProvider] ? textToSpeech[selectedProvider].autoPlay ?? false : false}
/>
</Box>
{/* Test TTS Button */}
<Box sx={{ p: 2 }}>
<StyledButton

View File

@ -955,6 +955,12 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
setLoading(false)
setUserInput('')
setUploadedFiles([])
// Handle auto-play audio for non-streaming responses
if (data.audioData) {
handleAutoPlayAudio(data.audioData)
}
setTimeout(() => {
inputRef.current?.focus()
scrollToBottom()
@ -1033,6 +1039,9 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
abortMessage(payload.data)
closeResponse()
break
case 'audio':
handleAutoPlayAudio(payload.data)
break
case 'end':
setLocalStorageChatflow(chatflowid, chatId)
closeResponse()
@ -1631,6 +1640,37 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
}
}
// Decodes a base64 audio payload (received via SSE or the prediction response)
// into a Blob and plays it immediately. On failure (e.g. the browser blocks
// autoplay), falls back to the manual per-message TTS flow and surfaces an
// error snackbar if that also fails.
const handleAutoPlayAudio = async (audioData) => {
    let audioUrl = null
    try {
        // Convert base64 audio data to a blob and play it
        const audioBuffer = Uint8Array.from(atob(audioData), (c) => c.charCodeAt(0))
        const audioBlob = new Blob([audioBuffer], { type: 'audio/mpeg' })
        audioUrl = URL.createObjectURL(audioBlob)
        const audio = new Audio(audioUrl)
        // Release the object URL once playback completes to free blob memory
        audio.addEventListener('ended', () => {
            URL.revokeObjectURL(audioUrl)
        })
        await audio.play()
    } catch (error) {
        console.error('Error playing auto TTS audio:', error)
        // Bug fix: if play() rejects, 'ended' never fires, so the object URL
        // created above must be revoked here or it leaks for the page lifetime.
        if (audioUrl) {
            URL.revokeObjectURL(audioUrl)
        }
        // Fallback: use the manual TTS API call on the last bot message
        const lastMessage = messages[messages.length - 1]
        if (lastMessage && lastMessage.type === 'apiMessage' && lastMessage.message) {
            try {
                await handleTTSClick(lastMessage.id, lastMessage.message)
            } catch (fallbackError) {
                console.error('TTS fallback also failed:', fallbackError)
                enqueueSnackbar({
                    message: 'Auto-play audio failed',
                    options: { variant: 'error' }
                })
            }
        }
    }
}
const getInputDisabled = () => {
return (
loading ||