Set up streaming response for text to speech audio

2025-08-20 12:52:36 +05:30 · 2025-08-20 12:52:36 +05:30 · b30e4a9da8
parent 08f0d7597d
commit b30e4a9da8
2 changed files with 121 additions and 6 deletions
--- a/packages/components/src/Interface.ts
+++ b/packages/components/src/Interface.ts
@ -442,6 +442,8 @@ export interface IServerSideEventStreamer {
    streamEndEvent(chatId: string): void
    streamUsageMetadataEvent(chatId: string, data: any): void
    streamAudioEvent(chatId: string, audioData: string): void
    streamTTSDataEvent(chatId: string, audioChunk: string): void
    streamTTSEndEvent(chatId: string): void
 }
 export enum FollowUpPromptProvider {
--- a/packages/components/src/textToSpeech.ts
+++ b/packages/components/src/textToSpeech.ts
@ -2,12 +2,111 @@ import { ICommonObject } from './Interface'
 import { getCredentialData } from './utils'
 import OpenAI from 'openai'
 import { ElevenLabsClient } from '@elevenlabs/elevenlabs-js'
 import { Readable } from 'node:stream'
 import type { ReadableStream } from 'node:stream/web'
 // Provider identifiers for the supported text-to-speech backends.
 // The streaming converter in this file switches on `textToSpeechConfig.name`
 // using these values to decide which SDK client to construct.
 const TextToSpeechType = {
    OPENAI_TTS: 'openai',
    ELEVEN_LABS_TTS: 'elevenlabs'
 }
 /**
  * Normalises whatever a TTS SDK call returned into a Node.js `Readable`.
  *
  * Accepted shapes:
  *  - a Node.js `Readable` (returned unchanged),
  *  - a web `ReadableStream` (anything exposing `getReader()`), adapted with `Readable.fromWeb`,
  *  - a fetch-style response object that carries the stream on its `body` property.
  *
  * @throws Error('Failed to get response stream') when no stream can be located.
  */
 const toTtsNodeReadable = (source: unknown): Readable => {
    if (source instanceof Readable) {
        return source
    }
    if (typeof source === 'object' && source !== null) {
        if (typeof (source as { getReader?: unknown }).getReader === 'function') {
            return Readable.fromWeb(source as ReadableStream)
        }
        // `Readable.fromWeb` only accepts a real web stream, so a response wrapper
        // has to be unwrapped to its body first.
        if ('body' in source) {
            return toTtsNodeReadable((source as { body?: unknown }).body)
        }
    }
    throw new Error('Failed to get response stream')
 }

 /**
  * Forwards every chunk of `stream` to `onChunk` as a `Buffer`, then calls `onEnd` once.
  *
  * The returned promise resolves after `onEnd` has run and rejects when the stream
  * errors or when either callback throws.
  */
 const forwardTtsAudioStream = (stream: Readable, onChunk: (chunk: Buffer) => void, onEnd: () => void): Promise<void> =>
    new Promise<void>((resolve, reject) => {
        stream.on('data', (chunk) => {
            try {
                onChunk(Buffer.from(chunk))
            } catch (err) {
                // Route callback failures through the stream's error path so the
                // source is torn down and the promise rejects instead of crashing.
                stream.destroy(err instanceof Error ? err : new Error(String(err)))
            }
        })
        stream.on('end', () => {
            try {
                onEnd()
                resolve()
            } catch (err) {
                reject(err)
            }
        })
        stream.on('error', reject)
    })

 /**
  * Synthesises `text` with the configured provider and streams the audio to the caller.
  *
  * @param text - Text to synthesise.
  * @param textToSpeechConfig - Provider selection; reads `name`, `credentialId` and `voice`.
  * @param options - Passed through to `getCredentialData` for credential lookup.
  * @param onChunk - Invoked with each audio chunk as it arrives.
  * @param onEnd - Invoked once after the final chunk.
  * @returns A promise that resolves after `onEnd` has been called.
  * @throws Error when no TTS config is supplied, when `textToSpeechConfig.name` is not a
  *         supported provider, when the provider response carries no stream, or when the
  *         provider call / stream fails.
  */
 export const convertTextToSpeechStream = async (
    text: string,
    textToSpeechConfig: ICommonObject,
    options: ICommonObject,
    onChunk: (chunk: Buffer) => void,
    onEnd: () => void
 ): Promise<void> => {
    if (!textToSpeechConfig) {
        throw new Error('Text to speech is not selected. Please configure TTS in the chatflow.')
    }

    const credentialId = textToSpeechConfig.credentialId as string
    const credentialData = await getCredentialData(credentialId ?? '', options)

    let providerResponse: unknown
    switch (textToSpeechConfig.name) {
        case TextToSpeechType.OPENAI_TTS: {
            const openai = new OpenAI({
                apiKey: credentialData.openAIApiKey
            })
            // NOTE(review): the OpenAI SDK resolves this call to a fetch-style Response;
            // the audio stream is taken from its `body` by toTtsNodeReadable.
            providerResponse = await openai.audio.speech.create({
                model: 'gpt-4o-mini-tts',
                voice: (textToSpeechConfig.voice || 'alloy') as
                    | 'alloy'
                    | 'ash'
                    | 'ballad'
                    | 'coral'
                    | 'echo'
                    | 'fable'
                    | 'nova'
                    | 'onyx'
                    | 'sage'
                    | 'shimmer',
                input: text,
                response_format: 'wav'
            })
            break
        }
        case TextToSpeechType.ELEVEN_LABS_TTS: {
            const client = new ElevenLabsClient({
                apiKey: credentialData.elevenLabsApiKey
            })
            providerResponse = await client.textToSpeech.stream(textToSpeechConfig.voice || '21m00Tcm4TlvDq8ikWAM', {
                text: text,
                modelId: 'eleven_multilingual_v2'
            })
            break
        }
        default:
            // Without this branch an unrecognised provider left the promise pending forever.
            throw new Error(`Unsupported text to speech provider: ${textToSpeechConfig.name}`)
    }

    await forwardTtsAudioStream(toTtsNodeReadable(providerResponse), onChunk, onEnd)
 }
 export const convertTextToSpeech = async (text: string, textToSpeechConfig: ICommonObject, options: ICommonObject): Promise<Buffer> => {
    if (textToSpeechConfig) {
        const credentialId = textToSpeechConfig.credentialId as string
@ -20,10 +119,20 @@ export const convertTextToSpeech = async (text: string, textToSpeechConfig: ICom
                })
                const response = await openai.audio.speech.create({
-                    model: textToSpeechConfig.model || 'tts-1',
+                    model: 'gpt-4o-mini-tts',
-                    voice: (textToSpeechConfig.voice || 'alloy') as 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer',
+                    voice: (textToSpeechConfig.voice || 'alloy') as
                        | 'alloy'
                        | 'ash'
                        | 'ballad'
                        | 'coral'
                        | 'echo'
                        | 'fable'
                        | 'nova'
                        | 'onyx'
                        | 'sage'
                        | 'shimmer',
                    input: text,
-                    response_format: 'mp3'
+                    response_format: 'wav'
                })
                const audioBuffer = Buffer.from(await response.arrayBuffer())
@ -35,9 +144,9 @@ export const convertTextToSpeech = async (text: string, textToSpeechConfig: ICom
                    apiKey: credentialData.elevenLabsApiKey
                })
-                const audioStream = await client.textToSpeech.convert(textToSpeechConfig.voice || '21m00Tcm4TlvDq8ikWAM', {
+                const audioStream = await client.textToSpeech.stream(textToSpeechConfig.voice || '21m00Tcm4TlvDq8ikWAM', {
                    text: text,
-                    modelId: 'eleven_monolingual_v1'
+                    modelId: 'eleven_multilingual_v2'
                })
                // Convert the audio stream to buffer
@ -73,10 +182,14 @@ export const getVoices = async (provider: string, credentialId: string, options:
        case TextToSpeechType.OPENAI_TTS:
            return [
                { id: 'alloy', name: 'Alloy' },
                { id: 'ash', name: 'Ash' },
                { id: 'ballad', name: 'Ballad' },
                { id: 'coral', name: 'Coral' },
                { id: 'echo', name: 'Echo' },
                { id: 'fable', name: 'Fable' },
                { id: 'onyx', name: 'Onyx' },
                { id: 'nova', name: 'Nova' },
                { id: 'onyx', name: 'Onyx' },
                { id: 'sage', name: 'Sage' },
                { id: 'shimmer', name: 'Shimmer' }
            ]