Set up streaming response for text to speech audio
This commit is contained in:
parent
08f0d7597d
commit
b30e4a9da8
|
|
@ -442,6 +442,8 @@ export interface IServerSideEventStreamer {
|
||||||
streamEndEvent(chatId: string): void
|
streamEndEvent(chatId: string): void
|
||||||
streamUsageMetadataEvent(chatId: string, data: any): void
|
streamUsageMetadataEvent(chatId: string, data: any): void
|
||||||
streamAudioEvent(chatId: string, audioData: string): void
|
streamAudioEvent(chatId: string, audioData: string): void
|
||||||
|
streamTTSDataEvent(chatId: string, audioChunk: string): void
|
||||||
|
streamTTSEndEvent(chatId: string): void
|
||||||
}
|
}
|
||||||
|
|
||||||
export enum FollowUpPromptProvider {
|
export enum FollowUpPromptProvider {
|
||||||
|
|
|
||||||
|
|
@ -2,12 +2,111 @@ import { ICommonObject } from './Interface'
|
||||||
import { getCredentialData } from './utils'
|
import { getCredentialData } from './utils'
|
||||||
import OpenAI from 'openai'
|
import OpenAI from 'openai'
|
||||||
import { ElevenLabsClient } from '@elevenlabs/elevenlabs-js'
|
import { ElevenLabsClient } from '@elevenlabs/elevenlabs-js'
|
||||||
|
import { Readable } from 'node:stream'
|
||||||
|
import type { ReadableStream } from 'node:stream/web'
|
||||||
|
|
||||||
const TextToSpeechType = {
|
const TextToSpeechType = {
|
||||||
OPENAI_TTS: 'openai',
|
OPENAI_TTS: 'openai',
|
||||||
ELEVEN_LABS_TTS: 'elevenlabs'
|
ELEVEN_LABS_TTS: 'elevenlabs'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Voice identifiers accepted by the OpenAI speech request issued below. */
type OpenAITTSVoice = 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'fable' | 'nova' | 'onyx' | 'sage' | 'shimmer'

/**
 * Normalises whatever "stream-like" value a TTS SDK hands back into a Node.js Readable.
 *
 * Accepted shapes, checked in this order:
 *  - a Node.js `Readable` (returned unchanged)
 *  - a web `ReadableStream` (anything exposing `getReader`), bridged with `Readable.fromWeb`
 *  - a fetch-style response object whose stream lives on `.body` (unwrapped once)
 *  - any async iterable of binary chunks, wrapped with `Readable.from`
 *
 * @throws Error('Failed to get response stream') when none of the shapes match.
 */
const toNodeReadable = (source: unknown, unwrapBody = true): Readable => {
    if (source instanceof Readable) return source

    if (source !== null && typeof source === 'object') {
        const candidate = source as { getReader?: unknown; body?: unknown; [Symbol.asyncIterator]?: unknown }

        if (typeof candidate.getReader === 'function') {
            return Readable.fromWeb(source as ReadableStream)
        }
        // Only unwrap a single level so a self-referencing `body` can never recurse forever.
        if (unwrapBody && candidate.body != null) {
            return toNodeReadable(candidate.body, false)
        }
        if (typeof candidate[Symbol.asyncIterator] === 'function') {
            return Readable.from(source as AsyncIterable<Uint8Array>)
        }
    }

    throw new Error('Failed to get response stream')
}

/**
 * Drains `stream`, forwarding every chunk to `onChunk` as a Buffer.
 * Resolves once the stream has ended; rejects on a stream error or when `onChunk` throws.
 * Leaving the `for await` loop early (through a throw) destroys the stream.
 */
const pumpAudioStream = async (stream: Readable, onChunk: (chunk: Buffer) => void): Promise<void> => {
    for await (const chunk of stream) {
        onChunk(Buffer.from(chunk))
    }
}

/**
 * Synthesises `text` with the configured text-to-speech provider and streams the audio to the caller.
 *
 * @param text - The text to convert to speech.
 * @param textToSpeechConfig - Provider configuration; `name` selects the provider, `credentialId` the
 *   credential to load, and `voice` optionally overrides the provider's default voice.
 * @param options - Passed through to `getCredentialData`.
 * @param onChunk - Called once per audio chunk, in arrival order.
 * @param onEnd - Called once after the last chunk has been delivered. Not called on failure.
 * @returns A promise that resolves after `onEnd` has run.
 * @throws Error when no configuration is supplied, when the provider name is not supported, when the
 *   provider response cannot be read as a stream, or when the provider/stream/callbacks fail.
 */
export const convertTextToSpeechStream = async (
    text: string,
    textToSpeechConfig: ICommonObject,
    options: ICommonObject,
    onChunk: (chunk: Buffer) => void,
    onEnd: () => void
): Promise<void> => {
    if (!textToSpeechConfig) {
        throw new Error('Text to speech is not selected. Please configure TTS in the chatflow.')
    }

    const credentialId = textToSpeechConfig.credentialId as string
    const credentialData = await getCredentialData(credentialId ?? '', options)

    let audio: unknown

    switch (textToSpeechConfig.name) {
        case TextToSpeechType.OPENAI_TTS: {
            const openai = new OpenAI({
                apiKey: credentialData.openAIApiKey
            })

            // NOTE(review): the SDK is expected to resolve to a fetch-style Response here, so the
            // audio stream is read from `.body` by toNodeReadable rather than from the object itself.
            audio = await openai.audio.speech.create({
                model: 'gpt-4o-mini-tts',
                voice: (textToSpeechConfig.voice || 'alloy') as OpenAITTSVoice,
                input: text,
                response_format: 'wav'
            })
            break
        }

        case TextToSpeechType.ELEVEN_LABS_TTS: {
            const client = new ElevenLabsClient({
                apiKey: credentialData.elevenLabsApiKey
            })

            audio = await client.textToSpeech.stream(textToSpeechConfig.voice || '21m00Tcm4TlvDq8ikWAM', {
                text: text,
                modelId: 'eleven_multilingual_v2'
            })
            break
        }

        default:
            // Without this branch an unknown provider would leave the caller waiting forever.
            throw new Error(`Unsupported text to speech provider: ${textToSpeechConfig.name}`)
    }

    await pumpAudioStream(toNodeReadable(audio), onChunk)
    onEnd()
}
|
||||||
|
|
||||||
export const convertTextToSpeech = async (text: string, textToSpeechConfig: ICommonObject, options: ICommonObject): Promise<Buffer> => {
|
export const convertTextToSpeech = async (text: string, textToSpeechConfig: ICommonObject, options: ICommonObject): Promise<Buffer> => {
|
||||||
if (textToSpeechConfig) {
|
if (textToSpeechConfig) {
|
||||||
const credentialId = textToSpeechConfig.credentialId as string
|
const credentialId = textToSpeechConfig.credentialId as string
|
||||||
|
|
@ -20,10 +119,20 @@ export const convertTextToSpeech = async (text: string, textToSpeechConfig: ICom
|
||||||
})
|
})
|
||||||
|
|
||||||
const response = await openai.audio.speech.create({
|
const response = await openai.audio.speech.create({
|
||||||
model: textToSpeechConfig.model || 'tts-1',
|
model: 'gpt-4o-mini-tts',
|
||||||
voice: (textToSpeechConfig.voice || 'alloy') as 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer',
|
voice: (textToSpeechConfig.voice || 'alloy') as
|
||||||
|
| 'alloy'
|
||||||
|
| 'ash'
|
||||||
|
| 'ballad'
|
||||||
|
| 'coral'
|
||||||
|
| 'echo'
|
||||||
|
| 'fable'
|
||||||
|
| 'nova'
|
||||||
|
| 'onyx'
|
||||||
|
| 'sage'
|
||||||
|
| 'shimmer',
|
||||||
input: text,
|
input: text,
|
||||||
response_format: 'mp3'
|
response_format: 'wav'
|
||||||
})
|
})
|
||||||
|
|
||||||
const audioBuffer = Buffer.from(await response.arrayBuffer())
|
const audioBuffer = Buffer.from(await response.arrayBuffer())
|
||||||
|
|
@ -35,9 +144,9 @@ export const convertTextToSpeech = async (text: string, textToSpeechConfig: ICom
|
||||||
apiKey: credentialData.elevenLabsApiKey
|
apiKey: credentialData.elevenLabsApiKey
|
||||||
})
|
})
|
||||||
|
|
||||||
const audioStream = await client.textToSpeech.convert(textToSpeechConfig.voice || '21m00Tcm4TlvDq8ikWAM', {
|
const audioStream = await client.textToSpeech.stream(textToSpeechConfig.voice || '21m00Tcm4TlvDq8ikWAM', {
|
||||||
text: text,
|
text: text,
|
||||||
modelId: 'eleven_monolingual_v1'
|
modelId: 'eleven_multilingual_v2'
|
||||||
})
|
})
|
||||||
|
|
||||||
// Convert the audio stream to buffer
|
// Convert the audio stream to buffer
|
||||||
|
|
@ -73,10 +182,14 @@ export const getVoices = async (provider: string, credentialId: string, options:
|
||||||
case TextToSpeechType.OPENAI_TTS:
|
case TextToSpeechType.OPENAI_TTS:
|
||||||
return [
|
return [
|
||||||
{ id: 'alloy', name: 'Alloy' },
|
{ id: 'alloy', name: 'Alloy' },
|
||||||
|
{ id: 'ash', name: 'Ash' },
|
||||||
|
{ id: 'ballad', name: 'Ballad' },
|
||||||
|
{ id: 'coral', name: 'Coral' },
|
||||||
{ id: 'echo', name: 'Echo' },
|
{ id: 'echo', name: 'Echo' },
|
||||||
{ id: 'fable', name: 'Fable' },
|
{ id: 'fable', name: 'Fable' },
|
||||||
{ id: 'onyx', name: 'Onyx' },
|
|
||||||
{ id: 'nova', name: 'Nova' },
|
{ id: 'nova', name: 'Nova' },
|
||||||
|
{ id: 'onyx', name: 'Onyx' },
|
||||||
|
{ id: 'sage', name: 'Sage' },
|
||||||
{ id: 'shimmer', name: 'Shimmer' }
|
{ id: 'shimmer', name: 'Shimmer' }
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue