Fix issues with TTS - openai voices, streaming audio, rate limiting, speed of speech
This commit is contained in:
parent
aa357c8373
commit
ad44c7b661
|
|
@ -442,6 +442,7 @@ export interface IServerSideEventStreamer {
|
|||
streamEndEvent(chatId: string): void
|
||||
streamUsageMetadataEvent(chatId: string, data: any): void
|
||||
streamAudioEvent(chatId: string, audioData: string): void
|
||||
streamTTSStartEvent(chatId: string, format: string): void
|
||||
streamTTSDataEvent(chatId: string, audioChunk: string): void
|
||||
streamTTSEndEvent(chatId: string): void
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,163 +15,136 @@ export const convertTextToSpeechStream = async (
|
|||
textToSpeechConfig: ICommonObject,
|
||||
options: ICommonObject,
|
||||
onChunk: (chunk: Buffer) => void,
|
||||
onEnd: () => void
|
||||
onEnd: () => void,
|
||||
onStart?: (format: string) => void
|
||||
): Promise<void> => {
|
||||
return new Promise<void>(async (resolve, reject) => {
|
||||
try {
|
||||
if (textToSpeechConfig) {
|
||||
const credentialId = textToSpeechConfig.credentialId as string
|
||||
const credentialData = await getCredentialData(credentialId ?? '', options)
|
||||
return new Promise<void>((resolve, reject) => {
|
||||
const processStream = async () => {
|
||||
try {
|
||||
if (textToSpeechConfig) {
|
||||
const credentialId = textToSpeechConfig.credentialId as string
|
||||
const credentialData = await getCredentialData(credentialId ?? '', options)
|
||||
|
||||
switch (textToSpeechConfig.name) {
|
||||
case TextToSpeechType.OPENAI_TTS: {
|
||||
const openai = new OpenAI({
|
||||
apiKey: credentialData.openAIApiKey
|
||||
})
|
||||
switch (textToSpeechConfig.name) {
|
||||
case TextToSpeechType.OPENAI_TTS: {
|
||||
if (onStart) onStart('mp3')
|
||||
|
||||
const response = await openai.audio.speech.create({
|
||||
model: 'gpt-4o-mini-tts',
|
||||
voice: (textToSpeechConfig.voice || 'alloy') as
|
||||
| 'alloy'
|
||||
| 'ash'
|
||||
| 'ballad'
|
||||
| 'coral'
|
||||
| 'echo'
|
||||
| 'fable'
|
||||
| 'nova'
|
||||
| 'onyx'
|
||||
| 'sage'
|
||||
| 'shimmer',
|
||||
input: text,
|
||||
response_format: 'wav'
|
||||
})
|
||||
const openai = new OpenAI({
|
||||
apiKey: credentialData.openAIApiKey
|
||||
})
|
||||
|
||||
const stream = Readable.fromWeb(response as unknown as ReadableStream)
|
||||
if (!stream) {
|
||||
throw new Error('Failed to get response stream')
|
||||
const response = await openai.audio.speech.create({
|
||||
model: 'gpt-4o-mini-tts',
|
||||
voice: (textToSpeechConfig.voice || 'alloy') as
|
||||
| 'alloy'
|
||||
| 'ash'
|
||||
| 'ballad'
|
||||
| 'coral'
|
||||
| 'echo'
|
||||
| 'fable'
|
||||
| 'nova'
|
||||
| 'onyx'
|
||||
| 'sage'
|
||||
| 'shimmer',
|
||||
input: text,
|
||||
response_format: 'mp3'
|
||||
})
|
||||
|
||||
const stream = response.body as unknown as Readable
|
||||
if (!stream) {
|
||||
throw new Error('Failed to get response stream')
|
||||
}
|
||||
|
||||
await processStreamWithRateLimit(stream, onChunk, onEnd, resolve, reject, 640, 20)
|
||||
break
|
||||
}
|
||||
|
||||
stream.on('data', (chunk) => {
|
||||
onChunk(Buffer.from(chunk))
|
||||
})
|
||||
case TextToSpeechType.ELEVEN_LABS_TTS: {
|
||||
if (onStart) onStart('mp3')
|
||||
|
||||
stream.on('end', () => {
|
||||
onEnd()
|
||||
resolve()
|
||||
})
|
||||
const client = new ElevenLabsClient({
|
||||
apiKey: credentialData.elevenLabsApiKey
|
||||
})
|
||||
|
||||
stream.on('error', (error) => {
|
||||
reject(error)
|
||||
})
|
||||
const response = await client.textToSpeech.stream(textToSpeechConfig.voice || '21m00Tcm4TlvDq8ikWAM', {
|
||||
text: text,
|
||||
modelId: 'eleven_multilingual_v2'
|
||||
})
|
||||
|
||||
break
|
||||
}
|
||||
const stream = Readable.fromWeb(response as unknown as ReadableStream)
|
||||
if (!stream) {
|
||||
throw new Error('Failed to get response stream')
|
||||
}
|
||||
|
||||
case TextToSpeechType.ELEVEN_LABS_TTS: {
|
||||
const client = new ElevenLabsClient({
|
||||
apiKey: credentialData.elevenLabsApiKey
|
||||
})
|
||||
|
||||
const response = await client.textToSpeech.stream(textToSpeechConfig.voice || '21m00Tcm4TlvDq8ikWAM', {
|
||||
text: text,
|
||||
modelId: 'eleven_multilingual_v2'
|
||||
})
|
||||
|
||||
const stream = Readable.fromWeb(response as unknown as ReadableStream)
|
||||
if (!stream) {
|
||||
throw new Error('Failed to get response stream')
|
||||
await processStreamWithRateLimit(stream, onChunk, onEnd, resolve, reject, 640, 40)
|
||||
break
|
||||
}
|
||||
|
||||
stream.on('data', (chunk) => {
|
||||
onChunk(Buffer.from(chunk))
|
||||
})
|
||||
|
||||
stream.on('end', () => {
|
||||
onEnd()
|
||||
resolve()
|
||||
})
|
||||
|
||||
stream.on('error', (error) => {
|
||||
reject(error)
|
||||
})
|
||||
|
||||
break
|
||||
}
|
||||
} else {
|
||||
reject(new Error('Text to speech is not selected. Please configure TTS in the chatflow.'))
|
||||
}
|
||||
} else {
|
||||
reject(new Error('Text to speech is not selected. Please configure TTS in the chatflow.'))
|
||||
} catch (error) {
|
||||
reject(error)
|
||||
}
|
||||
} catch (error) {
|
||||
reject(error)
|
||||
}
|
||||
|
||||
processStream()
|
||||
})
|
||||
}
|
||||
|
||||
export const convertTextToSpeech = async (text: string, textToSpeechConfig: ICommonObject, options: ICommonObject): Promise<Buffer> => {
|
||||
if (textToSpeechConfig) {
|
||||
const credentialId = textToSpeechConfig.credentialId as string
|
||||
const credentialData = await getCredentialData(credentialId ?? '', options)
|
||||
/**
 * Re-chunks an audio stream into fixed-size pieces and forwards them to
 * `onChunk` at a steady pace.
 *
 * Incoming data is accumulated in an in-memory buffer. A background loop cuts
 * `targetChunkSize`-byte slices from the front of that buffer, hands each one
 * to `onChunk`, and pauses `rateLimitMs` between slices. Once the stream has
 * finished, whatever is left (shorter than one slice) is flushed, then `onEnd`
 * and `resolve` are called exactly once.
 *
 * The function returns as soon as the listeners are attached; completion is
 * signalled through `resolve` / `reject`, not through the returned promise.
 *
 * @param stream          Source of audio bytes.
 * @param onChunk         Receives each slice. The slice is a view onto the internal
 *                        buffer, so consume or copy it synchronously.
 * @param onEnd           Called once after the last slice has been delivered.
 * @param resolve         Called right after `onEnd` on success.
 * @param reject          Called with the error if the stream fails, a callback throws,
 *                        or `targetChunkSize` is not a positive number.
 * @param targetChunkSize Slice size in bytes (default 640).
 * @param rateLimitMs     Pause between slices, and polling interval while waiting
 *                        for more data, in milliseconds (default 20).
 */
const processStreamWithRateLimit = async (
    stream: Readable,
    onChunk: (chunk: Buffer) => void,
    onEnd: () => void,
    resolve: () => void,
    reject: (error: any) => void,
    targetChunkSize: number = 640,
    rateLimitMs: number = 20
) => {
    // A zero/negative/NaN slice size would never shrink the buffer and loop forever.
    if (!(targetChunkSize > 0)) {
        reject(new RangeError(`targetChunkSize must be a positive number, received ${targetChunkSize}`))
        return
    }

    let buffer: Buffer = Buffer.alloc(0)
    let isEnded = false
    let hasFailed = false

    const processChunks = async () => {
        while (!hasFailed && (!isEnded || buffer.length > 0)) {
            if (buffer.length >= targetChunkSize) {
                const chunk = buffer.subarray(0, targetChunkSize)
                buffer = buffer.subarray(targetChunkSize)
                onChunk(chunk)
                await sleep(rateLimitMs)
            } else if (isEnded) {
                // Stream is done: flush the final, shorter-than-target remainder.
                onChunk(buffer)
                buffer = Buffer.alloc(0)
            } else {
                // Not enough data yet; poll again after the rate-limit interval.
                await sleep(rateLimitMs)
            }
        }

        // After a stream error the caller has already been rejected; do not
        // report a successful end on top of that.
        if (hasFailed) return

        onEnd()
        resolve()
    }

    stream.on('error', (error) => {
        // Stop the pacing loop, otherwise it would keep polling forever.
        hasFailed = true
        reject(error)
    })

    stream.on('data', (chunk) => {
        buffer = Buffer.concat([buffer, Buffer.from(chunk)])
    })

    stream.on('end', () => {
        isEnded = true
    })

    // A stream destroyed without 'end' (e.g. an aborted request) only emits
    // 'close'; treat it as finished so the loop can flush and terminate.
    stream.on('close', () => {
        isEnded = true
    })

    processChunks().catch((error) => {
        hasFailed = true
        reject(error)
    })
}

/**
 * Resolves after `ms` milliseconds.
 */
const sleep = (ms: number): Promise<void> => {
    return new Promise((resolve) => setTimeout(resolve, ms))
}
|
||||
|
||||
export const getVoices = async (provider: string, credentialId: string, options: ICommonObject) => {
|
||||
|
|
|
|||
|
|
@ -67,7 +67,6 @@ const generateTextToSpeech = async (req: Request, res: Response) => {
|
|||
res.write(`data: ${JSON.stringify(clientResponse)}\n\n`)
|
||||
},
|
||||
async () => {
|
||||
// Send end event
|
||||
const endResponse = {
|
||||
event: 'tts_end',
|
||||
data: {}
|
||||
|
|
@ -75,6 +74,14 @@ const generateTextToSpeech = async (req: Request, res: Response) => {
|
|||
res.write('event: tts_end\n')
|
||||
res.write(`data: ${JSON.stringify(endResponse)}\n\n`)
|
||||
res.end()
|
||||
},
|
||||
(format: string) => {
|
||||
const startResponse = {
|
||||
event: 'tts_start',
|
||||
data: { format }
|
||||
}
|
||||
res.write('event: tts_start\n')
|
||||
res.write(`data: ${JSON.stringify(startResponse)}\n\n`)
|
||||
}
|
||||
)
|
||||
} catch (error) {
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import { StatusCodes } from 'http-status-codes'
|
|||
import { getRunningExpressApp } from '../../utils/getRunningExpressApp'
|
||||
import { InternalFlowiseError } from '../../errors/internalFlowiseError'
|
||||
import { getErrorMessage } from '../../errors/utils'
|
||||
import { convertTextToSpeech, getVoices } from 'flowise-components'
|
||||
import { getVoices } from 'flowise-components'
|
||||
import { databaseEntities } from '../../utils'
|
||||
|
||||
export enum TextToSpeechProvider {
|
||||
|
|
@ -23,51 +23,8 @@ export interface TTSResponse {
|
|||
contentType: string
|
||||
}
|
||||
|
||||
const generateTextToSpeech = async (request: TTSRequest): Promise<TTSResponse> => {
|
||||
try {
|
||||
const appServer = getRunningExpressApp()
|
||||
const options = {
|
||||
orgId: '',
|
||||
chatflowid: '',
|
||||
chatId: '',
|
||||
appDataSource: appServer.AppDataSource,
|
||||
databaseEntities: databaseEntities
|
||||
}
|
||||
|
||||
const textToSpeechConfig = {
|
||||
name: request.provider,
|
||||
credentialId: request.credentialId,
|
||||
voice: request.voice,
|
||||
model: request.model
|
||||
}
|
||||
|
||||
const audioBuffer = await convertTextToSpeech(request.text, textToSpeechConfig, options)
|
||||
|
||||
return {
|
||||
audioBuffer,
|
||||
contentType: 'audio/mpeg'
|
||||
}
|
||||
} catch (error) {
|
||||
throw new InternalFlowiseError(
|
||||
StatusCodes.INTERNAL_SERVER_ERROR,
|
||||
`Error: textToSpeechService.generateTextToSpeech - ${getErrorMessage(error)}`
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
const getVoicesForProvider = async (provider: string, credentialId?: string): Promise<any[]> => {
|
||||
try {
|
||||
if (provider === TextToSpeechProvider.OPENAI) {
|
||||
return [
|
||||
{ id: 'alloy', name: 'Alloy' },
|
||||
{ id: 'echo', name: 'Echo' },
|
||||
{ id: 'fable', name: 'Fable' },
|
||||
{ id: 'onyx', name: 'Onyx' },
|
||||
{ id: 'nova', name: 'Nova' },
|
||||
{ id: 'shimmer', name: 'Shimmer' }
|
||||
]
|
||||
}
|
||||
|
||||
if (!credentialId) {
|
||||
throw new InternalFlowiseError(StatusCodes.BAD_REQUEST, 'Credential ID required for this provider')
|
||||
}
|
||||
|
|
@ -91,6 +48,5 @@ const getVoicesForProvider = async (provider: string, credentialId?: string): Pr
|
|||
}
|
||||
|
||||
export default {
|
||||
generateTextToSpeech,
|
||||
getVoices: getVoicesForProvider
|
||||
}
|
||||
|
|
|
|||
|
|
@ -269,10 +269,19 @@ export class SSEStreamer implements IServerSideEventStreamer {
|
|||
}
|
||||
}
|
||||
|
||||
streamTTSStartEvent(chatId: string, format: string): void {
|
||||
const client = this.clients[chatId]
|
||||
if (client) {
|
||||
const clientResponse = {
|
||||
event: 'tts_start',
|
||||
data: { format }
|
||||
}
|
||||
client.response.write('message:\ndata:' + JSON.stringify(clientResponse) + '\n\n')
|
||||
}
|
||||
}
|
||||
|
||||
streamTTSDataEvent(chatId: string, audioChunk: string): void {
|
||||
const client = this.clients[chatId]
|
||||
console.log('clients', this.clients)
|
||||
console.log('client', client)
|
||||
if (client) {
|
||||
const clientResponse = {
|
||||
event: 'tts_data',
|
||||
|
|
@ -284,8 +293,6 @@ export class SSEStreamer implements IServerSideEventStreamer {
|
|||
|
||||
streamTTSEndEvent(chatId: string): void {
|
||||
const client = this.clients[chatId]
|
||||
console.log('clients', this.clients)
|
||||
console.log('client', client)
|
||||
if (client) {
|
||||
const clientResponse = {
|
||||
event: 'tts_end',
|
||||
|
|
|
|||
|
|
@ -6,7 +6,6 @@ import { omit } from 'lodash'
|
|||
import {
|
||||
IFileUpload,
|
||||
convertSpeechToText,
|
||||
convertTextToSpeech,
|
||||
convertTextToSpeechStream,
|
||||
ICommonObject,
|
||||
addSingleFileToStorage,
|
||||
|
|
@ -123,11 +122,13 @@ const generateTTSForResponseStream = async (
|
|||
options,
|
||||
(chunk: Buffer) => {
|
||||
const audioBase64 = chunk.toString('base64')
|
||||
logger.info(`Received TTS chunk: ${audioBase64}`)
|
||||
sseStreamer.streamTTSDataEvent(chatId, audioBase64)
|
||||
},
|
||||
() => {
|
||||
sseStreamer.streamTTSEndEvent(chatId)
|
||||
},
|
||||
(format: string) => {
|
||||
sseStreamer.streamTTSStartEvent(chatId, format)
|
||||
}
|
||||
)
|
||||
} catch (error) {
|
||||
|
|
|
|||
|
|
@ -58,30 +58,10 @@ const textToSpeechProviders = {
|
|||
{
|
||||
label: 'Voice',
|
||||
name: 'voice',
|
||||
type: 'options',
|
||||
type: 'voice_select',
|
||||
description: 'The voice to use when generating the audio',
|
||||
options: [
|
||||
{ label: 'Alloy', name: 'alloy' },
|
||||
{ label: 'Echo', name: 'echo' },
|
||||
{ label: 'Fable', name: 'fable' },
|
||||
{ label: 'Onyx', name: 'onyx' },
|
||||
{ label: 'Nova', name: 'nova' },
|
||||
{ label: 'Shimmer', name: 'shimmer' }
|
||||
],
|
||||
default: 'alloy',
|
||||
optional: true
|
||||
},
|
||||
{
|
||||
label: 'Model',
|
||||
name: 'model',
|
||||
type: 'options',
|
||||
description: 'The TTS model to use',
|
||||
options: [
|
||||
{ label: 'TTS-1', name: 'tts-1' },
|
||||
{ label: 'TTS-1 HD', name: 'tts-1-hd' }
|
||||
],
|
||||
default: 'tts-1',
|
||||
optional: true
|
||||
}
|
||||
]
|
||||
},
|
||||
|
|
|
|||
|
|
@ -257,6 +257,16 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
|
|||
const [ttsAudio, setTtsAudio] = useState({})
|
||||
const [isTTSEnabled, setIsTTSEnabled] = useState(false)
|
||||
|
||||
// TTS streaming state
|
||||
const [ttsStreamingState, setTtsStreamingState] = useState({
|
||||
mediaSource: null,
|
||||
sourceBuffer: null,
|
||||
audio: null,
|
||||
chunkQueue: [],
|
||||
isBuffering: false,
|
||||
audioFormat: null
|
||||
})
|
||||
|
||||
const isFileAllowedForUpload = (file) => {
|
||||
const constraints = getAllowChatFlowUploads.data
|
||||
/**
|
||||
|
|
@ -1042,6 +1052,15 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
|
|||
case 'audio':
|
||||
handleAutoPlayAudio(payload.data)
|
||||
break
|
||||
case 'tts_start':
|
||||
handleTTSStart(payload.data.format)
|
||||
break
|
||||
case 'tts_data':
|
||||
handleTTSDataChunk(payload.data)
|
||||
break
|
||||
case 'tts_end':
|
||||
handleTTSEnd()
|
||||
break
|
||||
case 'end':
|
||||
setLocalStorageChatflow(chatflowid, chatId)
|
||||
closeResponse()
|
||||
|
|
@ -1588,6 +1607,19 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
|
|||
return
|
||||
}
|
||||
|
||||
// Use existing streaming infrastructure for manual TTS
|
||||
handleTTSStart('mp3', (audio) => {
|
||||
setTtsAudio((prev) => ({ ...prev, [messageId]: audio }))
|
||||
|
||||
audio.addEventListener('ended', () => {
|
||||
setTtsAudio((prev) => {
|
||||
const newState = { ...prev }
|
||||
delete newState[messageId]
|
||||
return newState
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
const response = await fetch('/api/v1/text-to-speech/generate', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
|
|
@ -1608,23 +1640,48 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
|
|||
throw new Error(`TTS request failed: ${response.status}`)
|
||||
}
|
||||
|
||||
const audioBuffer = await response.arrayBuffer()
|
||||
const audioBlob = new Blob([audioBuffer], { type: 'audio/mpeg' })
|
||||
const audioUrl = URL.createObjectURL(audioBlob)
|
||||
const audio = new Audio(audioUrl)
|
||||
const reader = response.body.getReader()
|
||||
const decoder = new TextDecoder()
|
||||
let buffer = ''
|
||||
|
||||
setTtsAudio((prev) => ({ ...prev, [messageId]: audio }))
|
||||
let done = false
|
||||
while (!done) {
|
||||
const result = await reader.read()
|
||||
done = result.done
|
||||
if (done) {
|
||||
break
|
||||
}
|
||||
const value = result.value
|
||||
|
||||
audio.addEventListener('ended', () => {
|
||||
setTtsAudio((prev) => {
|
||||
const newState = { ...prev }
|
||||
delete newState[messageId]
|
||||
return newState
|
||||
})
|
||||
URL.revokeObjectURL(audioUrl)
|
||||
})
|
||||
// Decode the chunk as text and add to buffer
|
||||
const chunk = decoder.decode(value, { stream: true })
|
||||
buffer += chunk
|
||||
|
||||
await audio.play()
|
||||
// Process complete SSE events
|
||||
const lines = buffer.split('\n\n')
|
||||
buffer = lines.pop() || '' // Keep incomplete event in buffer
|
||||
|
||||
for (const eventBlock of lines) {
|
||||
if (eventBlock.trim()) {
|
||||
const event = parseSSEEvent(eventBlock)
|
||||
if (event) {
|
||||
// Handle the event just like the SSE handler does
|
||||
switch (event.event) {
|
||||
case 'tts_start':
|
||||
break
|
||||
case 'tts_data':
|
||||
handleTTSDataChunk(event.data)
|
||||
break
|
||||
case 'tts_end':
|
||||
handleTTSEnd()
|
||||
break
|
||||
default:
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error with TTS:', error)
|
||||
enqueueSnackbar({
|
||||
|
|
@ -1671,6 +1728,268 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Parses one Server-Sent-Events block (the text between two blank lines).
 *
 * Reads the `event:` line as the event name and the `data:` line as JSON,
 * taking the JSON object's `data` property as the payload. If several lines of
 * the same kind are present, the last one wins.
 *
 * @param eventBlock Raw text of a single SSE event.
 * @returns `{ event, data }` when a payload was found, `{ event }` when only
 *          the name was found, or `null` when the block has no `event:` line.
 */
const parseSSEEvent = (eventBlock) => {
    let eventName = ''
    let payload
    let hasPayload = false

    for (const line of eventBlock.split('\n')) {
        if (line.startsWith('event:')) {
            eventName = line.substring(6).trim()
        } else if (line.startsWith('data:')) {
            const dataStr = line.substring(5).trim()
            try {
                const parsed = JSON.parse(dataStr)
                // Compare against null/undefined rather than truthiness so that
                // valid falsy payloads (e.g. an empty audio chunk '') survive.
                if (parsed.data !== undefined && parsed.data !== null) {
                    payload = parsed.data
                    hasPayload = true
                }
            } catch (e) {
                console.error('Error parsing SSE data:', e, 'Raw data:', dataStr)
            }
        }
    }

    if (!eventName) {
        return null
    }
    return hasPayload ? { event: eventName, data: payload } : { event: eventName }
}
|
||||
|
||||
/**
 * Creates a MediaSource-backed <audio> element for streamed TTS playback.
 *
 * When the MediaSource opens, a source buffer is added, the media source,
 * source buffer and audio element are stored in `ttsStreamingState`, playback
 * is started, and `onAudioReady` (if given) is called with the audio element.
 * When the audio element fires `ended`, the streaming state is cleaned up.
 *
 * @param format       Audio format announced by the server. Every format is
 *                     currently played as `audio/mpeg`, as before.
 * @param onAudioReady Optional callback receiving the audio element once the
 *                     source buffer has been created.
 */
const initializeTTSStreaming = (format, onAudioReady = null) => {
    // Declared outside the try blocks so the error handler below can log it;
    // a block-scoped declaration inside `try` is not visible in `catch`.
    const mimeType = 'audio/mpeg'

    try {
        const mediaSource = new MediaSource()
        const audio = new Audio()
        audio.src = URL.createObjectURL(mediaSource)

        mediaSource.addEventListener('sourceopen', () => {
            try {
                const sourceBuffer = mediaSource.addSourceBuffer(mimeType)

                setTtsStreamingState((prevState) => ({
                    ...prevState,
                    mediaSource,
                    sourceBuffer,
                    audio
                }))

                // Chunks may have been queued while the source was still opening;
                // start draining them now instead of waiting for the next chunk.
                setTimeout(() => processChunkQueue(), 0)

                // Start playback
                audio.play().catch((playError) => {
                    console.error('Error starting audio playback:', playError)
                })

                // Notify callback if provided
                if (onAudioReady) {
                    onAudioReady(audio)
                }
            } catch (error) {
                console.error('Error setting up source buffer:', error)
                console.error('MediaSource readyState:', mediaSource.readyState)
                console.error('Requested MIME type:', mimeType)
            }
        })

        audio.addEventListener('ended', () => {
            cleanupTTSStreaming()
        })
    } catch (error) {
        console.error('Error initializing TTS streaming:', error)
    }
}
|
||||
|
||||
/**
 * Stops streamed TTS playback and resets `ttsStreamingState` to its empty value.
 *
 * Pauses the current audio element, detaches and revokes its object URL, ends
 * the MediaSource if it is still open, and clears the chunk queue.
 */
const cleanupTTSStreaming = () => {
    setTtsStreamingState((prevState) => {
        if (prevState.audio) {
            prevState.audio.pause()
            // Read the URL before detaching it: once the attribute is removed
            // `audio.src` is empty and the object URL could no longer be revoked.
            const objectUrl = prevState.audio.src
            prevState.audio.removeAttribute('src')
            if (objectUrl) {
                URL.revokeObjectURL(objectUrl)
            }
        }

        if (prevState.mediaSource && prevState.mediaSource.readyState === 'open') {
            try {
                prevState.mediaSource.endOfStream()
            } catch (e) {
                // Ignore errors during cleanup
            }
        }

        return {
            mediaSource: null,
            sourceBuffer: null,
            audio: null,
            chunkQueue: [],
            isBuffering: false,
            audioFormat: null
        }
    })
}
|
||||
|
||||
/**
 * Appends the next queued audio chunk to the source buffer, if possible.
 *
 * Does nothing when there is no source buffer, the buffer is still updating,
 * or the queue is empty. On a successful append the state is marked as
 * buffering; on failure the error is logged and the state is returned as is.
 */
const processChunkQueue = () => {
    setTtsStreamingState((current) => {
        const { sourceBuffer, chunkQueue } = current
        const canAppend = sourceBuffer && !sourceBuffer.updating && chunkQueue.length > 0
        if (!canAppend) {
            return current
        }

        // Takes the chunk out of the queue held in state.
        const nextChunk = chunkQueue.shift()

        try {
            sourceBuffer.appendBuffer(nextChunk)
        } catch (error) {
            console.error('Error appending chunk to buffer:', error)
            return current
        }

        return {
            ...current,
            chunkQueue: [...chunkQueue],
            isBuffering: true
        }
    })
}
|
||||
|
||||
/**
 * Begins a new TTS streaming session.
 *
 * Tears down any previous session (pauses its audio, revokes its object URL,
 * ends its MediaSource if still open), resets the streaming state while
 * recording `format`, and then schedules creation of a fresh MediaSource.
 *
 * @param format       Audio format for this session.
 * @param onAudioReady Optional callback forwarded to `initializeTTSStreaming`.
 */
const handleTTSStart = (format, onAudioReady = null) => {
    setTtsStreamingState((current) => {
        const { audio: previousAudio, mediaSource: previousSource } = current

        // Cleanup any existing streaming first
        if (previousAudio) {
            previousAudio.pause()
            const previousUrl = previousAudio.src
            if (previousUrl) {
                URL.revokeObjectURL(previousUrl)
            }
        }

        const sourceStillOpen = previousSource && previousSource.readyState === 'open'
        if (sourceStillOpen) {
            try {
                previousSource.endOfStream()
            } catch (e) {
                // Ignore errors during cleanup
            }
        }

        const freshState = {
            mediaSource: null,
            sourceBuffer: null,
            audio: null,
            chunkQueue: [],
            isBuffering: false,
            audioFormat: format
        }
        return freshState
    })

    // Deferred to a later task, after the state reset above has been queued.
    setTimeout(() => initializeTTSStreaming(format, onAudioReady), 0)
}
|
||||
|
||||
/**
 * Decodes one base64 audio chunk and adds it to the streaming queue.
 *
 * If a source buffer exists and is idle, processing of the queue is scheduled.
 * Any error while decoding or queueing is logged.
 *
 * @param base64Data Base64-encoded audio bytes.
 */
const handleTTSDataChunk = (base64Data) => {
    try {
        const binary = atob(base64Data)
        const audioBuffer = new Uint8Array(binary.length)
        for (let i = 0; i < binary.length; i += 1) {
            audioBuffer[i] = binary.charCodeAt(i)
        }

        setTtsStreamingState((current) => {
            // Add chunk to queue
            const updated = {
                ...current,
                chunkQueue: [...current.chunkQueue, audioBuffer]
            }

            // Process queue if sourceBuffer is ready
            const bufferIdle = current.sourceBuffer && !current.sourceBuffer.updating
            if (bufferIdle) {
                setTimeout(() => processChunkQueue(), 0)
            }

            return updated
        })
    } catch (error) {
        console.error('Error handling TTS data chunk:', error)
    }
}
|
||||
|
||||
/**
 * Finishes the current TTS stream.
 *
 * If the MediaSource is open and a source buffer exists, every chunk still in
 * the queue is appended one at a time, each append waiting for the previous
 * one's `updateend`, and the MediaSource is ended once the last append has
 * completed. The queue in state is cleared because the remaining chunks are
 * now owned by this drain.
 *
 * Does nothing when there is no open MediaSource or no source buffer.
 */
const handleTTSEnd = () => {
    setTtsStreamingState((prevState) => {
        const { mediaSource, sourceBuffer, chunkQueue } = prevState

        if (!mediaSource || mediaSource.readyState !== 'open' || !sourceBuffer) {
            return prevState
        }

        // Private copy: the state queue is cleared below, so the regular
        // per-chunk processing will not append these chunks a second time.
        const remainingChunks = [...chunkQueue]

        const endStream = () => {
            try {
                if (mediaSource.readyState === 'open') {
                    mediaSource.endOfStream()
                }
            } catch (error) {
                console.error('Error ending TTS stream:', error)
            }
        }

        const drain = () => {
            // Appending or ending while the buffer is updating throws, so wait
            // for the in-flight operation to finish and try again.
            if (sourceBuffer.updating) {
                sourceBuffer.addEventListener('updateend', drain, { once: true })
                return
            }

            if (remainingChunks.length === 0) {
                endStream()
                return
            }

            const chunk = remainingChunks.shift()
            try {
                sourceBuffer.appendBuffer(chunk)
                sourceBuffer.addEventListener('updateend', drain, { once: true })
            } catch (error) {
                console.error('Error appending remaining chunk:', error)
                // Skip the chunk that could not be appended and keep going.
                drain()
            }
        }

        drain()

        if (chunkQueue.length === 0) {
            return prevState
        }
        return {
            ...prevState,
            chunkQueue: []
        }
    })
}
|
||||
|
||||
// Set up sourceBuffer event listeners when it changes
|
||||
useEffect(() => {
|
||||
if (ttsStreamingState.sourceBuffer) {
|
||||
const sourceBuffer = ttsStreamingState.sourceBuffer
|
||||
|
||||
const handleUpdateEnd = () => {
|
||||
setTtsStreamingState((prevState) => ({
|
||||
...prevState,
|
||||
isBuffering: false
|
||||
}))
|
||||
// Process next chunk in queue
|
||||
setTimeout(() => processChunkQueue(), 0)
|
||||
}
|
||||
|
||||
sourceBuffer.addEventListener('updateend', handleUpdateEnd)
|
||||
|
||||
return () => {
|
||||
sourceBuffer.removeEventListener('updateend', handleUpdateEnd)
|
||||
}
|
||||
}
|
||||
}, [ttsStreamingState.sourceBuffer])
|
||||
|
||||
// Cleanup TTS streaming on component unmount
|
||||
useEffect(() => {
|
||||
return () => {
|
||||
cleanupTTSStreaming()
|
||||
}
|
||||
}, [])
|
||||
|
||||
const getInputDisabled = () => {
|
||||
return (
|
||||
loading ||
|
||||
|
|
|
|||
Loading…
Reference in New Issue