/*
 * Flowise/packages/ui/src/ui-component/extended/TextToSpeech.jsx
 *
 * 625 lines
 * 26 KiB
 * JavaScript
 */

import { useDispatch } from 'react-redux'
import { useState, useEffect } from 'react'
import PropTypes from 'prop-types'
import { enqueueSnackbar as enqueueSnackbarAction, closeSnackbar as closeSnackbarAction, SET_CHATFLOW } from '@/store/actions'
// material-ui
import {
Typography,
Box,
Button,
FormControl,
ListItem,
ListItemAvatar,
ListItemText,
MenuItem,
Select,
CircularProgress
} from '@mui/material'
import { IconX, IconVolume } from '@tabler/icons-react'
import { useTheme } from '@mui/material/styles'
// Project import
import CredentialInputHandler from '@/views/canvas/CredentialInputHandler'
import { TooltipWithParser } from '@/ui-component/tooltip/TooltipWithParser'
import { SwitchInput } from '@/ui-component/switch/Switch'
import { Input } from '@/ui-component/input/Input'
import { StyledButton } from '@/ui-component/button/StyledButton'
import { Dropdown } from '@/ui-component/dropdown/Dropdown'
import openAISVG from '@/assets/images/openai.svg'
import elevenLabsSVG from '@/assets/images/elevenlabs.svg'
// store
import useNotifier from '@/utils/useNotifier'
// API
import chatflowsApi from '@/api/chatflows'
import ttsApi from '@/api/tts'
/**
 * Identifiers of the supported text-to-speech providers.
 * Frozen so the shared constant cannot be altered by accident at runtime.
 */
const TextToSpeechType = Object.freeze({
    OPENAI_TTS: 'openai',
    ELEVEN_LABS_TTS: 'elevenlabs'
})
// Registry of the selectable TTS providers. The UI looks an entry up by the
// provider's `name`, so every entry is keyed by its own `name` value; building
// the map from the list below keeps key and name in sync by construction.
const textToSpeechProviders = Object.fromEntries(
    [
        {
            label: 'OpenAI TTS',
            name: TextToSpeechType.OPENAI_TTS,
            icon: openAISVG,
            url: 'https://platform.openai.com/docs/guides/text-to-speech',
            inputs: [
                {
                    label: 'Connect Credential',
                    name: 'credential',
                    type: 'credential',
                    credentialNames: ['openAIApi']
                },
                {
                    label: 'Voice',
                    name: 'voice',
                    type: 'voice_select',
                    description: 'The voice to use when generating the audio',
                    default: 'alloy',
                    optional: true
                }
            ]
        },
        {
            label: 'Eleven Labs TTS',
            name: TextToSpeechType.ELEVEN_LABS_TTS,
            icon: elevenLabsSVG,
            url: 'https://elevenlabs.io/',
            inputs: [
                {
                    label: 'Connect Credential',
                    name: 'credential',
                    type: 'credential',
                    credentialNames: ['elevenLabsApi']
                },
                {
                    label: 'Voice',
                    name: 'voice',
                    type: 'voice_select',
                    description: 'The voice to use for text-to-speech',
                    default: '21m00Tcm4TlvDq8ikWAM',
                    optional: true
                }
            ]
        }
    ].map((providerEntry) => [providerEntry.name, providerEntry])
)
/**
 * Settings panel for a chatflow's text-to-speech configuration.
 *
 * Reads `dialogProps.chatflow.textToSpeech` (a JSON string) as the initial
 * configuration and saves changes back through `chatflowsApi.updateChatflow`
 * using `dialogProps.chatflow.id`.
 */
const TextToSpeech = ({ dialogProps }) => {
const dispatch = useDispatch()
useNotifier()
const theme = useTheme()
// Thin wrappers that dispatch the snackbar actions to the redux store.
const enqueueSnackbar = (...args) => dispatch(enqueueSnackbarAction(...args))
const closeSnackbar = (...args) => dispatch(closeSnackbarAction(...args))
// Parsed configuration object keyed by provider name; null until loaded or edited.
const [textToSpeech, setTextToSpeech] = useState(null)
// Name of the provider chosen in the dropdown, or 'none'.
const [selectedProvider, setSelectedProvider] = useState('none')
// Voices returned by ttsApi.listVoices for the current provider/credential.
const [voices, setVoices] = useState([])
// True while a voice list request is in flight.
const [loadingVoices, setLoadingVoices] = useState(false)
const onSave = async () => {
const textToSpeechConfig = setValue(true, selectedProvider, 'status')
try {
const saveResp = await chatflowsApi.updateChatflow(dialogProps.chatflow.id, {
textToSpeech: JSON.stringify(textToSpeechConfig)
})
if (saveResp.data) {
enqueueSnackbar({
message: 'Text To Speech Configuration Saved',
options: {
key: Date.now() + Math.random(),
variant: 'success',
action: (key) => (
<Button style={{ color: 'white' }} onClick={() => closeSnackbar(key)}>
<IconX />
</Button>
)
}
})
dispatch({ type: SET_CHATFLOW, chatflow: saveResp.data })
}
} catch (error) {
enqueueSnackbar({
message: `Failed to save Text To Speech Configuration: ${
typeof error.response.data === 'object' ? error.response.data.message : error.response.data
}`,
options: {
key: Date.now() + Math.random(),
variant: 'error',
persist: true,
action: (key) => (
<Button style={{ color: 'white' }} onClick={() => closeSnackbar(key)}>
<IconX />
</Button>
)
}
})
}
}
/**
 * Sets one field of one provider's configuration and stores the result in state.
 *
 * The previous state object and its nested provider objects are never mutated:
 * every touched provider entry is copied first, so React sees new references.
 *
 * @param {*} value - New value for the field.
 * @param {string} providerName - Provider key ('none' is allowed).
 * @param {string} inputParamName - Field name inside the provider's config.
 * @returns {object} The new configuration object that was passed to state.
 */
const setValue = (value, providerName, inputParamName) => {
    const previous = textToSpeech || {}
    const previousProviderConfig = Object.hasOwn(previous, providerName) ? previous[providerName] : undefined
    const newVal = {
        ...previous,
        [providerName]: { ...(previousProviderConfig || {}), [inputParamName]: value }
    }
    if (inputParamName === 'status' && value === true) {
        // Only one provider may be active: switch every other one off.
        Object.keys(textToSpeechProviders).forEach((key) => {
            const provider = textToSpeechProviders[key]
            if (provider.name !== providerName) {
                newVal[provider.name] = { ...(previous[provider.name] || {}), status: false }
            }
        })
        if (providerName !== 'none' && newVal['none']) {
            // Copy instead of assigning into the object still held by state.
            newVal['none'] = { ...newVal['none'], status: false }
        }
    }
    setTextToSpeech(newVal)
    return newVal
}
/**
 * Switches the selected provider, clears the voice list and, when the provider
 * already has a credential configured, requests its voices.
 *
 * @param {string} provider - Provider key, or 'none'.
 * @param {object|null} [configOverride=null] - Full configuration object keyed
 *   by provider name, used instead of the `textToSpeech` state (the load effect
 *   passes the freshly parsed config because state has not updated yet).
 */
const handleProviderChange = (provider, configOverride = null) => {
    setSelectedProvider(provider)
    setVoices([])
    if (provider === 'none') return
    // Both sources are keyed by provider name, so the credential has to be
    // read from the provider's own entry rather than from the top level.
    const config = configOverride || textToSpeech
    const credentialId = config?.[provider]?.credentialId
    if (credentialId) {
        loadVoicesForProvider(provider, credentialId)
    }
}
/**
 * Fetches the voice list for a provider/credential pair via `ttsApi.listVoices`
 * and stores it in state. Does nothing for 'none' or a missing credential.
 * Any failure is logged and leaves the voice list empty; the loading flag is
 * always cleared afterwards.
 *
 * @param {string} provider - Provider key.
 * @param {string} credentialId - Credential used to query the provider.
 */
const loadVoicesForProvider = async (provider, credentialId) => {
    const hasTarget = provider !== 'none' && Boolean(credentialId)
    if (!hasTarget) return

    setLoadingVoices(true)
    try {
        const query = new URLSearchParams({ provider, credentialId })
        const { data } = await ttsApi.listVoices(query)
        const voiceList = data ? await data : []
        setVoices(voiceList)
    } catch (error) {
        console.error('Error loading voices:', error)
        setVoices([])
    } finally {
        setLoadingVoices(false)
    }
}
/**
 * Plays a fixed sample sentence with the selected provider, credential and
 * voice so the user can hear the current configuration.
 *
 * The audio is requested from `/api/v1/text-to-speech/generate`, which answers
 * with a server-sent-event stream. `tts_data` events carry base64 audio chunks
 * that are fed into a MediaSource buffer; `tts_end` marks the end of the audio.
 * Progress and failures are reported through snackbars.
 */
const testTTS = async () => {
    if (selectedProvider === 'none' || !textToSpeech?.[selectedProvider]?.credentialId) {
        enqueueSnackbar({
            message: 'Please select a provider and configure credentials first',
            options: { variant: 'warning' }
        })
        return
    }
    // Declared outside the try block so the catch handler can release the URL.
    let audioUrl = null
    try {
        const providerConfig = textToSpeech?.[selectedProvider] || {}
        const body = {
            text: 'Today is a wonderful day to build something with Flowise!',
            provider: selectedProvider,
            credentialId: providerConfig.credentialId,
            voice: providerConfig.voice,
            model: providerConfig.model
        }
        // Use streaming approach like in ChatMessage.jsx
        const mediaSource = new MediaSource()
        const audio = new Audio()
        audioUrl = URL.createObjectURL(mediaSource)
        audio.src = audioUrl
        const streamingState = {
            mediaSource,
            sourceBuffer: null,
            audio,
            chunkQueue: [],
            isBuffering: false,
            abortController: new AbortController(),
            streamEnded: false
        }

        // Appends the next queued chunk if the source buffer exists and is idle.
        // Returns true when an append was attempted, false when there was nothing to do.
        const appendNextChunk = (failureLabel) => {
            const { sourceBuffer, chunkQueue } = streamingState
            if (!sourceBuffer || sourceBuffer.updating || chunkQueue.length === 0) return false
            const nextChunk = chunkQueue.shift()
            try {
                sourceBuffer.appendBuffer(nextChunk)
                streamingState.isBuffering = true
            } catch (error) {
                console.error(failureLabel, error)
            }
            return true
        }

        // Closes the MediaSource once the stream has ended and every chunk was appended.
        const endStreamIfDrained = () => {
            const { sourceBuffer, chunkQueue } = streamingState
            const canEnd =
                streamingState.streamEnded &&
                sourceBuffer &&
                !sourceBuffer.updating &&
                chunkQueue.length === 0 &&
                mediaSource.readyState === 'open'
            if (!canEnd) return
            try {
                mediaSource.endOfStream()
            } catch (error) {
                console.error('Error ending MediaSource stream:', error)
            }
        }

        mediaSource.addEventListener('sourceopen', () => {
            try {
                const mimeType = 'audio/mpeg'
                streamingState.sourceBuffer = mediaSource.addSourceBuffer(mimeType)
                streamingState.sourceBuffer.addEventListener('updateend', () => {
                    streamingState.isBuffering = false
                    if (!appendNextChunk('Error appending chunk:')) {
                        // All chunks processed; end the stream if the server is done too
                        endStreamIfDrained()
                    }
                })
                // Chunks can arrive before the source buffer exists. 'updateend' only
                // fires after an append, so start draining the queue here.
                if (!appendNextChunk('Error appending initial chunk:')) {
                    endStreamIfDrained()
                }
                audio.play().catch((playError) => {
                    console.error('Error starting audio playback:', playError)
                })
            } catch (error) {
                console.error('Error setting up source buffer:', error)
            }
        })
        audio.addEventListener('playing', () => {
            enqueueSnackbar({
                message: 'Test audio playing...',
                options: { variant: 'info' }
            })
        })
        audio.addEventListener('ended', () => {
            enqueueSnackbar({
                message: 'Test audio completed successfully',
                options: { variant: 'success' }
            })
            // Cleanup
            if (streamingState.audio.src) {
                URL.revokeObjectURL(streamingState.audio.src)
            }
        })
        const response = await fetch('/api/v1/text-to-speech/generate', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'x-request-from': 'internal'
            },
            credentials: 'include',
            body: JSON.stringify(body),
            signal: streamingState.abortController.signal
        })
        if (!response.ok) {
            throw new Error(`HTTP error! status: ${response.status}`)
        }
        const reader = response.body.getReader()
        // A single decoder for the whole stream: `stream: true` only carries a
        // partial multi-byte sequence over to the next read on the same instance.
        const decoder = new TextDecoder()
        let buffer = ''
        let done = false
        while (!done) {
            if (streamingState.abortController.signal.aborted) {
                reader.cancel()
                break
            }
            const result = await reader.read()
            done = result.done
            if (done) break
            buffer += decoder.decode(result.value, { stream: true })
            // SSE events are separated by a blank line; keep the trailing partial event.
            const lines = buffer.split('\n\n')
            buffer = lines.pop() || ''
            for (const eventBlock of lines) {
                if (!eventBlock.trim()) continue
                const event = parseSSEEvent(eventBlock)
                if (!event) continue
                switch (event.event) {
                    case 'tts_data':
                        if (event.data?.audioChunk) {
                            const audioBuffer = Uint8Array.from(atob(event.data.audioChunk), (c) => c.charCodeAt(0))
                            streamingState.chunkQueue.push(audioBuffer)
                            appendNextChunk('Error appending initial chunk:')
                        }
                        break
                    case 'tts_end':
                        streamingState.streamEnded = true
                        // End immediately when nothing is queued or updating; otherwise
                        // the 'updateend' handler ends the stream after the last append.
                        endStreamIfDrained()
                        break
                }
            }
        }
        // The response closed without a 'tts_end' event: still finish the audio
        // stream so playback can reach 'ended'.
        if (!streamingState.abortController.signal.aborted && !streamingState.streamEnded) {
            streamingState.streamEnded = true
            endStreamIfDrained()
        }
    } catch (error) {
        // Release the object URL; the 'ended' cleanup will not run for a failed test.
        if (audioUrl) {
            URL.revokeObjectURL(audioUrl)
        }
        console.error('Error testing TTS:', error)
        enqueueSnackbar({
            message: `TTS test failed: ${error.message}`,
            options: { variant: 'error' }
        })
    }
}
/**
 * Parses one server-sent-event block into `{ event, data }`.
 *
 * `event:` lines set the event name; `data:` lines are parsed as JSON and their
 * `data` property (when truthy) becomes the payload. Later lines override
 * earlier ones. Invalid JSON is logged and skipped.
 *
 * @param {string} eventBlock - Raw text of a single SSE event.
 * @returns {{event: string, data: *}|null} The parsed event, or null when the
 *   block carries no event name.
 */
const parseSSEEvent = (eventBlock) => {
    const EVENT_PREFIX = 'event:'
    const DATA_PREFIX = 'data:'
    let eventName = null
    let payload = null

    eventBlock
        .trim()
        .split('\n')
        .forEach((rawLine) => {
            if (rawLine.startsWith(EVENT_PREFIX)) {
                eventName = rawLine.slice(EVENT_PREFIX.length).trim()
                return
            }
            if (!rawLine.startsWith(DATA_PREFIX)) return
            try {
                const { data } = JSON.parse(rawLine.slice(DATA_PREFIX.length).trim())
                if (data) {
                    payload = data
                }
            } catch (e) {
                console.error('Error parsing SSE data:', e)
            }
        })

    return eventName ? { event: eventName, data: payload } : null
}
// Load the stored configuration whenever the dialog props change, and reset the
// local state again when the effect is cleaned up.
useEffect(() => {
    const storedConfig = dialogProps.chatflow?.textToSpeech
    if (storedConfig) {
        try {
            const parsedConfig = JSON.parse(storedConfig)
            // The last registered provider whose stored status is truthy wins.
            let activeProvider = 'none'
            for (const providerKey of Object.keys(textToSpeechProviders)) {
                if (parsedConfig[providerKey]?.status) {
                    activeProvider = providerKey
                }
            }
            setSelectedProvider(activeProvider)
            setTextToSpeech(parsedConfig)
            // State is not updated yet at this point, so hand the parsed config over directly.
            handleProviderChange(activeProvider, parsedConfig)
        } catch {
            // Unparseable or unexpected stored value: fall back to an empty configuration.
            setTextToSpeech(null)
            setSelectedProvider('none')
        }
    }

    const resetState = () => {
        setTextToSpeech(null)
        setSelectedProvider('none')
        setVoices([])
    }
    return resetState
    // eslint-disable-next-line react-hooks/exhaustive-deps
}, [dialogProps])
return (
<>
<Box fullWidth sx={{ mb: 1, display: 'flex', flexDirection: 'column', gap: 1 }}>
<Typography>Providers</Typography>
<FormControl fullWidth>
<Select
size='small'
value={selectedProvider}
onChange={(event) => handleProviderChange(event.target.value)}
sx={{
'& .MuiSvgIcon-root': {
color: theme?.customization?.isDarkMode ? '#fff' : 'inherit'
}
}}
>
<MenuItem value='none'>None</MenuItem>
{Object.values(textToSpeechProviders).map((provider) => (
<MenuItem key={provider.name} value={provider.name}>
{provider.label}
</MenuItem>
))}
</Select>
</FormControl>
</Box>
{selectedProvider !== 'none' && (
<>
<ListItem sx={{ mt: 3 }} alignItems='center'>
<ListItemAvatar>
<div
style={{
width: 50,
height: 50,
borderRadius: '50%',
backgroundColor: 'white',
flexShrink: 0,
display: 'flex',
alignItems: 'center',
justifyContent: 'center'
}}
>
<img
style={{
width: '100%',
height: '100%',
padding: 10,
objectFit: 'contain'
}}
alt='TTS Provider'
src={textToSpeechProviders[selectedProvider].icon}
/>
</div>
</ListItemAvatar>
<ListItemText
sx={{ ml: 1 }}
primary={textToSpeechProviders[selectedProvider].label}
secondary={
<a target='_blank' rel='noreferrer' href={textToSpeechProviders[selectedProvider].url}>
{textToSpeechProviders[selectedProvider].url}
</a>
}
/>
</ListItem>
{textToSpeechProviders[selectedProvider].inputs.map((inputParam) => (
<Box key={`${selectedProvider}-${inputParam.name}`} sx={{ p: 2 }}>
<div style={{ display: 'flex', flexDirection: 'row' }}>
<Typography>
{inputParam.label}
{!inputParam.optional && <span style={{ color: 'red' }}>&nbsp;*</span>}
{inputParam.description && (
<TooltipWithParser style={{ marginLeft: 10 }} title={inputParam.description} />
)}
</Typography>
</div>
{inputParam.type === 'credential' && (
<CredentialInputHandler
key={textToSpeech?.[selectedProvider]?.credentialId}
data={
textToSpeech?.[selectedProvider]?.credentialId
? { credential: textToSpeech?.[selectedProvider]?.credentialId }
: {}
}
inputParam={inputParam}
onSelect={(newValue) => {
setValue(newValue, selectedProvider, 'credentialId')
// Load voices when credential is updated
if (newValue && selectedProvider !== 'none') {
setTimeout(() => loadVoicesForProvider(selectedProvider, newValue), 100)
}
}}
/>
)}
{inputParam.type === 'boolean' && (
<SwitchInput
onChange={(newValue) => setValue(newValue, selectedProvider, inputParam.name)}
value={
textToSpeech?.[selectedProvider]
? textToSpeech[selectedProvider][inputParam.name]
: inputParam.default ?? false
}
/>
)}
{(inputParam.type === 'string' || inputParam.type === 'password' || inputParam.type === 'number') && (
<Input
inputParam={inputParam}
onChange={(newValue) => setValue(newValue, selectedProvider, inputParam.name)}
value={
textToSpeech?.[selectedProvider]
? textToSpeech[selectedProvider][inputParam.name]
: inputParam.default ?? ''
}
/>
)}
{inputParam.type === 'options' && (
<Dropdown
name={inputParam.name}
options={inputParam.options}
onSelect={(newValue) => setValue(newValue, selectedProvider, inputParam.name)}
value={
textToSpeech?.[selectedProvider]
? textToSpeech[selectedProvider][inputParam.name]
: inputParam.default ?? 'choose an option'
}
/>
)}
{inputParam.type === 'voice_select' && (
<Box>
{loadingVoices ? (
<Box sx={{ display: 'flex', alignItems: 'center', gap: 1 }}>
<CircularProgress size={16} />
<Typography variant='body2'>Loading voices...</Typography>
</Box>
) : (
<Dropdown
name={inputParam.name}
options={voices.map((voice) => ({ label: voice.name, name: voice.id }))}
onSelect={(newValue) => setValue(newValue, selectedProvider, inputParam.name)}
value={
textToSpeech?.[selectedProvider]
? textToSpeech[selectedProvider][inputParam.name]
: inputParam.default ?? 'choose a voice'
}
/>
)}
</Box>
)}
</Box>
))}
{/* Auto-play Toggle */}
<Box sx={{ p: 2 }}>
<div style={{ display: 'flex', flexDirection: 'row', alignItems: 'center' }}>
<Typography>
Automatically play audio
<TooltipWithParser
style={{ marginLeft: 10 }}
title='When enabled, bot responses will be automatically converted to speech and played'
/>
</Typography>
</div>
<SwitchInput
onChange={(newValue) => setValue(newValue, selectedProvider, 'autoPlay')}
value={textToSpeech?.[selectedProvider] ? textToSpeech[selectedProvider].autoPlay ?? false : false}
/>
</Box>
{/* Test TTS Button */}
<Box sx={{ p: 2 }}>
<StyledButton
variant='outlined'
size='small'
startIcon={<IconVolume />}
onClick={testTTS}
disabled={!textToSpeech?.[selectedProvider]?.credentialId}
>
Test Voice
</StyledButton>
</Box>
</>
)}
<StyledButton
style={{ marginBottom: 10, marginTop: 10 }}
disabled={selectedProvider !== 'none' && !textToSpeech?.[selectedProvider]?.credentialId}
variant='contained'
onClick={onSave}
>
Save
</StyledButton>
</>
)
}
// Runtime prop validation: `dialogProps` is the object whose `chatflow` the component reads.
TextToSpeech.propTypes = {
dialogProps: PropTypes.object
}
export default TextToSpeech