Refactor Text-to-Speech Provider Selection and Enhance UI Components
- Updated the text-to-speech controller to select the active provider based on status instead of the first available provider
- Added audio waveform controls and test audio functionality in the TextToSpeech component, allowing users to play and pause test audio
- Integrated Autocomplete for voice selection in the TextToSpeech component
- Implemented TTS action management in ChatMessage to prevent auto-scrolling during TTS actions
parent 5ea714098d
commit 123ab3c85e
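The controller change in the first hunk below is the core behavioral fix: instead of blindly taking the first provider key in the chatflow's textToSpeech JSON, the controller now picks the entry whose status flag is true. A minimal standalone sketch of that selection logic (the config values here are illustrative, not taken from a real chatflow):

    // Hypothetical textToSpeech config: several providers can be stored,
    // but only the one flagged status: true should be used for generation.
    const ttsConfig = {
        openai: { status: false, credentialId: 'cred-openai', voice: 'alloy', model: 'tts-1' },
        elevenlabs: { status: true, credentialId: 'cred-eleven', voice: 'Rachel', model: 'eleven_turbo_v2' }
    }

    // Before: Object.keys(ttsConfig)[0] always returned 'openai', even when disabled.
    // After: select the provider explicitly marked active, and fail loudly if none is.
    const activeProviderKey = Object.keys(ttsConfig).find((key) => ttsConfig[key].status === true)
    if (!activeProviderKey) {
        throw new Error('no active TTS provider configured in chatflow!')
    }
    const { credentialId, voice, model } = ttsConfig[activeProviderKey]
    console.log(activeProviderKey, voice) // 'elevenlabs' 'Rachel'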
@@ -34,17 +34,17 @@ const generateTextToSpeech = async (req: Request, res: Response) => {
         const chatflow = await chatflowsService.getChatflowById(chatflowId)
         const ttsConfig = JSON.parse(chatflow.textToSpeech)

-        // Extract the first provider config (assuming single provider per chatflow)
-        const providerKey = Object.keys(ttsConfig)[0]
-        if (!providerKey) {
+        // Find the provider with status: true
+        const activeProviderKey = Object.keys(ttsConfig).find((key) => ttsConfig[key].status === true)
+        if (!activeProviderKey) {
             throw new InternalFlowiseError(
                 StatusCodes.BAD_REQUEST,
-                `Error: textToSpeechController.generateTextToSpeech - no TTS provider configured in chatflow!`
+                `Error: textToSpeechController.generateTextToSpeech - no active TTS provider configured in chatflow!`
             )
         }

-        const providerConfig = ttsConfig[providerKey]
-        provider = providerKey
+        const providerConfig = ttsConfig[activeProviderKey]
+        provider = activeProviderKey
         credentialId = providerConfig.credentialId
         voice = providerConfig.voice
         model = providerConfig.model
@@ -0,0 +1,311 @@
import { useRef, useEffect, useState, useCallback } from 'react'
import PropTypes from 'prop-types'
import { Box, IconButton, CircularProgress } from '@mui/material'
import { IconPlayerPlay, IconPlayerPause } from '@tabler/icons-react'
import { useTheme } from '@mui/material/styles'

const AudioWaveform = ({
    audioSrc,
    onPlay,
    onPause,
    onEnded,
    isPlaying = false,
    duration: _duration = 0,
    isGenerating = false,
    disabled = false,
    externalAudioRef = null,
    resetProgress = false
}) => {
    const canvasRef = useRef(null)
    const audioRef = useRef(null)
    const animationRef = useRef(null)
    const theme = useTheme()

    const [progress, setProgress] = useState(0)
    const [_audioBuffer, setAudioBuffer] = useState(null)
    const [waveformData, setWaveformData] = useState([])

    // Generate waveform visualization data
    const generateWaveform = useCallback((buffer) => {
        if (!buffer) return []

        const rawData = buffer.getChannelData(0)
        const samples = 200 // More bars for smoother appearance like reference
        const blockSize = Math.floor(rawData.length / samples)
        const filteredData = []

        for (let i = 0; i < samples; i++) {
            let blockStart = blockSize * i
            let sum = 0
            for (let j = 0; j < blockSize; j++) {
                sum += Math.abs(rawData[blockStart + j])
            }
            filteredData.push(sum / blockSize)
        }

        // Normalize the data
        const maxValue = Math.max(...filteredData)
        return filteredData.map((value) => (value / maxValue) * 100)
    }, [])

    // Generate realistic placeholder waveform like in reference
    const generatePlaceholderWaveform = useCallback(() => {
        const samples = 200
        const waveform = []

        for (let i = 0; i < samples; i++) {
            // Create a more realistic waveform pattern
            const position = i / samples
            const baseHeight = 20 + Math.sin(position * Math.PI * 4) * 15
            const variation = Math.random() * 40 + 10
            const envelope = Math.sin(position * Math.PI) * 0.8 + 0.2

            waveform.push((baseHeight + variation) * envelope)
        }

        return waveform
    }, [])

    // Draw waveform on canvas
    const drawWaveform = useCallback(() => {
        const canvas = canvasRef.current
        if (!canvas || waveformData.length === 0) return

        const ctx = canvas.getContext('2d')

        // Handle high DPI displays for crisp rendering
        const dpr = window.devicePixelRatio || 1
        const rect = canvas.getBoundingClientRect()

        canvas.width = rect.width * dpr
        canvas.height = rect.height * dpr
        ctx.scale(dpr, dpr)

        canvas.style.width = rect.width + 'px'
        canvas.style.height = rect.height + 'px'

        ctx.clearRect(0, 0, rect.width, rect.height)

        // More bars for smoother appearance like the reference
        const totalBars = waveformData.length
        const barWidth = 2 // Fixed thin bar width like in reference
        const barSpacing = 1 // Small gap between bars
        const totalWidth = rect.width
        const startX = (totalWidth - totalBars * (barWidth + barSpacing)) / 2
        const centerY = rect.height / 2

        waveformData.forEach((value, index) => {
            const barHeight = Math.max(2, (value / 100) * (rect.height * 0.8))
            const x = startX + index * (barWidth + barSpacing)

            // Determine color based on playback progress
            const progressIndex = Math.floor((progress / 100) * waveformData.length)
            const isPlayed = index <= progressIndex

            ctx.fillStyle = isPlayed ? theme.palette.primary.main : theme.palette.mode === 'dark' ? '#444' : '#ccc'

            // Draw thin vertical bars like in reference
            ctx.fillRect(x, centerY - barHeight / 2, barWidth, barHeight)
        })
    }, [waveformData, progress, theme])

    // Load and decode audio for waveform generation
    useEffect(() => {
        if (audioSrc && audioSrc.startsWith('blob:')) {
            const loadAudioBuffer = async () => {
                try {
                    const response = await fetch(audioSrc)
                    const arrayBuffer = await response.arrayBuffer()
                    const audioContext = new (window.AudioContext || window.webkitAudioContext)()
                    const buffer = await audioContext.decodeAudioData(arrayBuffer)
                    setAudioBuffer(buffer)
                    const waveform = generateWaveform(buffer)
                    setWaveformData(waveform)
                } catch (error) {
                    console.error('Error loading audio buffer:', error)
                    // Generate placeholder waveform
                    const placeholder = generatePlaceholderWaveform()
                    setWaveformData(placeholder)
                }
            }
            loadAudioBuffer()
        } else {
            // Always show placeholder waveform when no audio source
            const placeholder = generatePlaceholderWaveform()
            setWaveformData(placeholder)
        }
    }, [audioSrc, generateWaveform, generatePlaceholderWaveform])

    // Reset progress when resetProgress prop is true
    useEffect(() => {
        if (resetProgress) {
            setProgress(0)
        }
    }, [resetProgress])

    // Draw waveform when data changes or progress updates
    useEffect(() => {
        drawWaveform()
    }, [drawWaveform, progress])

    // Update progress during playback
    useEffect(() => {
        const activeAudioRef = externalAudioRef || audioRef.current
        if (isPlaying && activeAudioRef && audioSrc) {
            const updateProgress = () => {
                const audio = externalAudioRef || audioRef.current
                if (audio && audio.duration && !isNaN(audio.duration)) {
                    const currentProgress = (audio.currentTime / audio.duration) * 100
                    setProgress(currentProgress)
                }
                if (isPlaying && audio && !audio.paused) {
                    animationRef.current = requestAnimationFrame(updateProgress)
                }
            }

            // Start the update loop
            animationRef.current = requestAnimationFrame(updateProgress)
        } else {
            if (animationRef.current) {
                cancelAnimationFrame(animationRef.current)
            }
        }

        return () => {
            if (animationRef.current) {
                cancelAnimationFrame(animationRef.current)
            }
        }
    }, [isPlaying, audioSrc, externalAudioRef])

    const handlePlayPause = () => {
        if (isPlaying) {
            onPause?.()
        } else {
            onPlay?.()
        }
    }

    // Handle canvas click for seeking
    const handleCanvasClick = (event) => {
        const activeAudio = externalAudioRef || audioRef.current
        if (!activeAudio || !activeAudio.duration || disabled || isGenerating) return

        const canvas = canvasRef.current
        const rect = canvas.getBoundingClientRect()
        const clickX = event.clientX - rect.left

        // Use the actual canvas display width for more accurate clicking
        const clickProgress = Math.max(0, Math.min(100, (clickX / rect.width) * 100))
        const seekTime = (clickProgress / 100) * activeAudio.duration

        activeAudio.currentTime = seekTime
        setProgress(clickProgress)
    }

    return (
        <Box sx={{ width: '100%' }}>
            {/* Hidden audio element for duration and seeking - only if no external ref */}
            {audioSrc && !externalAudioRef && (
                <audio
                    ref={audioRef}
                    src={audioSrc}
                    onLoadedMetadata={() => {
                        if (audioRef.current) {
                            setProgress(0)
                        }
                    }}
                    onTimeUpdate={() => {
                        // Additional progress update on timeupdate event
                        const audio = audioRef.current
                        if (audio && audio.duration && !isNaN(audio.duration)) {
                            const currentProgress = (audio.currentTime / audio.duration) * 100
                            setProgress(currentProgress)
                        }
                    }}
                    onEnded={() => {
                        setProgress(0)
                        onEnded?.()
                    }}
                    style={{ display: 'none' }}
                >
                    <track kind='captions' />
                </audio>
            )}

            {/* Play button and Waveform side by side */}
            <Box sx={{ display: 'flex', alignItems: 'center', gap: 1 }}>
                {/* Play/Pause Button */}
                <IconButton
                    onClick={handlePlayPause}
                    disabled={disabled || isGenerating}
                    size='small'
                    sx={{
                        width: 32,
                        height: 32,
                        flexShrink: 0,
                        backgroundColor: isPlaying ? 'transparent' : theme.palette.primary.main,
                        color: isPlaying ? theme.palette.primary.main : 'white',
                        border: isPlaying ? `1px solid ${theme.palette.primary.main}` : 'none',
                        '&:hover': {
                            backgroundColor: isPlaying ? theme.palette.primary.main : theme.palette.primary.dark,
                            color: 'white'
                        },
                        '&:disabled': {
                            backgroundColor: theme.palette.action.disabled,
                            color: theme.palette.action.disabled,
                            border: 'none'
                        }
                    }}
                >
                    {isGenerating ? (
                        <CircularProgress size={16} />
                    ) : isPlaying ? (
                        <IconPlayerPause size={16} />
                    ) : (
                        <IconPlayerPlay size={16} />
                    )}
                </IconButton>

                {/* Waveform Canvas */}
                <Box
                    sx={{
                        flex: 1,
                        cursor: !disabled && !isGenerating && audioSrc ? 'pointer' : 'default',
                        display: 'flex',
                        alignItems: 'center'
                    }}
                >
                    <canvas
                        ref={canvasRef}
                        width={400}
                        height={32}
                        onClick={handleCanvasClick}
                        style={{
                            width: '100%',
                            height: '32px',
                            backgroundColor: 'transparent',
                            opacity: disabled ? 0.6 : 1,
                            display: 'block'
                        }}
                    />
                </Box>
            </Box>
        </Box>
    )
}

AudioWaveform.propTypes = {
    audioSrc: PropTypes.string,
    onPlay: PropTypes.func,
    onPause: PropTypes.func,
    onEnded: PropTypes.func,
    isPlaying: PropTypes.bool,
    duration: PropTypes.number,
    isGenerating: PropTypes.bool,
    disabled: PropTypes.bool,
    externalAudioRef: PropTypes.object,
    resetProgress: PropTypes.bool
}

export default AudioWaveform
@@ -402,7 +402,15 @@ const SpeechToText = ({ dialogProps }) => {
                                 sx={{ ml: 1 }}
                                 primary={speechToTextProviders[selectedProvider].label}
                                 secondary={
-                                    <a target='_blank' rel='noreferrer' href={speechToTextProviders[selectedProvider].url}>
+                                    <a
+                                        target='_blank'
+                                        rel='noreferrer'
+                                        href={speechToTextProviders[selectedProvider].url}
+                                        style={{
+                                            color: theme?.customization?.isDarkMode ? '#90caf9' : '#1976d2',
+                                            textDecoration: 'underline'
+                                        }}
+                                    >
                                         {speechToTextProviders[selectedProvider].url}
                                     </a>
                                 }
@@ -14,7 +14,9 @@ import {
     ListItemText,
     MenuItem,
     Select,
-    CircularProgress
+    CircularProgress,
+    Autocomplete,
+    TextField
 } from '@mui/material'
 import { IconX, IconVolume } from '@tabler/icons-react'
 import { useTheme } from '@mui/material/styles'
@@ -26,6 +28,7 @@ import { SwitchInput } from '@/ui-component/switch/Switch'
 import { Input } from '@/ui-component/input/Input'
 import { StyledButton } from '@/ui-component/button/StyledButton'
 import { Dropdown } from '@/ui-component/dropdown/Dropdown'
+import AudioWaveform from '@/ui-component/extended/AudioWaveform'
 import openAISVG from '@/assets/images/openai.svg'
 import elevenLabsSVG from '@/assets/images/elevenlabs.svg'

@@ -102,6 +105,21 @@ const TextToSpeech = ({ dialogProps }) => {
     const [selectedProvider, setSelectedProvider] = useState('none')
     const [voices, setVoices] = useState([])
     const [loadingVoices, setLoadingVoices] = useState(false)
+    const [testAudioSrc, setTestAudioSrc] = useState(null)
+    const [isTestPlaying, setIsTestPlaying] = useState(false)
+    const [testAudioRef, setTestAudioRef] = useState(null)
+    const [isGeneratingTest, setIsGeneratingTest] = useState(false)
+    const [resetWaveform, setResetWaveform] = useState(false)
+
+    const resetTestAudio = () => {
+        if (testAudioSrc) {
+            URL.revokeObjectURL(testAudioSrc)
+            setTestAudioSrc(null)
+        }
+        setIsTestPlaying(false)
+        setResetWaveform(true)
+        setTimeout(() => setResetWaveform(false), 100)
+    }

     const onSave = async () => {
         const textToSpeechConfig = setValue(true, selectedProvider, 'status')
@@ -164,16 +182,24 @@ const TextToSpeech = ({ dialogProps }) => {
                 newVal['none'].status = false
             }
         }

+        // Reset test audio when voice or credential is changed
+        if ((inputParamName === 'voice' || inputParamName === 'credentialId') && providerName === selectedProvider) {
+            resetTestAudio()
+        }
+
         setTextToSpeech(newVal)
         return newVal
     }

     const handleProviderChange = (provider, configOverride = null) => {
-        setSelectedProvider(() => provider)
+        setSelectedProvider(provider)
+        setVoices([])
+        resetTestAudio()
+
         if (provider !== 'none') {
             const config = configOverride || textToSpeech
-            const credentialId = config[provider]?.credentialId
+            const credentialId = config?.[provider]?.credentialId
             if (credentialId) {
                 loadVoicesForProvider(provider, credentialId)
             }
@@ -213,6 +239,8 @@ const TextToSpeech = ({ dialogProps }) => {
             return
         }

+        setIsGeneratingTest(true)
+
         try {
             const providerConfig = textToSpeech?.[selectedProvider] || {}
             const body = {
@@ -223,74 +251,6 @@ const TextToSpeech = ({ dialogProps }) => {
                 model: providerConfig.model
             }

-            // Use streaming approach like in ChatMessage.jsx
-            const mediaSource = new MediaSource()
-            const audio = new Audio()
-            audio.src = URL.createObjectURL(mediaSource)
-
-            const streamingState = {
-                mediaSource,
-                sourceBuffer: null,
-                audio,
-                chunkQueue: [],
-                isBuffering: false,
-                abortController: new AbortController(),
-                streamEnded: false
-            }
-
-            mediaSource.addEventListener('sourceopen', () => {
-                try {
-                    const mimeType = 'audio/mpeg'
-                    streamingState.sourceBuffer = mediaSource.addSourceBuffer(mimeType)
-
-                    streamingState.sourceBuffer.addEventListener('updateend', () => {
-                        streamingState.isBuffering = false
-                        if (streamingState.chunkQueue.length > 0 && !streamingState.sourceBuffer.updating) {
-                            const chunk = streamingState.chunkQueue.shift()
-                            try {
-                                streamingState.sourceBuffer.appendBuffer(chunk)
-                                streamingState.isBuffering = true
-                            } catch (error) {
-                                console.error('Error appending chunk:', error)
-                            }
-                        } else if (streamingState.streamEnded && streamingState.chunkQueue.length === 0) {
-                            // All chunks processed and stream ended, now we can safely end the stream
-                            try {
-                                if (streamingState.mediaSource.readyState === 'open') {
-                                    streamingState.mediaSource.endOfStream()
-                                }
-                            } catch (error) {
-                                console.error('Error ending MediaSource stream:', error)
-                            }
-                        }
-                    })
-
-                    audio.play().catch((playError) => {
-                        console.error('Error starting audio playback:', playError)
-                    })
-                } catch (error) {
-                    console.error('Error setting up source buffer:', error)
-                }
-            })
-
-            audio.addEventListener('playing', () => {
-                enqueueSnackbar({
-                    message: 'Test audio playing...',
-                    options: { variant: 'info' }
-                })
-            })
-
-            audio.addEventListener('ended', () => {
-                enqueueSnackbar({
-                    message: 'Test audio completed successfully',
-                    options: { variant: 'success' }
-                })
-                // Cleanup
-                if (streamingState.audio.src) {
-                    URL.revokeObjectURL(streamingState.audio.src)
-                }
-            })
-
             const response = await fetch('/api/v1/text-to-speech/generate', {
                 method: 'POST',
                 headers: {
@@ -298,24 +258,19 @@ const TextToSpeech = ({ dialogProps }) => {
                     'x-request-from': 'internal'
                 },
                 credentials: 'include',
-                body: JSON.stringify(body),
-                signal: streamingState.abortController.signal
+                body: JSON.stringify(body)
             })

             if (!response.ok) {
                 throw new Error(`HTTP error! status: ${response.status}`)
             }

+            const audioChunks = []
             const reader = response.body.getReader()
             let buffer = ''

             let done = false
             while (!done) {
-                if (streamingState.abortController.signal.aborted) {
-                    reader.cancel()
-                    break
-                }
-
                 const result = await reader.read()
                 done = result.done
                 if (done) break
@@ -328,51 +283,45 @@ const TextToSpeech = ({ dialogProps }) => {
                 for (const eventBlock of lines) {
                     if (eventBlock.trim()) {
                         const event = parseSSEEvent(eventBlock)
-                        if (event) {
-                            switch (event.event) {
-                                case 'tts_data':
-                                    if (event.data?.audioChunk) {
-                                        const audioBuffer = Uint8Array.from(atob(event.data.audioChunk), (c) => c.charCodeAt(0))
-                                        streamingState.chunkQueue.push(audioBuffer)
-
-                                        if (streamingState.sourceBuffer && !streamingState.sourceBuffer.updating) {
-                                            const chunk = streamingState.chunkQueue.shift()
-                                            try {
-                                                streamingState.sourceBuffer.appendBuffer(chunk)
-                                                streamingState.isBuffering = true
-                                            } catch (error) {
-                                                console.error('Error appending initial chunk:', error)
-                                            }
-                                        }
-                                    }
-                                    break
-                                case 'tts_end':
-                                    streamingState.streamEnded = true
-                                    // Check if we can end the stream immediately (no chunks queued and not updating)
-                                    if (
-                                        streamingState.sourceBuffer &&
-                                        streamingState.chunkQueue.length === 0 &&
-                                        !streamingState.sourceBuffer.updating &&
-                                        streamingState.mediaSource.readyState === 'open'
-                                    ) {
-                                        try {
-                                            streamingState.mediaSource.endOfStream()
-                                        } catch (error) {
-                                            console.error('Error ending MediaSource stream:', error)
-                                        }
-                                    }
-                                    break
-                            }
+                        if (event && event.event === 'tts_data' && event.data?.audioChunk) {
+                            const audioBuffer = Uint8Array.from(atob(event.data.audioChunk), (c) => c.charCodeAt(0))
+                            audioChunks.push(audioBuffer)
                         }
                     }
                 }
             }

+            if (audioChunks.length > 0) {
+                // Combine all chunks into a single blob
+                const totalLength = audioChunks.reduce((sum, chunk) => sum + chunk.length, 0)
+                const combinedBuffer = new Uint8Array(totalLength)
+                let offset = 0
+
+                for (const chunk of audioChunks) {
+                    combinedBuffer.set(chunk, offset)
+                    offset += chunk.length
+                }
+
+                const audioBlob = new Blob([combinedBuffer], { type: 'audio/mpeg' })
+                const audioUrl = URL.createObjectURL(audioBlob)
+
+                // Clean up previous audio
+                if (testAudioSrc) {
+                    URL.revokeObjectURL(testAudioSrc)
+                }
+
+                setTestAudioSrc(audioUrl)
+            } else {
+                throw new Error('No audio data received')
+            }
         } catch (error) {
             console.error('Error testing TTS:', error)
             enqueueSnackbar({
                 message: `TTS test failed: ${error.message}`,
                 options: { variant: 'error' }
             })
-        }
+        } finally {
+            setIsGeneratingTest(false)
+        }
     }

@@ -398,6 +347,46 @@ const TextToSpeech = ({ dialogProps }) => {
         return event.event ? event : null
     }

+    // Audio control functions for waveform component
+    const handleTestPlay = async () => {
+        // If audio already exists, just play it
+        if (testAudioRef && testAudioSrc) {
+            testAudioRef.play()
+            setIsTestPlaying(true)
+            return
+        }
+
+        // If no audio exists, generate it first
+        if (!testAudioSrc) {
+            await testTTS()
+            // testTTS will set the audio source, and we'll play it in the next useEffect
+        }
+    }
+
+    const handleTestPause = () => {
+        if (testAudioRef) {
+            testAudioRef.pause()
+            setIsTestPlaying(false)
+        }
+    }
+
+    const handleTestEnded = () => {
+        setIsTestPlaying(false)
+    }
+
+    // Auto-play when audio is generated (if user clicked play)
+    useEffect(() => {
+        if (testAudioSrc && testAudioRef && !isTestPlaying) {
+            // Small delay to ensure audio element is ready
+            setTimeout(() => {
+                testAudioRef.play()
+                setIsTestPlaying(true)
+            }, 100)
+        }
+
+        // eslint-disable-next-line react-hooks/exhaustive-deps
+    }, [testAudioSrc, testAudioRef])
+
     useEffect(() => {
         if (dialogProps.chatflow && dialogProps.chatflow.textToSpeech) {
             try {
@@ -422,6 +411,7 @@ const TextToSpeech = ({ dialogProps }) => {
             setTextToSpeech(null)
             setSelectedProvider('none')
             setVoices([])
+            resetTestAudio()
         }
         // eslint-disable-next-line react-hooks/exhaustive-deps
     }, [dialogProps])
@@ -482,7 +472,15 @@ const TextToSpeech = ({ dialogProps }) => {
                                 sx={{ ml: 1 }}
                                 primary={textToSpeechProviders[selectedProvider].label}
                                 secondary={
-                                    <a target='_blank' rel='noreferrer' href={textToSpeechProviders[selectedProvider].url}>
+                                    <a
+                                        target='_blank'
+                                        rel='noreferrer'
+                                        href={textToSpeechProviders[selectedProvider].url}
+                                        style={{
+                                            color: theme?.customization?.isDarkMode ? '#90caf9' : '#1976d2',
+                                            textDecoration: 'underline'
+                                        }}
+                                    >
                                         {textToSpeechProviders[selectedProvider].url}
                                     </a>
                                 }
@@ -551,25 +549,38 @@ const TextToSpeech = ({ dialogProps }) => {
                                 />
                             )}
                             {inputParam.type === 'voice_select' && (
-                                <Box>
-                                    {loadingVoices ? (
-                                        <Box sx={{ display: 'flex', alignItems: 'center', gap: 1 }}>
-                                            <CircularProgress size={16} />
-                                            <Typography variant='body2'>Loading voices...</Typography>
-                                        </Box>
-                                    ) : (
-                                        <Dropdown
-                                            name={inputParam.name}
-                                            options={voices.map((voice) => ({ label: voice.name, name: voice.id }))}
-                                            onSelect={(newValue) => setValue(newValue, selectedProvider, inputParam.name)}
-                                            value={
-                                                textToSpeech?.[selectedProvider]
-                                                    ? textToSpeech[selectedProvider][inputParam.name]
-                                                    : inputParam.default ?? 'choose a voice'
-                                            }
+                                <Autocomplete
+                                    size='small'
+                                    sx={{ mt: 1 }}
+                                    options={voices}
+                                    loading={loadingVoices}
+                                    getOptionLabel={(option) => option.name || ''}
+                                    value={
+                                        voices.find(
+                                            (voice) =>
+                                                voice.id === (textToSpeech?.[selectedProvider]?.[inputParam.name] || inputParam.default)
+                                        ) || null
+                                    }
+                                    onChange={(event, newValue) => {
+                                        setValue(newValue ? newValue.id : '', selectedProvider, inputParam.name)
+                                    }}
+                                    renderInput={(params) => (
+                                        <TextField
+                                            {...params}
+                                            placeholder={loadingVoices ? 'Loading voices...' : 'Choose a voice'}
+                                            InputProps={{
+                                                ...params.InputProps,
+                                                endAdornment: (
+                                                    <>
+                                                        {loadingVoices ? <CircularProgress color='inherit' size={20} /> : null}
+                                                        {params.InputProps.endAdornment}
+                                                    </>
+                                                )
+                                            }}
                                         />
                                     )}
-                                </Box>
+                                    disabled={loadingVoices || !textToSpeech?.[selectedProvider]?.credentialId}
+                                />
                             )}
                         </Box>
                     ))}
@@ -591,17 +602,42 @@ const TextToSpeech = ({ dialogProps }) => {
                         />
                     </Box>

-                    {/* Test TTS Button */}
+                    {/* Test Voice Section */}
                     <Box sx={{ p: 2 }}>
-                        <StyledButton
-                            variant='outlined'
-                            size='small'
-                            startIcon={<IconVolume />}
-                            onClick={testTTS}
-                            disabled={!textToSpeech?.[selectedProvider]?.credentialId}
-                        >
+                        <Typography variant='h6' sx={{ mb: 2, display: 'flex', alignItems: 'center', gap: 1 }}>
+                            <IconVolume size={20} />
                             Test Voice
-                        </StyledButton>
+                        </Typography>
+
+                        <Typography variant='body2' color='textSecondary' sx={{ mb: 2 }}>
+                            Test text: "Today is a wonderful day to build something with Flowise!"
+                        </Typography>
+
+                        <AudioWaveform
+                            audioSrc={testAudioSrc}
+                            onPlay={handleTestPlay}
+                            onPause={handleTestPause}
+                            onEnded={handleTestEnded}
+                            isPlaying={isTestPlaying}
+                            isGenerating={isGeneratingTest}
+                            disabled={!textToSpeech?.[selectedProvider]?.credentialId}
+                            externalAudioRef={testAudioRef}
+                            resetProgress={resetWaveform}
+                        />
+
+                        {/* Hidden audio element for waveform control */}
+                        {testAudioSrc && (
+                            <audio
+                                ref={(ref) => setTestAudioRef(ref)}
+                                src={testAudioSrc}
+                                onPlay={() => setIsTestPlaying(true)}
+                                onPause={() => setIsTestPlaying(false)}
+                                onEnded={handleTestEnded}
+                                style={{ display: 'none' }}
+                            >
+                                <track kind='captions' />
+                            </audio>
+                        )}
                     </Box>
                 </>
             )}
@@ -39,8 +39,7 @@ import {
     IconCheck,
     IconPaperclip,
     IconSparkles,
-    IconVolume,
-    IconSquare
+    IconVolume
 } from '@tabler/icons-react'
 import robotPNG from '@/assets/images/robot.png'
 import userPNG from '@/assets/images/account.png'
@@ -270,6 +269,10 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
         abortController: null
     })

+    // Ref to prevent auto-scroll during TTS actions (using ref to avoid re-renders)
+    const isTTSActionRef = useRef(false)
+    const ttsTimeoutRef = useRef(null)
+
     const isFileAllowedForUpload = (file) => {
         const constraints = getAllowChatFlowUploads.data
         /**
@@ -555,6 +558,22 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
         }
     }

+    // Helper function to manage TTS action flag
+    const setTTSAction = (isActive) => {
+        isTTSActionRef.current = isActive
+        if (ttsTimeoutRef.current) {
+            clearTimeout(ttsTimeoutRef.current)
+            ttsTimeoutRef.current = null
+        }
+        if (isActive) {
+            // Reset the flag after a longer delay to ensure all state changes are complete
+            ttsTimeoutRef.current = setTimeout(() => {
+                isTTSActionRef.current = false
+                ttsTimeoutRef.current = null
+            }, 300)
+        }
+    }
+
     const onChange = useCallback((e) => setUserInput(e.target.value), [setUserInput])

     const updateLastMessage = (text) => {
@@ -1374,9 +1393,11 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
         }
     }, [isChatFlowAvailableForRAGFileUploads, fullFileUpload])

-    // Auto scroll chat to bottom
+    // Auto scroll chat to bottom (but not during TTS actions)
     useEffect(() => {
-        scrollToBottom()
+        if (!isTTSActionRef.current) {
+            scrollToBottom()
+        }
     }, [messages])

     useEffect(() => {
@@ -1563,6 +1584,8 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
     }

     const handleTTSStop = (messageId) => {
+        setTTSAction(true)
+
         if (ttsAudio[messageId]) {
             ttsAudio[messageId].pause()
             ttsAudio[messageId].currentTime = 0
@@ -1621,6 +1644,7 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
             return
         }

+        setTTSAction(true)
         stopAllTTS()

         handleTTSStart({ chatMessageId: messageId, format: 'mp3' })
@@ -1868,6 +1892,7 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
     }

     const handleTTSStart = (data) => {
+        setTTSAction(true)
         setIsTTSLoading((prevState) => ({
             ...prevState,
             [data.chatMessageId]: true
@@ -1988,6 +2013,11 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
     useEffect(() => {
         return () => {
             cleanupTTSStreaming()
+            // Cleanup TTS timeout on unmount
+            if (ttsTimeoutRef.current) {
+                clearTimeout(ttsTimeoutRef.current)
+                ttsTimeoutRef.current = null
+            }
         }
     }, [])

@@ -2672,9 +2702,12 @@ const ChatMessage = ({ open, chatflowid, isAgentCanvas, isDialog, previews, setP
                             {isTTSLoading[message.id] ? (
                                 <CircularProgress size={16} />
                             ) : isTTSPlaying[message.id] ? (
-                                <IconSquare size={16} />
+                                <IconCircleDot style={{ width: '20px', height: '20px' }} color={'red'} />
                             ) : (
-                                <IconVolume size={16} />
+                                <IconVolume
+                                    style={{ width: '20px', height: '20px' }}
+                                    color={customization.isDarkMode ? 'white' : '#1e88e5'}
+                                />
                             )}
                         </IconButton>
                     )}