497 lines
20 KiB
JavaScript
497 lines
20 KiB
JavaScript
import { useDispatch } from 'react-redux'
|
|
import { useState, useEffect } from 'react'
|
|
import PropTypes from 'prop-types'
|
|
import { enqueueSnackbar as enqueueSnackbarAction, closeSnackbar as closeSnackbarAction, SET_CHATFLOW } from '@/store/actions'
|
|
|
|
// material-ui
|
|
import { Typography, Box, Button, FormControl, ListItem, ListItemAvatar, ListItemText, MenuItem, Select } from '@mui/material'
|
|
import { IconX } from '@tabler/icons-react'
|
|
import { useTheme } from '@mui/material/styles'
|
|
|
|
// Project import
|
|
import CredentialInputHandler from '@/views/canvas/CredentialInputHandler'
|
|
import { TooltipWithParser } from '@/ui-component/tooltip/TooltipWithParser'
|
|
import { SwitchInput } from '@/ui-component/switch/Switch'
|
|
import { Input } from '@/ui-component/input/Input'
|
|
import { StyledButton } from '@/ui-component/button/StyledButton'
|
|
import { Dropdown } from '@/ui-component/dropdown/Dropdown'
|
|
import openAISVG from '@/assets/images/openai.svg'
|
|
import assemblyAIPng from '@/assets/images/assemblyai.png'
|
|
import localAiPng from '@/assets/images/localai.png'
|
|
import azureSvg from '@/assets/images/azure_openai.svg'
|
|
import groqPng from '@/assets/images/groq.png'
|
|
|
|
// store
|
|
import useNotifier from '@/utils/useNotifier'
|
|
|
|
// API
|
|
import chatflowsApi from '@/api/chatflows'
|
|
|
|
// When adding a new provider, also update components/src/speechToText.ts
// so that both definitions stay in sync.
|
|
/**
 * Identifiers of the supported speech-to-text providers.
 *
 * Each value is used both as the key of a provider entry in
 * `speechToTextProviders` and as that entry's `name`. The object is frozen so
 * the shared identifiers cannot be changed by accident at runtime.
 */
const SpeechToTextType = Object.freeze({
    OPENAI_WHISPER: 'openAIWhisper',
    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
    LOCALAI_STT: 'localAISTT',
    AZURE_COGNITIVE: 'azureCognitive',
    GROQ_WHISPER: 'groqWhisper'
})
|
|
|
|
// Invariant: each key of this map must be identical to the `name` property of its provider entry.
|
|
// Builders for the input definitions that several providers have in common.
// Each call returns a fresh object, so no definition is shared between providers.

/** "Connect Credential" input restricted to the given credential name. */
const buildCredentialInput = (credentialName) => ({
    label: 'Connect Credential',
    name: 'credential',
    type: 'credential',
    credentialNames: [credentialName]
})

/** Optional ISO-639-1 "Language" input. */
const buildLanguageInput = () => ({
    label: 'Language',
    name: 'language',
    type: 'string',
    description:
        'The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.',
    placeholder: 'en',
    optional: true
})

/** Optional "Model" input whose description and placeholder mention `defaultModel`. */
const buildModelInput = (defaultModel) => ({
    label: 'Model',
    name: 'model',
    type: 'string',
    description: `The STT model to load. Defaults to ${defaultModel} if left blank.`,
    placeholder: defaultModel,
    optional: true
})

/** Optional multi-row "Prompt" input. */
const buildPromptInput = () => ({
    label: 'Prompt',
    name: 'prompt',
    type: 'string',
    rows: 4,
    description: `An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.`,
    optional: true
})

/** Optional numeric "Temperature" input with a 0.1 step. */
const buildTemperatureInput = () => ({
    label: 'Temperature',
    name: 'temperature',
    type: 'number',
    step: 0.1,
    description: `The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.`,
    optional: true
})

/**
 * Provider definitions keyed by provider name.
 *
 * The map is derived from the list below, so every key is taken from the
 * entry's own `name` and the two can never drift apart.
 */
const speechToTextProviders = Object.fromEntries(
    [
        {
            label: 'OpenAI Whisper',
            name: SpeechToTextType.OPENAI_WHISPER,
            icon: openAISVG,
            url: 'https://platform.openai.com/docs/guides/speech-to-text',
            inputs: [buildCredentialInput('openAIApi'), buildLanguageInput(), buildPromptInput(), buildTemperatureInput()]
        },
        {
            label: 'Assembly AI',
            name: SpeechToTextType.ASSEMBLYAI_TRANSCRIBE,
            icon: assemblyAIPng,
            url: 'https://www.assemblyai.com/',
            inputs: [buildCredentialInput('assemblyAIApi')]
        },
        {
            label: 'LocalAi STT',
            name: SpeechToTextType.LOCALAI_STT,
            icon: localAiPng,
            url: 'https://localai.io/features/audio-to-text/',
            inputs: [
                buildCredentialInput('localAIApi'),
                {
                    label: 'Base URL',
                    name: 'baseUrl',
                    type: 'string',
                    description: 'The base URL of the local AI server'
                },
                buildLanguageInput(),
                buildModelInput('whisper-1'),
                buildPromptInput(),
                buildTemperatureInput()
            ]
        },
        {
            label: 'Azure Cognitive Services',
            name: SpeechToTextType.AZURE_COGNITIVE,
            icon: azureSvg,
            url: 'https://azure.microsoft.com/en-us/products/cognitive-services/speech-services',
            inputs: [
                buildCredentialInput('azureCognitiveServices'),
                {
                    label: 'Language',
                    name: 'language',
                    type: 'string',
                    description: 'The recognition language (e.g., "en-US", "es-ES")',
                    placeholder: 'en-US',
                    optional: true
                },
                {
                    label: 'Profanity Filter Mode',
                    name: 'profanityFilterMode',
                    type: 'options',
                    description: 'How to handle profanity in the transcription',
                    // Label and name are the same string for every mode.
                    options: ['None', 'Masked', 'Removed'].map((mode) => ({ label: mode, name: mode })),
                    default: 'Masked',
                    optional: true
                },
                {
                    label: 'Audio Channels',
                    name: 'channels',
                    type: 'string',
                    description: 'Comma-separated list of audio channels to process (e.g., "0,1")',
                    placeholder: '0,1',
                    default: '0,1'
                }
            ]
        },
        {
            label: 'Groq Whisper',
            name: SpeechToTextType.GROQ_WHISPER,
            icon: groqPng,
            url: 'https://console.groq.com/',
            inputs: [
                buildModelInput('whisper-large-v3'),
                buildCredentialInput('groqApi'),
                buildLanguageInput(),
                buildTemperatureInput()
            ]
        }
    ].map((provider) => [provider.name, provider])
)
|
|
|
|
const SpeechToText = ({ dialogProps }) => {
|
|
const dispatch = useDispatch()
|
|
|
|
useNotifier()
|
|
const theme = useTheme()
|
|
|
|
const enqueueSnackbar = (...args) => dispatch(enqueueSnackbarAction(...args))
|
|
const closeSnackbar = (...args) => dispatch(closeSnackbarAction(...args))
|
|
|
|
const [speechToText, setSpeechToText] = useState({})
|
|
const [selectedProvider, setSelectedProvider] = useState('none')
|
|
|
|
const onSave = async () => {
|
|
const speechToText = setValue(true, selectedProvider, 'status')
|
|
try {
|
|
const saveResp = await chatflowsApi.updateChatflow(dialogProps.chatflow.id, {
|
|
speechToText: JSON.stringify(speechToText)
|
|
})
|
|
if (saveResp.data) {
|
|
enqueueSnackbar({
|
|
message: 'Speech To Text Configuration Saved',
|
|
options: {
|
|
key: new Date().getTime() + Math.random(),
|
|
variant: 'success',
|
|
action: (key) => (
|
|
<Button style={{ color: 'white' }} onClick={() => closeSnackbar(key)}>
|
|
<IconX />
|
|
</Button>
|
|
)
|
|
}
|
|
})
|
|
dispatch({ type: SET_CHATFLOW, chatflow: saveResp.data })
|
|
}
|
|
} catch (error) {
|
|
enqueueSnackbar({
|
|
message: `Failed to save Speech To Text Configuration: ${
|
|
typeof error.response.data === 'object' ? error.response.data.message : error.response.data
|
|
}`,
|
|
options: {
|
|
key: new Date().getTime() + Math.random(),
|
|
variant: 'error',
|
|
persist: true,
|
|
action: (key) => (
|
|
<Button style={{ color: 'white' }} onClick={() => closeSnackbar(key)}>
|
|
<IconX />
|
|
</Button>
|
|
)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
const setValue = (value, providerName, inputParamName) => {
|
|
let newVal = {}
|
|
if (!Object.prototype.hasOwnProperty.call(speechToText, providerName)) {
|
|
newVal = { ...speechToText, [providerName]: {} }
|
|
} else {
|
|
newVal = { ...speechToText }
|
|
}
|
|
|
|
newVal[providerName][inputParamName] = value
|
|
if (inputParamName === 'status' && value === true) {
|
|
// ensure that the others are turned off
|
|
Object.keys(speechToTextProviders).forEach((key) => {
|
|
const provider = speechToTextProviders[key]
|
|
if (provider.name !== providerName) {
|
|
newVal[provider.name] = { ...speechToText[provider.name], status: false }
|
|
}
|
|
})
|
|
if (providerName !== 'none' && newVal['none']) {
|
|
newVal['none'].status = false
|
|
}
|
|
}
|
|
setSpeechToText(newVal)
|
|
return newVal
|
|
}
|
|
|
|
const handleProviderChange = (event) => {
|
|
setSelectedProvider(event.target.value)
|
|
}
|
|
|
|
useEffect(() => {
|
|
if (dialogProps.chatflow && dialogProps.chatflow.speechToText) {
|
|
try {
|
|
const speechToText = JSON.parse(dialogProps.chatflow.speechToText)
|
|
let selectedProvider = 'none'
|
|
Object.keys(speechToTextProviders).forEach((key) => {
|
|
const providerConfig = speechToText[key]
|
|
if (providerConfig && providerConfig.status) {
|
|
selectedProvider = key
|
|
}
|
|
})
|
|
setSelectedProvider(selectedProvider)
|
|
setSpeechToText(speechToText)
|
|
} catch (e) {
|
|
setSpeechToText({})
|
|
setSelectedProvider('none')
|
|
console.error(e)
|
|
}
|
|
}
|
|
|
|
return () => {
|
|
setSpeechToText({})
|
|
setSelectedProvider('none')
|
|
}
|
|
}, [dialogProps])
|
|
|
|
return (
|
|
<>
|
|
<Box fullWidth sx={{ mb: 1, display: 'flex', flexDirection: 'column', gap: 1 }}>
|
|
<Typography>Providers</Typography>
|
|
<FormControl fullWidth>
|
|
<Select
|
|
size='small'
|
|
value={selectedProvider}
|
|
onChange={handleProviderChange}
|
|
sx={{
|
|
'& .MuiSvgIcon-root': {
|
|
color: theme?.customization?.isDarkMode ? '#fff' : 'inherit'
|
|
}
|
|
}}
|
|
>
|
|
<MenuItem value='none'>None</MenuItem>
|
|
{Object.values(speechToTextProviders).map((provider) => (
|
|
<MenuItem key={provider.name} value={provider.name}>
|
|
{provider.label}
|
|
</MenuItem>
|
|
))}
|
|
</Select>
|
|
</FormControl>
|
|
</Box>
|
|
{selectedProvider !== 'none' && (
|
|
<>
|
|
<ListItem sx={{ mt: 3 }} alignItems='center'>
|
|
<ListItemAvatar>
|
|
<div
|
|
style={{
|
|
width: 50,
|
|
height: 50,
|
|
borderRadius: '50%',
|
|
backgroundColor: 'white',
|
|
flexShrink: 0,
|
|
display: 'flex',
|
|
alignItems: 'center',
|
|
justifyContent: 'center'
|
|
}}
|
|
>
|
|
<img
|
|
style={{
|
|
width: '100%',
|
|
height: '100%',
|
|
padding: 10,
|
|
objectFit: 'contain'
|
|
}}
|
|
alt='AI'
|
|
src={speechToTextProviders[selectedProvider].icon}
|
|
/>
|
|
</div>
|
|
</ListItemAvatar>
|
|
<ListItemText
|
|
sx={{ ml: 1 }}
|
|
primary={speechToTextProviders[selectedProvider].label}
|
|
secondary={
|
|
<a
|
|
target='_blank'
|
|
rel='noreferrer'
|
|
href={speechToTextProviders[selectedProvider].url}
|
|
style={{
|
|
color: theme?.customization?.isDarkMode ? '#90caf9' : '#1976d2',
|
|
textDecoration: 'underline'
|
|
}}
|
|
>
|
|
{speechToTextProviders[selectedProvider].url}
|
|
</a>
|
|
}
|
|
/>
|
|
</ListItem>
|
|
{speechToTextProviders[selectedProvider].inputs.map((inputParam, index) => (
|
|
<Box key={index} sx={{ p: 2 }}>
|
|
<div style={{ display: 'flex', flexDirection: 'row' }}>
|
|
<Typography>
|
|
{inputParam.label}
|
|
{!inputParam.optional && <span style={{ color: 'red' }}> *</span>}
|
|
{inputParam.description && (
|
|
<TooltipWithParser style={{ marginLeft: 10 }} title={inputParam.description} />
|
|
)}
|
|
</Typography>
|
|
</div>
|
|
{inputParam.type === 'credential' && (
|
|
<CredentialInputHandler
|
|
key={speechToText[selectedProvider]?.credentialId}
|
|
data={
|
|
speechToText[selectedProvider]?.credentialId
|
|
? { credential: speechToText[selectedProvider].credentialId }
|
|
: {}
|
|
}
|
|
inputParam={inputParam}
|
|
onSelect={(newValue) => setValue(newValue, selectedProvider, 'credentialId')}
|
|
/>
|
|
)}
|
|
{inputParam.type === 'boolean' && (
|
|
<SwitchInput
|
|
onChange={(newValue) => setValue(newValue, selectedProvider, inputParam.name)}
|
|
value={
|
|
speechToText[selectedProvider]
|
|
? speechToText[selectedProvider][inputParam.name]
|
|
: inputParam.default ?? false
|
|
}
|
|
/>
|
|
)}
|
|
{(inputParam.type === 'string' || inputParam.type === 'password' || inputParam.type === 'number') && (
|
|
<Input
|
|
inputParam={inputParam}
|
|
onChange={(newValue) => setValue(newValue, selectedProvider, inputParam.name)}
|
|
value={
|
|
speechToText[selectedProvider]
|
|
? speechToText[selectedProvider][inputParam.name]
|
|
: inputParam.default ?? ''
|
|
}
|
|
/>
|
|
)}
|
|
|
|
{inputParam.type === 'options' && (
|
|
<Dropdown
|
|
name={inputParam.name}
|
|
options={inputParam.options}
|
|
onSelect={(newValue) => setValue(newValue, selectedProvider, inputParam.name)}
|
|
value={
|
|
speechToText[selectedProvider]
|
|
? speechToText[selectedProvider][inputParam.name]
|
|
: inputParam.default ?? 'choose an option'
|
|
}
|
|
/>
|
|
)}
|
|
</Box>
|
|
))}
|
|
</>
|
|
)}
|
|
<StyledButton
|
|
style={{ marginBottom: 10, marginTop: 10 }}
|
|
disabled={selectedProvider !== 'none' && !speechToText[selectedProvider]?.credentialId}
|
|
variant='contained'
|
|
onClick={onSave}
|
|
>
|
|
Save
|
|
</StyledButton>
|
|
</>
|
|
)
|
|
}
|
|
|
|
// Describes the fields of `dialogProps` that the component reads:
// `chatflow.id` is passed to the update API and `chatflow.speechToText` is parsed as JSON.
// NOTE(review): the id type is not visible in this file; both string and number are accepted.
SpeechToText.propTypes = {
    dialogProps: PropTypes.shape({
        chatflow: PropTypes.shape({
            id: PropTypes.oneOfType([PropTypes.string, PropTypes.number]),
            speechToText: PropTypes.string
        })
    })
}

export default SpeechToText
|