Feature/Add Groq Whisper support (#3706)

* feat: Add Groq Whisper support to SpeechToText component

  - Introduced a new speech-to-text provider, Groq Whisper, in both the backend and UI components.
  - Updated SpeechToTextType to include GROQ_WHISPER.
  - Implemented Groq client integration for audio transcription with customizable model, language, and temperature options.
  - Added UI elements for Groq Whisper configuration, including input fields for model, language, and temperature settings.

* Set the "none" speech-to-text status to false when another provider is selected

---------

Co-authored-by: Henry <hzj94@hotmail.com>
2024-12-17 18:11:07 -05:00 · 2024-12-17 18:11:07 -05:00 · 4c29b2390c
parent d5498858ec
commit 4c29b2390c
3 changed files with 66 additions and 2 deletions
--- a/packages/components/src/speechToText.ts
+++ b/packages/components/src/speechToText.ts
@ -3,11 +3,13 @@ import { getCredentialData } from './utils'
 import { type ClientOptions, OpenAIClient, toFile } from '@langchain/openai'
 import { AssemblyAI } from 'assemblyai'
 import { getFileFromStorage } from './storageUtils'
+import Groq from 'groq-sdk'

 const SpeechToTextType = {
    OPENAI_WHISPER: 'openAIWhisper',
    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
-    LOCALAI_STT: 'localAISTT'
+    LOCALAI_STT: 'localAISTT',
+    GROQ_WHISPER: 'groqWhisper'
 }

 export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
@ -70,6 +72,23 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi
                }
                break
            }
+            case SpeechToTextType.GROQ_WHISPER: {
+                const groqClient = new Groq({
+                    apiKey: credentialData.groqApiKey
+                })
+                const file = await toFile(audio_file, upload.name)
+                const groqTranscription = await groqClient.audio.transcriptions.create({
+                    file,
+                    model: speechToTextConfig?.model || 'whisper-large-v3',
+                    language: speechToTextConfig?.language,
+                    temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
+                    response_format: 'verbose_json'
+                })
+                if (groqTranscription?.text) {
+                    return groqTranscription.text
+                }
+                break
+            }
        }
    } else {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
--- a/packages/ui/src/assets/images/groq.png
+++ b/packages/ui/src/assets/images/groq.png
--- a/packages/ui/src/ui-component/extended/SpeechToText.jsx
+++ b/packages/ui/src/ui-component/extended/SpeechToText.jsx
@ -17,6 +17,7 @@ import { Dropdown } from '@/ui-component/dropdown/Dropdown'
 import openAISVG from '@/assets/images/openai.svg'
 import assemblyAIPng from '@/assets/images/assemblyai.png'
 import localAiPng from '@/assets/images/localai.png'
+import groqPng from '@/assets/images/groq.png'

 // store
 import useNotifier from '@/utils/useNotifier'
@ -29,7 +30,8 @@ import chatflowsApi from '@/api/chatflows'
 const SpeechToTextType = {
    OPENAI_WHISPER: 'openAIWhisper',
    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
-    LOCALAI_STT: 'localAISTT'
+    LOCALAI_STT: 'localAISTT',
+    GROQ_WHISPER: 'groqWhisper'
 }

 // Weird quirk - the key must match the name property value.
@ -139,6 +141,46 @@ const speechToTextProviders = {
                optional: true
            }
        ]
+    },
+    [SpeechToTextType.GROQ_WHISPER]: {
+        label: 'Groq Whisper',
+        name: SpeechToTextType.GROQ_WHISPER,
+        icon: groqPng,
+        url: 'https://console.groq.com/',
+        inputs: [
+            {
+                label: 'Model',
+                name: 'model',
+                type: 'string',
+                description: `The STT model to load. Defaults to whisper-large-v3 if left blank.`,
+                placeholder: 'whisper-large-v3',
+                optional: true
+            },
+            {
+                label: 'Connect Credential',
+                name: 'credential',
+                type: 'credential',
+                credentialNames: ['groqApi']
+            },
+            {
+                label: 'Language',
+                name: 'language',
+                type: 'string',
+                description:
+                    'The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.',
+                placeholder: 'en',
+                optional: true
+            },
+            {
+                label: 'Temperature',
+                name: 'temperature',
+                type: 'number',
+                step: 0.1,
+                description:
+                    'The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.',
+                optional: true
+            }
+        ]
    }
 }

@ -210,6 +252,9 @@ const SpeechToText = ({ dialogProps }) => {
                    newVal[provider.name] = { ...speechToText[provider.name], status: false }
                }
            })
+            if (providerName !== 'none') {
+                newVal['none'].status = false
+            }
        }
        setSpeechToText(newVal)
        return newVal