From 6bd8aaefc897fed1f39c9c6bf1bbb61d12c27d8c Mon Sep 17 00:00:00 2001
From: Vinod Kiran <vinodkiran@outlook.in>
Date: Fri, 19 Apr 2024 05:58:09 +0530
Subject: [PATCH] Bugfix/speech input on Safari/iOS (#1971)

* debug to identify

* Safari sends audio file as mp4 and not webm

* Safari on iOS needs special handling

* lint fixes

* updated condition

* Remove unused import

---------

Co-authored-by: Ilango <rajagopalilango@gmail.com>
---
 packages/components/src/speechToText.ts              | 2 --
 packages/server/src/utils/buildChatflow.ts           | 3 ++-
 packages/ui/src/views/chatmessage/ChatMessage.jsx    | 8 +++++++-
 packages/ui/src/views/chatmessage/audio-recording.js | 9 ++++++++-
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/packages/components/src/speechToText.ts b/packages/components/src/speechToText.ts
index 8524b5252..20d72e40a 100644
--- a/packages/components/src/speechToText.ts
+++ b/packages/components/src/speechToText.ts
@@ -10,7 +10,6 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi
         const credentialId = speechToTextConfig.credentialId as string
         const credentialData = await getCredentialData(credentialId ?? '', options)
         const filePath = path.join(getStoragePath(), options.chatflowid, options.chatId, upload.name)
-
         const audio_file = fs.createReadStream(filePath)
 
         if (speechToTextConfig.name === 'openAIWhisper') {
@@ -18,7 +17,6 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi
                 apiKey: credentialData.openAIApiKey
             }
             const openAIClient = new OpenAIClient(openAIClientOptions)
-
             const transcription = await openAIClient.audio.transcriptions.create({
                 file: audio_file,
                 model: 'whisper-1',
diff --git a/packages/server/src/utils/buildChatflow.ts b/packages/server/src/utils/buildChatflow.ts
index 0dbc71976..42118b4d3 100644
--- a/packages/server/src/utils/buildChatflow.ts
+++ b/packages/server/src/utils/buildChatflow.ts
@@ -66,6 +66,7 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
             fileUploads = incomingInput.uploads
             for (let i = 0; i < fileUploads.length; i += 1) {
                 const upload = fileUploads[i]
+
                 if ((upload.type === 'file' || upload.type === 'audio') && upload.data) {
                     const filename = upload.name
                     const dir = path.join(getStoragePath(), chatflowid, chatId)
@@ -83,7 +84,7 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
                 }
 
                 // Run Speech to Text conversion
-                if (upload.mime === 'audio/webm') {
+                if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4') {
                     let speechToTextConfig: ICommonObject = {}
                     if (chatflow.speechToText) {
                         const speechToTextProviders = JSON.parse(chatflow.speechToText)
diff --git a/packages/ui/src/views/chatmessage/ChatMessage.jsx b/packages/ui/src/views/chatmessage/ChatMessage.jsx
index d7e6de72c..988114848 100644
--- a/packages/ui/src/views/chatmessage/ChatMessage.jsx
+++ b/packages/ui/src/views/chatmessage/ChatMessage.jsx
@@ -241,7 +241,13 @@ export const ChatMessage = ({ open, chatflowid, isDialog, previews, setPreviews
     }
 
     const addRecordingToPreviews = (blob) => {
-        const mimeType = blob.type.substring(0, blob.type.indexOf(';'))
+        let mimeType = ''
+        const pos = blob.type.indexOf(';')
+        if (pos === -1) {
+            mimeType = blob.type
+        } else {
+            mimeType = blob.type.substring(0, pos)
+        }
         // read blob and add to previews
         const reader = new FileReader()
         reader.readAsDataURL(blob)
diff --git a/packages/ui/src/views/chatmessage/audio-recording.js b/packages/ui/src/views/chatmessage/audio-recording.js
index 1fbaddc10..37c5df08c 100644
--- a/packages/ui/src/views/chatmessage/audio-recording.js
+++ b/packages/ui/src/views/chatmessage/audio-recording.js
@@ -2,6 +2,7 @@
  * @fileoverview This file contains the API to handle audio recording.
  * Originally from 'https://ralzohairi.medium.com/audio-recording-in-javascript-96eed45b75ee'
  */
+import { isSafari } from 'react-device-detect'
 
 // audio-recording.js ---------------
 let microphoneButton, elapsedTimeTag
@@ -277,7 +278,13 @@ export const audioRecorder = {
                         })
 
                         //start the recording by calling the start method on the media recorder
-                        audioRecorder.mediaRecorder.start()
+                        if (isSafari) {
+                            // https://community.openai.com/t/whisper-problem-with-audio-mp4-blobs-from-safari/322252
+                            // https://community.openai.com/t/whisper-api-cannot-read-files-correctly/93420/46
+                            audioRecorder.mediaRecorder.start(1000)
+                        } else {
+                            audioRecorder.mediaRecorder.start()
+                        }
                     })
             )