Update app.py
app.py
CHANGED
@@ -32,7 +32,7 @@ def process_text_input(api_key, text_prompt, selected_voice):
         with open(temp_path, "wb") as f:
             f.write(wav_bytes)
 
-        # Get the text response
+        # Get the text response directly from the API
        text_response = completion.choices[0].message.content
 
        return text_response, temp_path
@@ -96,6 +96,24 @@ def process_audio_input(api_key, audio_path, text_prompt, selected_voice):
     except Exception as e:
         return f"Error: {str(e)}", None
 
+def transcribe_audio(api_key, audio_path):
+    """Transcribe an audio file using OpenAI's API"""
+    try:
+        if not audio_path:
+            return "No audio file provided for transcription."
+
+        client = OpenAI(api_key=api_key)
+
+        with open(audio_path, "rb") as audio_file:
+            transcription = client.audio.transcriptions.create(
+                model="gpt-4o-transcribe",
+                file=audio_file
+            )
+
+        return transcription.text
+    except Exception as e:
+        return f"Transcription error: {str(e)}"
+
 def download_example_audio():
     """Download an example audio file for testing"""
     try:
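As a quick sanity check outside the app, the new helper can be exercised directly. A minimal sketch, assuming an OPENAI_API_KEY environment variable and a local sample.wav (both hypothetical, not part of this commit):

import os
from app import transcribe_audio  # helper added above; app.py only launches under __main__

api_key = os.environ["OPENAI_API_KEY"]  # assumed to be set
print(transcribe_audio(api_key, "sample.wav"))  # transcription text, or an error string

Note that the helper returns an error string instead of raising, so the UI callbacks that consume it never crash on a failed transcription.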
@@ -147,11 +165,24 @@ with gr.Blocks(title="OpenAI Audio Chat App") as app:
             with gr.Column():
                 text_output = gr.Textbox(label="AI Response (Text)", lines=5)
                 audio_output = gr.Audio(label="AI Response (Audio)")
+                transcribed_output = gr.Textbox(label="Transcription of Audio Response", lines=3)
+
+        # Function to process text input and then transcribe the resulting audio
+        def text_input_with_transcription(api_key, text_prompt, voice):
+            text_response, audio_path = process_text_input(api_key, text_prompt, voice)
+
+            # Get transcription of the generated audio
+            if audio_path:
+                transcription = transcribe_audio(api_key, audio_path)
+            else:
+                transcription = "No audio generated to transcribe."
+
+            return text_response, audio_path, transcription
 
         text_submit.click(
-            fn=process_text_input,
+            fn=text_input_with_transcription,
             inputs=[api_key, text_input, text_voice],
-            outputs=[text_output, audio_output]
+            outputs=[text_output, audio_output, transcribed_output]
         )
 
     with gr.Tab("Audio Input to Audio Response"):
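The wrapper pattern here leaves process_text_input untouched while the tab gains a third output; Gradio requires the callback's return values to line up one-for-one with the outputs list. A stripped-down sketch of the same wire-up (toy names, not from app.py):

import gradio as gr

def base(x):
    return x.upper(), len(x)  # original two outputs

def wrapped(x):
    upper, n = base(x)
    return upper, n, f"{n} chars"  # extra value for the new textbox

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    out1 = gr.Textbox(label="Upper")
    out2 = gr.Number(label="Length")
    out3 = gr.Textbox(label="Extra")
    gr.Button("Run").click(fn=wrapped, inputs=[inp], outputs=[out1, out2, out3])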
@@ -179,11 +210,30 @@ with gr.Blocks(title="OpenAI Audio Chat App") as app:
             with gr.Column():
                 audio_text_output = gr.Textbox(label="AI Response (Text)", lines=5)
                 audio_audio_output = gr.Audio(label="AI Response (Audio)")
+                audio_transcribed_output = gr.Textbox(label="Transcription of Audio Response", lines=3)
+                input_transcription = gr.Textbox(label="Transcription of Input Audio", lines=3)
+
+        # Function to process audio input, generate response, and provide transcriptions
+        def audio_input_with_transcription(api_key, audio_path, text_prompt, voice):
+            # First transcribe the input audio
+            input_transcription = "N/A"
+            if audio_path:
+                input_transcription = transcribe_audio(api_key, audio_path)
+
+            # Process the audio input and get response
+            text_response, response_audio_path = process_audio_input(api_key, audio_path, text_prompt, voice)
+
+            # Transcribe the response audio
+            response_transcription = "No audio generated to transcribe."
+            if response_audio_path:
+                response_transcription = transcribe_audio(api_key, response_audio_path)
+
+            return text_response, response_audio_path, response_transcription, input_transcription
 
         audio_submit.click(
-            fn=process_audio_input,
+            fn=audio_input_with_transcription,
             inputs=[api_key, audio_input, accompanying_text, audio_voice],
-            outputs=[audio_text_output, audio_audio_output]
+            outputs=[audio_text_output, audio_audio_output, audio_transcribed_output, input_transcription]
         )
 
         example_btn.click(
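The audio-tab wrapper has two independent failure points (input and response transcription), so its branching is worth an offline check. A sketch that mirrors the wrapper's logic with the network-facing helpers stubbed out (illustrative names, no API calls):

def fake_transcribe(api_key, path):
    return f"transcript of {path}"

def fake_process(api_key, path, prompt, voice):
    return "hello", "/tmp/reply.wav"

def wrapper(api_key, audio_path, prompt, voice,
            transcribe=fake_transcribe, process=fake_process):
    # Same control flow as audio_input_with_transcription above
    input_tx = transcribe(api_key, audio_path) if audio_path else "N/A"
    text, reply_path = process(api_key, audio_path, prompt, voice)
    reply_tx = (transcribe(api_key, reply_path) if reply_path
                else "No audio generated to transcribe.")
    return text, reply_path, reply_tx, input_tx

assert wrapper("key", None, "hi", "alloy")[3] == "N/A"
assert wrapper("key", "in.wav", "hi", "alloy")[3] == "transcript of in.wav"

One readability nit: the local variable input_transcription inside the wrapper shadows the gr.Textbox of the same name defined just above it. It is harmless, since the component reference in outputs still resolves to the Textbox, but a distinct name would be clearer.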
@@ -198,7 +248,7 @@ with gr.Blocks(title="OpenAI Audio Chat App") as app:
     def generate_voice_sample(api_key, voice_type):
         try:
             if not api_key:
-                return "Please enter your OpenAI API key first.", None
+                return "Please enter your OpenAI API key first.", None, "No transcription available."
 
             client = OpenAI(api_key=api_key)
             completion = client.chat.completions.create(
@@ -219,9 +269,12 @@ with gr.Blocks(title="OpenAI Audio Chat App") as app:
             with open(temp_path, "wb") as f:
                 f.write(wav_bytes)
 
-            return f"Sample generated with voice: {voice_type}", temp_path
+            # Get transcription
+            transcription = transcribe_audio(api_key, temp_path)
+
+            return f"Sample generated with voice: {voice_type}", temp_path, transcription
         except Exception as e:
-            return f"Error: {str(e)}", None
+            return f"Error: {str(e)}", None, "No transcription available."
 
         with gr.Row():
             sample_voice = gr.Dropdown(
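This hunk picks up after wav_bytes has been written, a step the diff elides. For context, gpt-4o-audio-preview returns its audio base64-encoded on the message; a hedged sketch of the upstream step this hunk depends on (continuing from the client created above; field names from the chat-completions audio response, not shown in this diff):

import base64

completion = client.chat.completions.create(
    model="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    audio={"voice": voice_type, "format": "wav"},
    messages=[{"role": "user", "content": "Say a short sample sentence."}],
)
wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)

If the response also carries message.audio.transcript, as audio-preview responses generally do, the separate gpt-4o-transcribe pass added here acts as an independent verification rather than the only route to text.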
@@ -234,19 +287,21 @@ with gr.Blocks(title="OpenAI Audio Chat App") as app:
         with gr.Row():
             sample_text = gr.Textbox(label="Status")
             sample_audio = gr.Audio(label="Voice Sample")
+            sample_transcription = gr.Textbox(label="Transcription", lines=3)
 
         sample_btn.click(
             fn=generate_voice_sample,
             inputs=[api_key, sample_voice],
-            outputs=[sample_text, sample_audio]
+            outputs=[sample_text, sample_audio, sample_transcription]
         )
 
     gr.Markdown("""
     ## Notes:
     - You must provide your OpenAI API key in the field above
-    - The model used is `gpt-4o-audio-preview`
+    - The model used is `gpt-4o-audio-preview` for conversation and `gpt-4o-transcribe` for transcriptions
     - Audio inputs should be in WAV format
     - Available voices: alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, and verse
+    - Each audio response is automatically transcribed for verification
     """)
 
 if __name__ == "__main__":
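Given the new note that every audio response is transcribed for verification, a hypothetical spot check could compare the text answer against the transcription of its own audio; low similarity would hint at a decoding or voice problem. A minimal stdlib sketch:

import difflib

def similarity(text_response: str, transcription: str) -> float:
    # Ratio in [0, 1]; punctuation and casing differences keep it below 1.0
    return difflib.SequenceMatcher(
        None, text_response.lower(), transcription.lower()
    ).ratio()

print(similarity("Hello there!", "Hello, there"))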