Spaces:

Athspi
/

Whshhs

Runtime error

App Files Files Community

Athspi committed on Mar 29

Commit

a493c8c

verified ·

1 Parent(s): af3c122

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -52

app.py CHANGED Viewed

@@ -1,23 +1,20 @@
 import gradio as gr
 import asyncio
-import base64
-import io
 import numpy as np
 from google import genai
 from google.genai import types
 import soundfile as sf
 # Configuration
 SAMPLE_RATE = 24000
-MODEL = "models/gemini-2.0-flash-exp"
 class GeminiTTS:
     def __init__(self, api_key):
         if not api_key:
             raise ValueError("API key cannot be empty")
         self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
-        self.session = None
         self.config = types.LiveConnectConfig(
             response_modalities=["audio"],
             speech_config=types.SpeechConfig(
@@ -26,94 +23,99 @@ class GeminiTTS:
                 )
             ),
             system_instruction=types.Content(
-                parts=[types.Part.from_text(text="Answer user ask, replay same thing user say no other word explain")],
                 role="user"
             ),
         )
-    async def process_text(self, text):
         try:
             async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
-                await session.send(input=text or ".", end_of_turn=True)
-                # Get response
                 turn = session.receive()
                 async for response in turn:
-                    if data := response.data:
-                        # Convert to properly formatted numpy array
-                        audio_data = np.frombuffer(data, dtype=np.float32)
                         # Normalize audio to prevent processing warnings
-                        if audio_data.size > 0:
-                            max_val = np.max(np.abs(audio_data))
-                            if max_val > 0:
-                                audio_data = audio_data / max_val
-                        return (SAMPLE_RATE, audio_data)
-                    if text := response.text:
-                        return text
-                return "No response received"
         except Exception as e:
             return f"Error: {str(e)}"
-def create_gradio_interface():
-    tts_handler = None
-    def init_tts(api_key):
-        nonlocal tts_handler
         try:
-            tts_handler = GeminiTTS(api_key)
-            return "Gemini TTS Initialized Successfully!"
         except Exception as e:
-            return f"Initialization Failed: {str(e)}"
-    async def generate_response(text):
-        if not tts_handler:
-            raise gr.Error("Please initialize the TTS system first with your API key")
-        result = await tts_handler.process_text(text)
-        if isinstance(result, tuple) and len(result) == 2:
-            # Audio response (sample_rate, audio_data)
-            return result
-        else:
-            # Text response
-            return result
-    with gr.Blocks(title="Gemini TTS Interface") as demo:
-        gr.Markdown("# 🎤 Gemini Text-to-Speech Interface")
         with gr.Row():
             api_key = gr.Textbox(
-                label="Gemini API Key",
                 type="password",
-                placeholder="Enter your Gemini API key here"
             )
-            init_btn = gr.Button("Initialize TTS")
-        init_status = gr.Textbox(label="Initialization Status", interactive=False)
-        init_btn.click(init_tts, inputs=api_key, outputs=init_status)
         with gr.Group():
             text_input = gr.Textbox(
-                label="Enter Text",
                 lines=3,
-                placeholder="Type something to convert to speech..."
             )
             generate_btn = gr.Button("Generate Speech")
-        audio_output = gr.Audio(label="Generated Speech")
-        text_output = gr.Textbox(label="Text Response", visible=False)
         generate_btn.click(
-            generate_response,
             inputs=text_input,
             outputs=[audio_output, text_output]
         )
-    return demo
 if __name__ == "__main__":
-    demo = create_gradio_interface()
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 import asyncio
 import numpy as np
 from google import genai
 from google.genai import types
 import soundfile as sf
+import io
 # Configuration
 # Sample rate in Hz used when the generated audio is encoded as WAV.
 SAMPLE_RATE = 24000
 # Model name handed to the live-connect call.
 # NOTE(review): confirm this model is enabled for the Live API; the value
 # before this commit was "models/gemini-2.0-flash-exp".
+MODEL = "models/gemini-2.0-flash"
 class GeminiTTS:
     def __init__(self, api_key):
         if not api_key:
             raise ValueError("API key cannot be empty")
         self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
         self.config = types.LiveConnectConfig(
             response_modalities=["audio"],
             speech_config=types.SpeechConfig(
                 )
             ),
             system_instruction=types.Content(
+                parts=[types.Part.from_text(text="Speak exactly what the user says")],
                 role="user"
             ),
         )
+    async def text_to_speech(self, text):
         try:
             async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
+                await session.send(input=text or " ", end_of_turn=True)
                 turn = session.receive()
                 async for response in turn:
+                    if audio_data := response.data:
+                        # Convert to numpy array and normalize
+                        audio_array = np.frombuffer(audio_data, dtype=np.float32)
+                        # Handle empty/quiet audio
+                        if audio_array.size == 0:
+                            audio_array = np.zeros(int(SAMPLE_RATE * 0.5))  # 0.5s of silence
                         # Normalize audio to prevent processing warnings
+                        max_val = np.max(np.abs(audio_array))
+                        if max_val > 0:
+                            audio_array = audio_array / max_val
+                        # Convert to WAV bytes for Gradio
+                        with io.BytesIO() as wav_buffer:
+                            sf.write(wav_buffer, audio_array, SAMPLE_RATE, format='WAV')
+                            return wav_buffer.getvalue()
+                    if text_response := response.text:
+                        return text_response
+                return None
         except Exception as e:
             return f"Error: {str(e)}"
def create_interface():
    """Build the Gradio Blocks UI for the Gemini text-to-speech demo.

    The page has an API-key row that creates the TTS engine, a text box with
    a "Generate Speech" button, an audio player and a message box.

    Returns:
        The assembled ``gr.Blocks`` app; launching it is left to the caller.
    """
    # NOTE: a single engine is shared by every visitor of the app; whoever
    # pressed "Initialize" last decides which API key all requests use.
    tts_engine = None

    def init_engine(api_key):
        """Create the shared engine from *api_key* and return a status line."""
        nonlocal tts_engine
        try:
            tts_engine = GeminiTTS(api_key)
            return "✅ TTS Initialized Successfully"
        except Exception as e:
            return f"❌ Initialization Failed: {str(e)}"

    async def generate_speech(text):
        """Return ``(audio, message)`` for the audio and message outputs."""
        if not tts_engine:
            raise gr.Error("Please initialize the TTS first")
        result = await tts_engine.text_to_speech(text)
        if isinstance(result, str):
            # Text reply or "Error: ..." message: nothing to play.
            return None, result
        if result:
            # The engine returns an encoded WAV file, but the tuple form of
            # gr.Audio needs (sample_rate, ndarray), so decode it first.
            samples, rate = sf.read(io.BytesIO(result), dtype="int16")
            return (rate, samples), ""
        return None, "No response received"

    with gr.Blocks(title="Gemini TTS") as app:
        gr.Markdown("# 🎤 Gemini Text-to-Speech")

        with gr.Row():
            api_key = gr.Textbox(
                label="API Key",
                type="password",
                placeholder="Enter your Gemini API key",
            )
            init_btn = gr.Button("Initialize")
        init_status = gr.Textbox(label="Status", interactive=False)
        init_btn.click(init_engine, inputs=api_key, outputs=init_status)

        with gr.Group():
            text_input = gr.Textbox(
                label="Input Text",
                lines=3,
                placeholder="Type something to speak...",
            )
            generate_btn = gr.Button("Generate Speech")
        audio_output = gr.Audio(label="Output Audio")
        text_output = gr.Textbox(label="Messages", interactive=False)
        generate_btn.click(
            generate_speech,
            inputs=text_input,
            outputs=[audio_output, text_output],
        )
    return app
 if __name__ == "__main__":
+    app = create_interface()
+    app.launch(server_name="0.0.0.0", server_port=7860)