Spaces:

Athspi
/

Whshhs

Runtime error

App Files Community

Athspi committed on Mar 29

Commit

f3ad9e5

verified ·

1 Parent(s): e8ee7fe

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -32

app.py CHANGED Viewed

@@ -8,43 +8,56 @@ import io
 # Configuration
 SAMPLE_RATE = 24000
-MODEL = "models/gemini-2.0-flash-exp"  # Updated to a stable model version
 class GeminiTTS:
     def __init__(self, api_key):
         if not api_key:
             raise ValueError("API key cannot be empty")
-        self.client = genai.Client(api_key=api_key)  # Removed experimental http_options
-        self.config = types.GenerationConfig(
-            candidate_count=1,
-            max_output_tokens=2048,
-            temperature=0.9,
         )
     async def text_to_speech(self, text):
         try:
-            # Using standard generate_content instead of experimental live API
-            response = await self.client.generate_content_async(
-                contents=[types.Content(parts=[types.Part(text=text)])],
-                generation_config=self.config
-            )
-            # For actual TTS, you would use the text response with a TTS service
-            # This is a placeholder for the actual audio generation
-            text_response = response.text
-            # Generate synthetic audio (replace with actual TTS API call)
-            duration = min(max(len(text_response) * 0.1, 10)  # Max 10 seconds
-            t = np.linspace(0, duration, int(SAMPLE_RATE * duration), False)
-            audio_data = np.sin(2 * np.pi * 220 * t) * 0.5  # Simple sine wave
-            # Convert to WAV bytes for Gradio
-            with io.BytesIO() as wav_buffer:
-                sf.write(wav_buffer, audio_data, SAMPLE_RATE, format='WAV')
-                return wav_buffer.getvalue(), text_response
         except Exception as e:
-            return None, f"Error: {str(e)}"
 def create_interface():
     tts_engine = None
@@ -61,11 +74,13 @@ def create_interface():
         if not tts_engine:
             raise gr.Error("Please initialize the TTS first")
-        audio_data, message = await tts_engine.text_to_speech(text)
-        if audio_data:
-            return (SAMPLE_RATE, audio_data), message
-        return None, message
     with gr.Blocks(title="Gemini TTS") as app:
         gr.Markdown("# 🎤 Gemini Text-to-Speech")
@@ -90,7 +105,7 @@ def create_interface():
             generate_btn = gr.Button("Generate Speech")
         audio_output = gr.Audio(label="Output Audio")
-        text_output = gr.Textbox(label="Response Message", interactive=False)
         generate_btn.click(
             generate_speech,

 # Configuration
 SAMPLE_RATE = 24000
+MODEL = "gemini-2.0-flash-exp"  # Correct model name
 class GeminiTTS:
     def __init__(self, api_key):
         if not api_key:
             raise ValueError("API key cannot be empty")
+        self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
+        self.config = types.LiveConnectConfig(
+            response_modalities=["AUDIO"],
+            speech_config=types.SpeechConfig(
+                voice_config=types.VoiceConfig(
+                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
+                )
+            ),
+            system_instruction=types.Content(
+                parts=[types.Part.from_text(text="Speak exactly what the user says")],
+                role="user"
+            ),
         )
     async def text_to_speech(self, text):
         try:
+            async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
+                await session.send(input=text or " ", end_of_turn=True)
+                async for response in session.receive():
+                    if audio_data := response.data:
+                        # Convert to numpy array and normalize
+                        audio_array = np.frombuffer(audio_data, dtype=np.float32)
+                        # Handle empty/quiet audio
+                        if audio_array.size == 0:
+                            audio_array = np.zeros(int(SAMPLE_RATE * 0.5))  # 0.5s of silence
+                        # Normalize audio
+                        max_val = np.max(np.abs(audio_array))
+                        if max_val > 0:
+                            audio_array = audio_array / max_val
+                        # Convert to WAV bytes for Gradio
+                        with io.BytesIO() as wav_buffer:
+                            sf.write(wav_buffer, audio_array, SAMPLE_RATE, format='WAV')
+                            return (SAMPLE_RATE, wav_buffer.getvalue())
+                    if text_response := response.text:
+                        return text_response
+                return None
         except Exception as e:
+            return f"Error: {str(e)}"
 def create_interface():
     tts_engine = None
         if not tts_engine:
             raise gr.Error("Please initialize the TTS first")
+        result = await tts_engine.text_to_speech(text)
+        if isinstance(result, str):
+            return None, result  # Return error message
+        elif result:
+            return result, ""  # Return audio and empty message
+        return None, "No response received"
     with gr.Blocks(title="Gemini TTS") as app:
         gr.Markdown("# 🎤 Gemini Text-to-Speech")
             generate_btn = gr.Button("Generate Speech")
         audio_output = gr.Audio(label="Output Audio")
+        text_output = gr.Textbox(label="Messages", interactive=False)
         generate_btn.click(
             generate_speech,