Athspi committed on
Commit
f3ad9e5
·
verified ·
1 Parent(s): e8ee7fe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -32
app.py CHANGED
@@ -8,43 +8,56 @@ import io
8
 
9
  # Configuration
10
  SAMPLE_RATE = 24000
11
- MODEL = "models/gemini-2.0-flash-exp" # Updated to a stable model version
12
 
13
  class GeminiTTS:
14
  def __init__(self, api_key):
15
  if not api_key:
16
  raise ValueError("API key cannot be empty")
17
- self.client = genai.Client(api_key=api_key) # Removed experimental http_options
18
- self.config = types.GenerationConfig(
19
- candidate_count=1,
20
- max_output_tokens=2048,
21
- temperature=0.9,
 
 
 
 
 
 
 
22
  )
23
 
24
  async def text_to_speech(self, text):
25
  try:
26
- # Using standard generate_content instead of experimental live API
27
- response = await self.client.generate_content_async(
28
- contents=[types.Content(parts=[types.Part(text=text)])],
29
- generation_config=self.config
30
- )
31
-
32
- # For actual TTS, you would use the text response with a TTS service
33
- # This is a placeholder for the actual audio generation
34
- text_response = response.text
35
-
36
- # Generate synthetic audio (replace with actual TTS API call)
37
- duration = min(max(len(text_response) * 0.1, 10) # Max 10 seconds
38
- t = np.linspace(0, duration, int(SAMPLE_RATE * duration), False)
39
- audio_data = np.sin(2 * np.pi * 220 * t) * 0.5 # Simple sine wave
40
-
41
- # Convert to WAV bytes for Gradio
42
- with io.BytesIO() as wav_buffer:
43
- sf.write(wav_buffer, audio_data, SAMPLE_RATE, format='WAV')
44
- return wav_buffer.getvalue(), text_response
45
-
 
 
 
 
 
 
46
  except Exception as e:
47
- return None, f"Error: {str(e)}"
48
 
49
  def create_interface():
50
  tts_engine = None
@@ -61,11 +74,13 @@ def create_interface():
61
  if not tts_engine:
62
  raise gr.Error("Please initialize the TTS first")
63
 
64
- audio_data, message = await tts_engine.text_to_speech(text)
65
 
66
- if audio_data:
67
- return (SAMPLE_RATE, audio_data), message
68
- return None, message
 
 
69
 
70
  with gr.Blocks(title="Gemini TTS") as app:
71
  gr.Markdown("# 🎤 Gemini Text-to-Speech")
@@ -90,7 +105,7 @@ def create_interface():
90
  generate_btn = gr.Button("Generate Speech")
91
 
92
  audio_output = gr.Audio(label="Output Audio")
93
- text_output = gr.Textbox(label="Response Message", interactive=False)
94
 
95
  generate_btn.click(
96
  generate_speech,
 
8
 
9
  # Configuration
10
  SAMPLE_RATE = 24000
11
+ MODEL = "gemini-2.0-flash-exp" # Correct model name
12
 
13
  class GeminiTTS:
14
  def __init__(self, api_key):
15
  if not api_key:
16
  raise ValueError("API key cannot be empty")
17
+ self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
18
+ self.config = types.LiveConnectConfig(
19
+ response_modalities=["AUDIO"],
20
+ speech_config=types.SpeechConfig(
21
+ voice_config=types.VoiceConfig(
22
+ prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
23
+ )
24
+ ),
25
+ system_instruction=types.Content(
26
+ parts=[types.Part.from_text(text="Speak exactly what the user says")],
27
+ role="user"
28
+ ),
29
  )
30
 
31
  async def text_to_speech(self, text):
32
  try:
33
+ async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
34
+ await session.send(input=text or " ", end_of_turn=True)
35
+
36
+ async for response in session.receive():
37
+ if audio_data := response.data:
38
+ # Convert to numpy array and normalize
39
+ audio_array = np.frombuffer(audio_data, dtype=np.float32)
40
+
41
+ # Handle empty/quiet audio
42
+ if audio_array.size == 0:
43
+ audio_array = np.zeros(int(SAMPLE_RATE * 0.5)) # 0.5s of silence
44
+
45
+ # Normalize audio
46
+ max_val = np.max(np.abs(audio_array))
47
+ if max_val > 0:
48
+ audio_array = audio_array / max_val
49
+
50
+ # Convert to WAV bytes for Gradio
51
+ with io.BytesIO() as wav_buffer:
52
+ sf.write(wav_buffer, audio_array, SAMPLE_RATE, format='WAV')
53
+ return (SAMPLE_RATE, wav_buffer.getvalue())
54
+
55
+ if text_response := response.text:
56
+ return text_response
57
+
58
+ return None
59
  except Exception as e:
60
+ return f"Error: {str(e)}"
61
 
62
  def create_interface():
63
  tts_engine = None
 
74
  if not tts_engine:
75
  raise gr.Error("Please initialize the TTS first")
76
 
77
+ result = await tts_engine.text_to_speech(text)
78
 
79
+ if isinstance(result, str):
80
+ return None, result # Return error message
81
+ elif result:
82
+ return result, "" # Return audio and empty message
83
+ return None, "No response received"
84
 
85
  with gr.Blocks(title="Gemini TTS") as app:
86
  gr.Markdown("# 🎤 Gemini Text-to-Speech")
 
105
  generate_btn = gr.Button("Generate Speech")
106
 
107
  audio_output = gr.Audio(label="Output Audio")
108
+ text_output = gr.Textbox(label="Messages", interactive=False)
109
 
110
  generate_btn.click(
111
  generate_speech,