Athspi committed on
Commit
a493c8c
·
verified ·
1 Parent(s): af3c122

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -52
app.py CHANGED
@@ -1,23 +1,20 @@
1
  import gradio as gr
2
  import asyncio
3
- import base64
4
- import io
5
  import numpy as np
6
  from google import genai
7
  from google.genai import types
8
  import soundfile as sf
 
9
 
10
  # Configuration
11
  SAMPLE_RATE = 24000
12
- MODEL = "models/gemini-2.0-flash-exp"
13
 
14
  class GeminiTTS:
15
  def __init__(self, api_key):
16
  if not api_key:
17
  raise ValueError("API key cannot be empty")
18
  self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
19
- self.session = None
20
-
21
  self.config = types.LiveConnectConfig(
22
  response_modalities=["audio"],
23
  speech_config=types.SpeechConfig(
@@ -26,94 +23,99 @@ class GeminiTTS:
26
  )
27
  ),
28
  system_instruction=types.Content(
29
- parts=[types.Part.from_text(text="Answer user ask, replay same thing user say no other word explain")],
30
  role="user"
31
  ),
32
  )
33
 
34
async def process_text(self, text):
    """Send ``text`` to a Gemini live session and return the spoken reply.

    A fresh live connection is opened for every call. The text is sent as
    one completed turn, and every audio chunk of the reply is collected
    before decoding so the result holds the whole utterance rather than
    only its first fragment.

    Args:
        text: Text to send. A falsy value is replaced by ``"."``.

    Returns:
        tuple: ``(SAMPLE_RATE, samples)`` where ``samples`` is a float32
            array peak-normalized to [-1, 1], when audio was received.
        str: The model's text reply when no audio arrived, the literal
            ``"No response received"`` when the turn was empty, or an
            ``"Error: ..."`` message if anything raised.
    """
    try:
        async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
            await session.send(input=text or ".", end_of_turn=True)

            # The reply is streamed; gather everything until the turn ends.
            pcm_chunks = []
            text_parts = []
            async for response in session.receive():
                if response.data:
                    pcm_chunks.append(response.data)
                if response.text:
                    text_parts.append(response.text)

            if pcm_chunks:
                pcm = b"".join(pcm_chunks)
                # Drop a dangling byte so the buffer is a whole number of
                # 16-bit samples.
                pcm = pcm[: len(pcm) - len(pcm) % 2]
                # Live API audio output is raw little-endian 16-bit PCM;
                # reading it as float32 yields noise.
                audio_data = np.frombuffer(pcm, dtype="<i2").astype(np.float32) / 32768.0

                # Normalize audio to prevent processing warnings
                if audio_data.size > 0:
                    max_val = np.max(np.abs(audio_data))
                    if max_val > 0:
                        audio_data = audio_data / max_val

                return (SAMPLE_RATE, audio_data)

            if text_parts:
                return "".join(text_parts)

            return "No response received"
    except Exception as e:
        # Callers expect a string describing the failure, not an exception.
        return f"Error: {str(e)}"
59
 
60
def create_gradio_interface():
    """Build the Gradio Blocks UI for the Gemini text-to-speech demo.

    The UI has an API-key row that creates the ``GeminiTTS`` handler and a
    text box whose content is sent to the handler when "Generate Speech"
    is clicked.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    # Created lazily by init_tts once the user supplies an API key.
    tts_handler = None

    def init_tts(api_key):
        """Create the TTS handler and return a status message for the UI."""
        nonlocal tts_handler
        try:
            tts_handler = GeminiTTS(api_key)
            return "Gemini TTS Initialized Successfully!"
        except Exception as e:
            return f"Initialization Failed: {str(e)}"

    async def generate_response(text):
        """Return ``(audio, message)`` for the two output components."""
        if not tts_handler:
            raise gr.Error("Please initialize the TTS system first with your API key")

        result = await tts_handler.process_text(text)

        # The click handler feeds two components, so exactly two values
        # must be returned. Returning the bare result made Gradio unpack
        # the (sample_rate, audio_data) tuple across both outputs.
        if isinstance(result, tuple) and len(result) == 2:
            # Audio response (sample_rate, audio_data)
            return result, ""
        # Text response or error message; note that text_output is hidden.
        return None, result

    with gr.Blocks(title="Gemini TTS Interface") as demo:
        gr.Markdown("# 🎀 Gemini Text-to-Speech Interface")

        with gr.Row():
            api_key = gr.Textbox(
                label="Gemini API Key",
                type="password",
                placeholder="Enter your Gemini API key here"
            )
            init_btn = gr.Button("Initialize TTS")

        init_status = gr.Textbox(label="Initialization Status", interactive=False)
        init_btn.click(init_tts, inputs=api_key, outputs=init_status)

        with gr.Group():
            text_input = gr.Textbox(
                label="Enter Text",
                lines=3,
                placeholder="Type something to convert to speech..."
            )
            generate_btn = gr.Button("Generate Speech")

        audio_output = gr.Audio(label="Generated Speech")
        text_output = gr.Textbox(label="Text Response", visible=False)

        generate_btn.click(
            generate_response,
            inputs=text_input,
            outputs=[audio_output, text_output]
        )

    return demo
116
 
117
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on every network
    # interface at port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo = create_gradio_interface()
    demo.launch(**launch_options)
 
1
  import gradio as gr
2
  import asyncio
 
 
3
  import numpy as np
4
  from google import genai
5
  from google.genai import types
6
  import soundfile as sf
7
+ import io
8
 
9
  # Configuration
10
  SAMPLE_RATE = 24000
11
+ MODEL = "models/gemini-2.0-flash"
12
 
13
  class GeminiTTS:
14
  def __init__(self, api_key):
15
  if not api_key:
16
  raise ValueError("API key cannot be empty")
17
  self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
 
 
18
  self.config = types.LiveConnectConfig(
19
  response_modalities=["audio"],
20
  speech_config=types.SpeechConfig(
 
23
  )
24
  ),
25
  system_instruction=types.Content(
26
+ parts=[types.Part.from_text(text="Speak exactly what the user says")],
27
  role="user"
28
  ),
29
  )
30
 
31
async def text_to_speech(self, text):
    """Synthesize speech for ``text`` through a Gemini live session.

    A fresh live connection is opened for every call. The text is sent as
    one completed turn, and every audio chunk of the reply is collected
    before encoding so the WAV holds the whole utterance rather than only
    its first fragment.

    Args:
        text: Text to speak. A falsy value is replaced by a single space.

    Returns:
        bytes: WAV-encoded audio at ``SAMPLE_RATE`` when audio was received.
        str: The model's text reply when no audio arrived, or an
            ``"Error: ..."`` message if anything raised.
        None: When the turn ended with neither audio nor text.
    """
    try:
        async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
            await session.send(input=text or " ", end_of_turn=True)

            # The reply is streamed; gather everything until the turn ends.
            pcm_chunks = []
            text_parts = []
            async for response in session.receive():
                if response.data:
                    pcm_chunks.append(response.data)
                if response.text:
                    text_parts.append(response.text)

            if pcm_chunks:
                pcm = b"".join(pcm_chunks)
                # Drop a dangling byte so the buffer is a whole number of
                # 16-bit samples.
                pcm = pcm[: len(pcm) - len(pcm) % 2]
                # Live API audio output is raw little-endian 16-bit PCM;
                # reading it as float32 yields noise.
                audio_array = np.frombuffer(pcm, dtype="<i2").astype(np.float32) / 32768.0

                # Handle empty/quiet audio
                if audio_array.size == 0:
                    audio_array = np.zeros(int(SAMPLE_RATE * 0.5))  # 0.5s of silence

                # Normalize audio to prevent processing warnings
                max_val = np.max(np.abs(audio_array))
                if max_val > 0:
                    audio_array = audio_array / max_val

                # Convert to WAV bytes for Gradio
                with io.BytesIO() as wav_buffer:
                    sf.write(wav_buffer, audio_array, SAMPLE_RATE, format='WAV')
                    return wav_buffer.getvalue()

            if text_parts:
                return "".join(text_parts)

            return None
    except Exception as e:
        # Callers expect a string describing the failure, not an exception.
        return f"Error: {str(e)}"
62
 
63
def create_interface():
    """Build the Gradio Blocks UI for the Gemini text-to-speech demo.

    The UI has an API-key row that creates the ``GeminiTTS`` engine and a
    text box whose content is spoken when "Generate Speech" is clicked.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    # Created lazily by init_engine once the user supplies an API key.
    tts_engine = None

    def init_engine(api_key):
        """Create the TTS engine and return a status message for the UI."""
        nonlocal tts_engine
        try:
            tts_engine = GeminiTTS(api_key)
            return "✅ TTS Initialized Successfully"
        except Exception as e:
            return f"❌ Initialization Failed: {str(e)}"

    async def generate_speech(text):
        """Return ``(audio, message)`` for the two output components."""
        if not tts_engine:
            raise gr.Error("Please initialize the TTS first")

        result = await tts_engine.text_to_speech(text)

        if isinstance(result, str):
            return None, result  # Return error message
        if result:
            # text_to_speech returns an encoded WAV file as bytes, but the
            # tuple form of gr.Audio needs (sample_rate, numpy array).
            # Decode the WAV instead of pairing raw bytes with a rate.
            samples, rate = sf.read(io.BytesIO(result), dtype="int16")
            return (rate, samples), ""  # Return audio and empty message
        return None, "No response received"

    with gr.Blocks(title="Gemini TTS") as app:
        gr.Markdown("# 🎀 Gemini Text-to-Speech")

        with gr.Row():
            api_key = gr.Textbox(
                label="API Key",
                type="password",
                placeholder="Enter your Gemini API key"
            )
            init_btn = gr.Button("Initialize")

        init_status = gr.Textbox(label="Status", interactive=False)
        init_btn.click(init_engine, inputs=api_key, outputs=init_status)

        with gr.Group():
            text_input = gr.Textbox(
                label="Input Text",
                lines=3,
                placeholder="Type something to speak..."
            )
            generate_btn = gr.Button("Generate Speech")

        audio_output = gr.Audio(label="Output Audio")
        text_output = gr.Textbox(label="Messages", interactive=False)

        generate_btn.click(
            generate_speech,
            inputs=text_input,
            outputs=[audio_output, text_output]
        )

    return app
118
 
119
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on every network
    # interface at port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    app = create_interface()
    app.launch(**launch_options)