Update app.py
Browse files
app.py
CHANGED
@@ -1,23 +1,20 @@
|
|
1 |
import gradio as gr
|
2 |
import asyncio
|
3 |
-
import base64
|
4 |
-
import io
|
5 |
import numpy as np
|
6 |
from google import genai
|
7 |
from google.genai import types
|
8 |
import soundfile as sf
|
|
|
9 |
|
10 |
# Configuration
|
11 |
SAMPLE_RATE = 24000
|
12 |
-
MODEL = "models/gemini-2.0-flash
|
13 |
|
14 |
class GeminiTTS:
|
15 |
def __init__(self, api_key):
|
16 |
if not api_key:
|
17 |
raise ValueError("API key cannot be empty")
|
18 |
self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
|
19 |
-
self.session = None
|
20 |
-
|
21 |
self.config = types.LiveConnectConfig(
|
22 |
response_modalities=["audio"],
|
23 |
speech_config=types.SpeechConfig(
|
@@ -26,94 +23,99 @@ class GeminiTTS:
|
|
26 |
)
|
27 |
),
|
28 |
system_instruction=types.Content(
|
29 |
-
parts=[types.Part.from_text(text="
|
30 |
role="user"
|
31 |
),
|
32 |
)
|
33 |
|
34 |
-
async def
|
35 |
try:
|
36 |
async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
|
37 |
-
await session.send(input=text or "
|
38 |
|
39 |
-
# Get response
|
40 |
turn = session.receive()
|
41 |
async for response in turn:
|
42 |
-
if
|
43 |
-
# Convert to
|
44 |
-
|
|
|
|
|
|
|
|
|
45 |
|
46 |
# Normalize audio to prevent processing warnings
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
audio_data = audio_data / max_val
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
|
|
55 |
|
56 |
-
return
|
57 |
except Exception as e:
|
58 |
return f"Error: {str(e)}"
|
59 |
|
60 |
-
def
|
61 |
-
|
62 |
|
63 |
-
def
|
64 |
-
nonlocal
|
65 |
try:
|
66 |
-
|
67 |
-
return "
|
68 |
except Exception as e:
|
69 |
-
return f"Initialization Failed: {str(e)}"
|
70 |
|
71 |
-
async def
|
72 |
-
if not
|
73 |
-
raise gr.Error("Please initialize the TTS
|
74 |
|
75 |
-
result = await
|
76 |
|
77 |
-
if isinstance(result,
|
78 |
-
#
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
return result
|
83 |
|
84 |
-
with gr.Blocks(title="Gemini TTS
|
85 |
-
gr.Markdown("# π€ Gemini Text-to-Speech
|
86 |
|
87 |
with gr.Row():
|
88 |
api_key = gr.Textbox(
|
89 |
-
label="
|
90 |
type="password",
|
91 |
-
placeholder="Enter your Gemini API key
|
92 |
)
|
93 |
-
init_btn = gr.Button("Initialize
|
94 |
|
95 |
-
init_status = gr.Textbox(label="
|
96 |
-
init_btn.click(
|
97 |
|
98 |
with gr.Group():
|
99 |
text_input = gr.Textbox(
|
100 |
-
label="
|
101 |
lines=3,
|
102 |
-
placeholder="Type something to
|
103 |
)
|
104 |
generate_btn = gr.Button("Generate Speech")
|
105 |
|
106 |
-
audio_output = gr.Audio(label="
|
107 |
-
text_output = gr.Textbox(label="
|
108 |
|
109 |
generate_btn.click(
|
110 |
-
|
111 |
inputs=text_input,
|
112 |
outputs=[audio_output, text_output]
|
113 |
)
|
114 |
|
115 |
-
return
|
116 |
|
117 |
if __name__ == "__main__":
|
118 |
-
|
119 |
-
|
|
|
1 |
import gradio as gr
|
2 |
import asyncio
|
|
|
|
|
3 |
import numpy as np
|
4 |
from google import genai
|
5 |
from google.genai import types
|
6 |
import soundfile as sf
|
7 |
+
import io
|
8 |
|
9 |
# Configuration
# Sample rate in Hz used when writing the synthesized audio to WAV and when
# handing audio to the Gradio output component.
SAMPLE_RATE = 24000
# Model identifier passed to the live-connect call in GeminiTTS.text_to_speech.
MODEL = "models/gemini-2.0-flash"
|
12 |
|
13 |
class GeminiTTS:
|
14 |
def __init__(self, api_key):
|
15 |
if not api_key:
|
16 |
raise ValueError("API key cannot be empty")
|
17 |
self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
|
|
|
|
|
18 |
self.config = types.LiveConnectConfig(
|
19 |
response_modalities=["audio"],
|
20 |
speech_config=types.SpeechConfig(
|
|
|
23 |
)
|
24 |
),
|
25 |
system_instruction=types.Content(
|
26 |
+
parts=[types.Part.from_text(text="Speak exactly what the user says")],
|
27 |
role="user"
|
28 |
),
|
29 |
)
|
30 |
|
31 |
+
async def text_to_speech(self, text):
    """Speak *text* through a Gemini live session and return the result.

    A fresh live session is opened for every call. ``text`` is sent as one
    complete turn (a single space is substituted for empty input) and the
    whole reply turn is consumed before anything is returned, so the audio
    is not cut off after the first streamed chunk.

    Args:
        text: The text to be spoken. May be empty or ``None``.

    Returns:
        bytes: A complete 16-bit PCM WAV file (at ``SAMPLE_RATE``) when the
            model produced audio.
        str: The model's text reply when it produced text but no audio, or
            a message of the form ``"Error: ..."`` when anything failed.
        None: When the turn ended with neither audio nor text.
    """
    try:
        pcm_chunks = []
        text_parts = []

        async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
            await session.send(input=text or " ", end_of_turn=True)

            # Audio arrives as a stream of chunks; gather all of them
            # instead of returning on the first one.
            async for response in session.receive():
                if chunk := response.data:
                    pcm_chunks.append(chunk)
                if piece := response.text:
                    text_parts.append(piece)

        if pcm_chunks:
            raw_pcm = b"".join(pcm_chunks)
            # The Live API streams raw little-endian 16-bit PCM, not
            # float32. Drop a dangling odd byte so frombuffer cannot fail
            # on a partial sample.
            raw_pcm = raw_pcm[: len(raw_pcm) - (len(raw_pcm) % 2)]
            samples = np.frombuffer(raw_pcm, dtype="<i2")

            if samples.size:
                # Wrap the PCM samples in a WAV container for the caller.
                with io.BytesIO() as wav_buffer:
                    sf.write(
                        wav_buffer,
                        samples,
                        SAMPLE_RATE,
                        format="WAV",
                        subtype="PCM_16",
                    )
                    return wav_buffer.getvalue()

        if text_parts:
            return "".join(text_parts)

        return None
    except Exception as e:
        # Callers distinguish failures by the str return type, so errors
        # are reported as a message rather than raised.
        return f"Error: {str(e)}"
|
62 |
|
63 |
+
def create_interface():
    """Build and return the Gradio Blocks app for the TTS demo.

    The UI has an API-key box with an "Initialize" button, a text box with
    a "Generate Speech" button, an audio player and a read-only message box.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """
    # NOTE(review): the engine lives in this closure, so it is shared by
    # every browser session connected to the same app instance.
    tts_engine = None

    def init_engine(api_key):
        """Create the TTS engine from *api_key* and return a status line."""
        nonlocal tts_engine
        try:
            tts_engine = GeminiTTS(api_key)
            return "✅ TTS Initialized Successfully"
        except Exception as e:
            return f"❌ Initialization Failed: {str(e)}"

    async def generate_speech(text):
        """Return ``(audio, message)`` for the two output components.

        Raises:
            gr.Error: If the engine has not been initialized yet.
        """
        if not tts_engine:
            raise gr.Error("Please initialize the TTS first")

        result = await tts_engine.text_to_speech(text)

        if isinstance(result, str):
            # A str is either an error message or a text-only reply.
            return None, result

        if result:
            # The engine returns a WAV *file* as bytes, while the tuple
            # form of gr.Audio needs decoded samples, so unpack the
            # container first.
            try:
                samples, rate = sf.read(io.BytesIO(result), dtype="int16")
            except RuntimeError as e:
                return None, f"Error: {str(e)}"
            return (rate, samples), ""

        return None, "No response received"

    with gr.Blocks(title="Gemini TTS") as app:
        gr.Markdown("# 🎤 Gemini Text-to-Speech")

        with gr.Row():
            api_key = gr.Textbox(
                label="API Key",
                type="password",
                placeholder="Enter your Gemini API key"
            )
            init_btn = gr.Button("Initialize")

        init_status = gr.Textbox(label="Status", interactive=False)
        init_btn.click(init_engine, inputs=api_key, outputs=init_status)

        with gr.Group():
            text_input = gr.Textbox(
                label="Input Text",
                lines=3,
                placeholder="Type something to speak..."
            )
            generate_btn = gr.Button("Generate Speech")

        audio_output = gr.Audio(label="Output Audio")
        text_output = gr.Textbox(label="Messages", interactive=False)

        generate_btn.click(
            generate_speech,
            inputs=text_input,
            outputs=[audio_output, text_output]
        )

    return app
|
118 |
|
119 |
if __name__ == "__main__":
    # Build the UI, then serve it on every network interface at port 7860.
    demo = create_interface()
    demo.launch(server_port=7860, server_name="0.0.0.0")
|