Update app.py
Browse files
app.py
CHANGED
@@ -8,7 +8,7 @@ import io
|
|
8 |
|
9 |
# Configuration
|
10 |
SAMPLE_RATE = 24000
|
11 |
-
MODEL = "gemini-2.0-flash-exp" # Correct model name
|
12 |
|
13 |
class GeminiTTS:
|
14 |
def __init__(self, api_key):
|
@@ -35,22 +35,20 @@ class GeminiTTS:
|
|
35 |
|
36 |
async for response in session.receive():
|
37 |
if audio_data := response.data:
|
38 |
-
# Convert to numpy array
|
39 |
audio_array = np.frombuffer(audio_data, dtype=np.float32)
|
40 |
|
41 |
# Handle empty/quiet audio
|
42 |
if audio_array.size == 0:
|
43 |
audio_array = np.zeros(int(SAMPLE_RATE * 0.5)) # 0.5s of silence
|
44 |
|
45 |
-
# Normalize audio
|
46 |
max_val = np.max(np.abs(audio_array))
|
47 |
if max_val > 0:
|
48 |
audio_array = audio_array / max_val
|
49 |
|
50 |
-
# Convert to
|
51 |
-
|
52 |
-
sf.write(wav_buffer, audio_array, SAMPLE_RATE, format='WAV')
|
53 |
-
return (SAMPLE_RATE, wav_buffer.getvalue())
|
54 |
|
55 |
if text_response := response.text:
|
56 |
return text_response
|
@@ -59,6 +57,26 @@ class GeminiTTS:
|
|
59 |
except Exception as e:
|
60 |
return f"Error: {str(e)}"
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
def create_interface():
|
63 |
tts_engine = None
|
64 |
|
@@ -104,7 +122,7 @@ def create_interface():
|
|
104 |
)
|
105 |
generate_btn = gr.Button("Generate Speech")
|
106 |
|
107 |
-
audio_output = gr.Audio(label="Output Audio")
|
108 |
text_output = gr.Textbox(label="Messages", interactive=False)
|
109 |
|
110 |
generate_btn.click(
|
|
|
8 |
|
9 |
# Configuration
|
10 |
SAMPLE_RATE = 24000
|
11 |
+
MODEL = "gemini-2.0-flash-exp" # Correct experimental model name
|
12 |
|
13 |
class GeminiTTS:
|
14 |
def __init__(self, api_key):
|
|
|
35 |
|
36 |
async for response in session.receive():
|
37 |
if audio_data := response.data:
|
38 |
+
# Convert to numpy array
|
39 |
audio_array = np.frombuffer(audio_data, dtype=np.float32)
|
40 |
|
41 |
# Handle empty/quiet audio
|
42 |
if audio_array.size == 0:
|
43 |
audio_array = np.zeros(int(SAMPLE_RATE * 0.5)) # 0.5s of silence
|
44 |
|
45 |
+
# Normalize audio to prevent processing warnings
|
46 |
max_val = np.max(np.abs(audio_array))
|
47 |
if max_val > 0:
|
48 |
audio_array = audio_array / max_val
|
49 |
|
50 |
+
# Convert to proper format for Gradio
|
51 |
+
return self._create_audio_response(audio_array)
|
|
|
|
|
52 |
|
53 |
if text_response := response.text:
|
54 |
return text_response
|
|
|
57 |
except Exception as e:
|
58 |
return f"Error: {str(e)}"
|
59 |
|
60 |
+
def _create_audio_response(self, audio_array):
    """Encode a float waveform as an in-memory 16-bit PCM WAV file.

    Args:
        audio_array: Mono samples as a NumPy float array, nominally in
            the range [-1.0, 1.0]. Samples outside that range are
            clipped instead of being allowed to wrap around when cast
            to int16.

    Returns:
        A ``(SAMPLE_RATE, wav_bytes)`` tuple, where ``wav_bytes`` holds
        the complete WAV file contents.
    """
    # NOTE(review): Gradio's Audio output documents (sample_rate, ndarray)
    # tuples or a file path; confirm the consumer accepts raw WAV bytes here.

    # Clip before scaling: an out-of-range sample would otherwise overflow
    # int16 and wrap to the opposite sign, producing loud clicks.
    pcm_samples = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)

    with io.BytesIO() as wav_buffer:
        with sf.SoundFile(
            wav_buffer,
            mode='w',
            samplerate=SAMPLE_RATE,
            channels=1,
            format='WAV',
            subtype='PCM_16',
        ) as sf_file:
            sf_file.write(pcm_samples)
        # Read the buffer only after the SoundFile is closed (so everything
        # has been flushed into it) and before the BytesIO itself is closed.
        wav_bytes = wav_buffer.getvalue()

    return (SAMPLE_RATE, wav_bytes)
|
79 |
+
|
80 |
def create_interface():
|
81 |
tts_engine = None
|
82 |
|
|
|
122 |
)
|
123 |
generate_btn = gr.Button("Generate Speech")
|
124 |
|
125 |
+
audio_output = gr.Audio(label="Output Audio", type="filepath")
|
126 |
text_output = gr.Textbox(label="Messages", interactive=False)
|
127 |
|
128 |
generate_btn.click(
|