Athspi committed on
Commit
c02bb52
·
verified ·
1 Parent(s): f3ad9e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -8
app.py CHANGED
@@ -8,7 +8,7 @@ import io
8
 
9
  # Configuration
10
  SAMPLE_RATE = 24000
11
- MODEL = "gemini-2.0-flash-exp" # Correct model name
12
 
13
  class GeminiTTS:
14
  def __init__(self, api_key):
@@ -35,22 +35,20 @@ class GeminiTTS:
35
 
36
  async for response in session.receive():
37
  if audio_data := response.data:
38
- # Convert to numpy array and normalize
39
  audio_array = np.frombuffer(audio_data, dtype=np.float32)
40
 
41
  # Handle empty/quiet audio
42
  if audio_array.size == 0:
43
  audio_array = np.zeros(int(SAMPLE_RATE * 0.5)) # 0.5s of silence
44
 
45
- # Normalize audio
46
  max_val = np.max(np.abs(audio_array))
47
  if max_val > 0:
48
  audio_array = audio_array / max_val
49
 
50
- # Convert to WAV bytes for Gradio
51
- with io.BytesIO() as wav_buffer:
52
- sf.write(wav_buffer, audio_array, SAMPLE_RATE, format='WAV')
53
- return (SAMPLE_RATE, wav_buffer.getvalue())
54
 
55
  if text_response := response.text:
56
  return text_response
@@ -59,6 +57,26 @@ class GeminiTTS:
59
  except Exception as e:
60
  return f"Error: {str(e)}"
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  def create_interface():
63
  tts_engine = None
64
 
@@ -104,7 +122,7 @@ def create_interface():
104
  )
105
  generate_btn = gr.Button("Generate Speech")
106
 
107
- audio_output = gr.Audio(label="Output Audio")
108
  text_output = gr.Textbox(label="Messages", interactive=False)
109
 
110
  generate_btn.click(
 
8
 
9
  # Configuration
10
  SAMPLE_RATE = 24000
11
+ MODEL = "gemini-2.0-flash-exp" # Correct experimental model name
12
 
13
  class GeminiTTS:
14
  def __init__(self, api_key):
 
35
 
36
  async for response in session.receive():
37
  if audio_data := response.data:
38
+ # Convert to numpy array
39
  audio_array = np.frombuffer(audio_data, dtype=np.float32)
40
 
41
  # Handle empty/quiet audio
42
  if audio_array.size == 0:
43
  audio_array = np.zeros(int(SAMPLE_RATE * 0.5)) # 0.5s of silence
44
 
45
+ # Normalize audio to prevent processing warnings
46
  max_val = np.max(np.abs(audio_array))
47
  if max_val > 0:
48
  audio_array = audio_array / max_val
49
 
50
+ # Convert to proper format for Gradio
51
+ return self._create_audio_response(audio_array)
 
 
52
 
53
  if text_response := response.text:
54
  return text_response
 
57
  except Exception as e:
58
  return f"Error: {str(e)}"
59
 
60
+ def _create_audio_response(self, audio_array):
61
+ """Create properly formatted audio response for Gradio"""
62
+ # Convert to 16-bit PCM format
63
+ audio_array = (audio_array * 32767).astype(np.int16)
64
+
65
+ # Create WAV file in memory
66
+ with io.BytesIO() as wav_buffer:
67
+ with sf.SoundFile(
68
+ wav_buffer,
69
+ mode='w',
70
+ samplerate=SAMPLE_RATE,
71
+ channels=1,
72
+ format='WAV',
73
+ subtype='PCM_16'
74
+ ) as sf_file:
75
+ sf_file.write(audio_array)
76
+ wav_bytes = wav_buffer.getvalue()
77
+
78
+ return (SAMPLE_RATE, wav_bytes)
79
+
80
  def create_interface():
81
  tts_engine = None
82
 
 
122
  )
123
  generate_btn = gr.Button("Generate Speech")
124
 
125
+ audio_output = gr.Audio(label="Output Audio", type="filepath")
126
  text_output = gr.Textbox(label="Messages", interactive=False)
127
 
128
  generate_btn.click(