Update app.py
Normalize the audio volume from the microphone
app.py CHANGED
@@ -142,7 +142,14 @@ def transcribe_microphone_stream(audio_chunk, stream_state, language):
     sample_rate, waveform_np = audio_chunk
     if len(waveform_np.shape) > 1:
         waveform_np = waveform_np.mean(axis=1)
-
+
+    # Normalize if needed
+    if waveform_np.dtype != np.float32:
+        waveform_np = waveform_np.astype(np.float32)
+
+    if np.max(np.abs(waveform_np)) > 1.0:
+        waveform_np = waveform_np / np.max(np.abs(waveform_np))
+
     # Resample if needed
     if sample_rate != 16000:
         waveform = torch.from_numpy(waveform_np).float().unsqueeze(0)
@@ -151,6 +158,8 @@ def transcribe_microphone_stream(audio_chunk, stream_state, language):
         waveform_np = waveform.squeeze(0).numpy()
         sample_rate = 16000
 
+    waveform_np = np.clip(waveform_np, -1.0, 1.0)
+
     # Initialize stream if first chunk
     if stream_state is None:
         stream_state = recognizer.create_stream()
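For reference, below is a minimal, self-contained sketch of what the added normalization does, outside the app. It assumes a Gradio-style microphone chunk, i.e. a (sample_rate, numpy array) tuple whose samples arrive as int16 PCM; that dtype and the helper name normalize_chunk are illustrative assumptions and are not part of app.py.

# Illustrative sketch only -- mirrors the normalization added in this commit.
# Assumption: the chunk is a (sample_rate, np.ndarray) tuple with int16 samples;
# `normalize_chunk` is a hypothetical helper, not a function in app.py.
import numpy as np

def normalize_chunk(audio_chunk):
    sample_rate, waveform_np = audio_chunk
    if len(waveform_np.shape) > 1:            # stereo -> mono
        waveform_np = waveform_np.mean(axis=1)
    if waveform_np.dtype != np.float32:       # e.g. int16 PCM from the browser mic
        waveform_np = waveform_np.astype(np.float32)
    peak = np.max(np.abs(waveform_np))
    if peak > 1.0:                            # integer-range samples -> peak-normalized [-1, 1]
        waveform_np = waveform_np / peak
    return sample_rate, waveform_np

# Example: a fake 48 kHz int16 chunk comes out as float32 within [-1, 1].
fake_chunk = (48000, np.random.randint(-20000, 20000, size=4800, dtype=np.int16))
sr, wav = normalize_chunk(fake_chunk)
print(wav.dtype, float(wav.min()), float(wav.max()))

The np.clip(waveform_np, -1.0, 1.0) added after the 16 kHz resampling step presumably guards against filter overshoot: resampling interpolation can push a few samples slightly past the original peaks, and the clip keeps everything the recognizer sees inside [-1, 1].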