Fix Error processing audio: start (0) + length (25651) exceeds dimension size (1132).
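The error in the commit title is the typical symptom of handing transcribe_batch a wav_lens value that is not a relative length. SpeechBrain expects wav_lens as fractions in (0, 1], the portion of each (possibly padded) row that is real audio, and scales them back up by the padded length internally; an absolute count overshoots that scaling and produces an out-of-range slice like "start (0) + length (25651) exceeds dimension size (1132)". A minimal illustration of the convention (the batch shapes here are invented for the example, not taken from this app):

import torch

# Two clips padded to a common length of 16000 samples:
# row 0 holds 16000 real samples, row 1 holds 8000.
batch = torch.zeros(2, 16000)

# Correct: relative fractions of each row that is real audio.
wav_lens = torch.tensor([16000 / 16000, 8000 / 16000])  # [1.0, 0.5]

# Incorrect: absolute sample counts. SpeechBrain multiplies wav_lens by the
# padded length, so values like 16000.0 overshoot and trigger slicing errors.
# wav_lens = torch.tensor([16000.0, 8000.0])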
app.py CHANGED

@@ -22,13 +22,20 @@ def transcribe(audio):
     # Load audio
     waveform, sample_rate = torchaudio.load(audio)
 
+    # Convert to single-channel (mono) if stereo
+    if waveform.shape[0] > 1:
+        waveform = torch.mean(waveform, dim=0, keepdim=True)
+
     # Ensure correct sample rate (16kHz expected by the model)
     if sample_rate != 16000:
         resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
         waveform = resampler(waveform)
 
-    # Compute waveform length
-    wav_lens = torch.tensor([waveform.shape[1] /
+    # Compute waveform length as a relative fraction
+    wav_lens = torch.tensor([waveform.shape[1] / waveform.shape[1]], dtype=torch.float32)
+
+    # Add batch dimension (SpeechBrain expects a batch format)
+    waveform = waveform.unsqueeze(0)
 
     # Transcribe
     transcription = asr_model.transcribe_batch(waveform, wav_lens)
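For context, a runnable sketch of the patched transcribe function, assuming asr_model is a SpeechBrain EncoderDecoderASR; the model source below is an assumption, since the commit does not show the loading code. One deliberate departure from the diff: after the mono step the waveform already has shape (1, time), which transcribe_batch accepts as a batch of one, so the extra unsqueeze(0) from the diff (which would yield (1, 1, time)) is dropped here.

import torch
import torchaudio
# On older SpeechBrain releases: from speechbrain.pretrained import EncoderDecoderASR
from speechbrain.inference.ASR import EncoderDecoderASR

# Assumed model; the commit does not show how asr_model is created.
asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-crdnn-rnnlm-librispeech",
    savedir="pretrained_models/asr-crdnn-rnnlm-librispeech",
)

def transcribe(audio):
    # Load audio as (channels, time)
    waveform, sample_rate = torchaudio.load(audio)

    # Convert to single-channel (mono) if stereo
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample to the 16 kHz the model expects
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Relative length: a single, unpadded clip is simply 1.0
    wav_lens = torch.tensor([1.0], dtype=torch.float32)

    # waveform is already (batch=1, time); transcribe_batch returns
    # (predicted_words, predicted_tokens)
    predicted_words, _ = asr_model.transcribe_batch(waveform, wav_lens)
    return predicted_words[0]

Since waveform.shape[1] / waveform.shape[1] is always 1.0 for a single unpadded clip, the sketch writes the constant directly, which makes the intent of the relative-length convention easier to see.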