pheodoraa committed on
Commit
facd705
·
verified ·
1 Parent(s): 03bcefe

Fix Error processing audio: start (0) + length (25651) exceeds dimension size (1132).

Browse files
Files changed (1) hide show
  1. app.py +9 -2
app.py CHANGED
@@ -22,13 +22,20 @@ def transcribe(audio):
22
  # Load audio
23
  waveform, sample_rate = torchaudio.load(audio)
24
 
 
 
 
 
25
  # Ensure correct sample rate (16kHz expected by the model)
26
  if sample_rate != 16000:
27
  resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
28
  waveform = resampler(waveform)
29
 
30
- # Compute waveform length in seconds relative to batch size
31
- wav_lens = torch.tensor([waveform.shape[1] / 16000], dtype=torch.float32)
 
 
 
32
 
33
  # Transcribe
34
  transcription = asr_model.transcribe_batch(waveform, wav_lens)
 
22
  # Load audio
23
  waveform, sample_rate = torchaudio.load(audio)
24
 
25
+ # Convert to single-channel (mono) if stereo
26
+ if waveform.shape[0] > 1:
27
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
28
+
29
  # Ensure correct sample rate (16kHz expected by the model)
30
  if sample_rate != 16000:
31
  resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
32
  waveform = resampler(waveform)
33
 
34
+ # Compute waveform length as a relative fraction
35
+ wav_lens = torch.tensor([waveform.shape[1] / waveform.shape[1]], dtype=torch.float32)
36
+
37
+ # Add batch dimension (SpeechBrain expects a batch format)
38
+ waveform = waveform.unsqueeze(0)
39
 
40
  # Transcribe
41
  transcription = asr_model.transcribe_batch(waveform, wav_lens)