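"""Gradio demo for Darija speech recognition with SpeechBrain.

Loads the speechbrain/asr-wav2vec2-dvoice-darija EncoderASR model and exposes a
simple audio-to-text interface. Uploaded audio is converted to mono and
resampled to 16 kHz before transcription.
"""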
import gradio as gr
import torch
import torchaudio
from speechbrain.pretrained import EncoderASR

# Load the pretrained Darija ASR model (wav2vec2 fine-tuned on DVoice).
# If loading fails, asr_model stays None and transcribe() reports the problem.
asr_model = None
try:
    asr_model = EncoderASR.from_hparams(
        source="speechbrain/asr-wav2vec2-dvoice-darija",
        savedir="tmp_model",
        run_opts={"device": "cpu"}  # Force CPU so the app runs on CPU-only hosts
    )
except Exception as e:
    print(f"Error loading model: {str(e)}")

def transcribe(audio):
    """Transcribe uploaded audio to text using SpeechBrain ASR."""
    if asr_model is None:
        return "Model failed to load; check the server logs for details."
    if audio is None:
        return "No audio file uploaded. Please upload a valid file."

    try:
        # Load audio
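        # torchaudio.load returns a (channels, time_steps) float tensor plus the file's sample rate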
        waveform, sample_rate = torchaudio.load(audio)

        # Convert stereo to mono if needed
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to 16 kHz, the rate the wav2vec2 model expects
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)

        # waveform is already 2D (1, time_steps), so the channel dim doubles as the batch dim.
        # wav_lens holds each item's length relative to the longest in the batch;
        # for a single clip this is simply 1.0.
        wav_lens = torch.tensor([1.0], dtype=torch.float32)

        # transcribe_batch returns (predicted_words, predicted_tokens);
        # take the first hypothesis from the single-item batch
        predicted_words, _ = asr_model.transcribe_batch(waveform, wav_lens)
        return predicted_words[0]

    except Exception as e:
        return f"Error processing audio: {str(e)}"

# Create Gradio Interface
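# type="filepath" makes Gradio pass the uploaded or recorded audio as a path string,
# which is what transcribe() hands to torchaudio.load.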
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Reconnaissance Vocale Darija",
    description="Parlez en Darija et obtenez la transcription."
)

# Launch the app
if __name__ == "__main__":
    iface.launch()