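"""Gradio demo for Darija speech recognition with SpeechBrain.

Loads the speechbrain/asr-wav2vec2-dvoice-darija EncoderASR model and exposes a
simple audio-to-text interface. Uploaded audio is converted to mono and
resampled to 16 kHz before transcription.
"""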
import gradio as gr
import torch
import torchaudio
from speechbrain.pretrained import EncoderASR

# Load the pretrained Darija ASR model (wav2vec2 fine-tuned on DVoice).
# If loading fails, asr_model stays None and transcribe() reports the problem.
asr_model = None
try:
    asr_model = EncoderASR.from_hparams(
        source="speechbrain/asr-wav2vec2-dvoice-darija",
        savedir="tmp_model",
        run_opts={"device": "cpu"}  # Force CPU so the app runs on CPU-only hosts
    )
except Exception as e:
    print(f"Error loading model: {str(e)}")

def transcribe(audio):
    """Transcribe uploaded audio to text using SpeechBrain ASR."""
    if asr_model is None:
        return "Model failed to load; check the server logs for details."
    if audio is None:
        return "No audio file uploaded. Please upload a valid file."

    try:
        # Load audio
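        # torchaudio.load returns a (channels, time_steps) float tensor plus the file's sample rate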
        waveform, sample_rate = torchaudio.load(audio)

        # Convert stereo to mono if needed
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to 16 kHz, the rate the wav2vec2 model expects
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)

        # waveform is already 2D (1, time_steps), so the channel dim doubles as the batch dim.
        # wav_lens holds each item's length relative to the longest in the batch;
        # for a single clip this is simply 1.0.
        wav_lens = torch.tensor([1.0], dtype=torch.float32)

        # transcribe_batch returns (predicted_words, predicted_tokens);
        # take the first hypothesis from the single-item batch
        predicted_words, _ = asr_model.transcribe_batch(waveform, wav_lens)
        return predicted_words[0]

    except Exception as e:
        return f"Error processing audio: {str(e)}"

# Create Gradio Interface
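# type="filepath" makes Gradio pass the uploaded or recorded audio as a path string,
# which is what transcribe() hands to torchaudio.load.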
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Reconnaissance Vocale Darija",
    description="Parlez en Darija et obtenez la transcription."
)

# Launch the app
if __name__ == "__main__":
    iface.launch()