# 🇲🇦 Moroccan Darija Text-to-Speech 🎙️

*Transform your Darija text into lifelike speech with ease*
The snippet below loads the fine-tuned SpeechT5 model from the Hub together with the HiFi-GAN vocoder and the x-vector speaker encoder, then defines the text normalization and synthesis helpers used by the demo:

```python
import torch
import soundfile as sf
import os
import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier

# Define paths and device
model_path = "HAMMALE/speecht5-darija"  # Path to your model on HF Hub
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load models
processor = SpeechT5Processor.from_pretrained(model_path)
model = SpeechT5ForTextToSpeech.from_pretrained(model_path).to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Load speaker embedding model
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir=os.path.join("/tmp", "spkrec-xvect-voxceleb"),
)

# Load pre-computed speaker embeddings (random fallback if the files are missing)
male_embedding = torch.load("male_embedding.pt") if os.path.exists("male_embedding.pt") else torch.randn(1, 512)
female_embedding = torch.load("female_embedding.pt") if os.path.exists("female_embedding.pt") else torch.randn(1, 512)


def normalize_text(text):
    """Normalize text for TTS processing."""
    text = text.lower()
    # Keep word characters, whitespace, apostrophes, and Arabic-range characters
    text = re.sub(r'[^\w\s\'\u0600-\u06FF]', '', text)
    text = ' '.join(text.split())
    return text


def synthesize_speech(text, voice_type="male", speed=1.0):
    """Generate speech from text using the specified voice type."""
    try:
        # Select speaker embedding based on voice type
        if voice_type == "male":
            speaker_embeddings = male_embedding.to(device)
        else:
            speaker_embeddings = female_embedding.to(device)

        # Normalize and tokenize input text
        normalized_text = normalize_text(text)
        inputs = processor(text=normalized_text, return_tensors="pt").to(device)

        # Generate speech
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"], speaker_embeddings, vocoder=vocoder
            )

        # Convert to numpy array and adjust speed if needed
        speech_np = speech.cpu().numpy()

        # Simple speed adjustment via resampling; for production use a proper
        # time-stretching library
        if speed != 1.0:
            from scipy import signal
            new_length = int(len(speech_np) / speed)
            speech_np = signal.resample(speech_np, new_length)

        # Save the audio file (16 kHz output)
        output_file = "output_speech.wav"
        sf.write(output_file, speech_np, 16000)

        return output_file, None

    except Exception as e:
        return None, f"Error generating speech: {str(e)}"
```

The hosted demo wraps this helper in a Gradio app. The custom CSS below styles the full-screen interface, and the `Blocks` layout opens with an HTML header:

```python
import gradio as gr

# Custom CSS for a full-screen, modern design
custom_css = """
body, html { margin: 0; padding: 0; height: 100%; width: 100%; overflow-x: hidden; }
.gradio-container { font-family: 'Montserrat', 'Arial', sans-serif !important; height: 100vh; width: 100vw; background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%); display: flex; flex-direction: column; padding: 0; margin: 0; overflow-y: auto; }
.main-header { background: linear-gradient(90deg, #d32f2f, #1976d2); color: white; padding: 2em; text-align: center; box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15); border-bottom: 4px solid #ffffff33; }
.main-header h1 { font-size: 2.8em; margin: 0; font-weight: 700; letter-spacing: 1px; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2); }
.main-header p { font-size: 1.2em; margin: 0.5em 0 0; opacity: 0.9; font-weight: 300; }
.container { max-width: 1200px; margin: 2em auto; padding: 0 1em; flex: 1; }
.row { display: flex; gap: 2em; background: white; border-radius: 15px; padding: 2em; box-shadow: 0 8px 16px rgba(0, 0, 0, 0.1); margin-bottom: 2em; }
.column { flex: 1; padding: 1em; }
.info-box { background: #fef6f6; border-left: 5px solid #d32f2f; padding: 1.5em; border-radius: 8px; margin-bottom: 1.5em; font-size: 1em; line-height: 1.6; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05); }
.textbox textarea { border: 2px solid #e0e0e0 !important; border-radius: 10px !important; padding: 1em !important; font-size: 1.1em !important; transition: border-color 0.3s ease !important; }
.textbox textarea:focus { border-color: #d32f2f !important; box-shadow: 0 0 8px rgba(211, 47, 47, 0.2) !important; }
.radio { display: flex; justify-content: center; gap: 1.5em; margin: 1em 0; }
.radio label { background: #f5f5f5; padding: 0.8em 1.5em; border-radius: 25px; border: 2px solid #e0e0e0; cursor: pointer; transition: all 0.3s ease; }
.radio input:checked + label { background: #d32f2f; color: white; border-color: #d32f2f; box-shadow: 0 4px 8px rgba(211, 47, 47, 0.2); }
.slider { margin: 1.5em 0; }
.slider input { accent-color: #d32f2f !important; }
.button { background: linear-gradient(90deg, #d32f2f, #1976d2) !important; color: white !important; padding: 1em 2em !important; border-radius: 25px !important; border: none !important; font-size: 1.1em !important; font-weight: 600 !important; transition: transform 0.2s ease, box-shadow 0.3s ease !important; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15) !important; }
.button:hover { transform: translateY(-2px) !important; box-shadow: 0 6px 12px rgba(0, 0, 0, 0.25) !important; }
.audio { margin-top: 1em; }
.audio audio { width: 100%; border-radius: 10px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); }
.example-header { font-weight: 600; color: #d32f2f; margin: 1.5em 0 0.5em; font-size: 1.2em; }
ul { padding-left: 1.5em; color: #333; }
li { margin: 0.5em 0; font-size: 1em; }
.examples { margin-top: 1.5em; padding: 1em; background: #f9f9f9; border-radius: 10px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.05); }
footer { text-align: center; padding: 1.5em; background: #ffffff; color: #666; font-size: 0.95em; border-top: 1px solid #e0e0e0; margin-top: auto; }
.flag-icon { width: 30px; height: 30px; vertical-align: middle; margin-right: 10px; }
"""

# Create Gradio interface with enhanced design
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(
        """
        <div class="main-header">
            <h1>🇲🇦 Moroccan Darija Text-to-Speech 🎙️</h1>
            <p>Transform your Darija text into lifelike speech with ease</p>
        </div>
        """
    )
```
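The x-vector encoder loaded above (`speaker_model`) is only needed when you build your own speaker embeddings; the script expects pre-computed `male_embedding.pt` / `female_embedding.pt` files and falls back to random vectors otherwise. A minimal sketch of how such an embedding could be produced from a reference clip — the file names are placeholders and `torchaudio` is an assumed extra dependency — is:

```python
# Sketch only: compute a (1, 512) x-vector from a reference recording.
import torchaudio

def compute_speaker_embedding(wav_path, out_path):
    waveform, _ = torchaudio.load(wav_path)  # expects a mono 16 kHz clip
    with torch.no_grad():
        emb = speaker_model.encode_batch(waveform.to(device))       # (1, 1, 512)
        emb = torch.nn.functional.normalize(emb, dim=2).squeeze(1)  # (1, 512)
    emb = emb.cpu()
    torch.save(emb, out_path)
    return emb

# e.g. male_embedding = compute_speaker_embedding("male_reference.wav", "male_embedding.pt")
```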
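With the models loaded, the helper can also be called directly from a script, outside the Gradio UI; the Darija sentence below is only an illustrative placeholder:

```python
# Direct call outside the UI; the example sentence is a placeholder
wav_path, error = synthesize_speech("السلام، كيداير؟", voice_type="female", speed=1.0)
if error:
    print(error)
else:
    print(f"Audio written to {wav_path}")  # output_speech.wav, 16 kHz
```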
Experience high-quality Darija speech synthesis powered by the SpeechT5 model, fine-tuned on the DODa audio dataset. Customize the voice and speed to suit your needs.
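In the hosted demo, the voice and speed options are exposed through Gradio components wired to `synthesize_speech`. A minimal sketch of that wiring — the component labels, value ranges, and layout here are illustrative, not necessarily the original app's exact interface — could look like this:

```python
# Illustrative wiring only; the hosted demo's actual layout may differ
with demo:
    text_input = gr.Textbox(label="Darija text", lines=3, elem_classes="textbox")
    voice = gr.Radio(["male", "female"], value="male", label="Voice", elem_classes="radio")
    speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed", elem_classes="slider")
    generate = gr.Button("Generate speech", elem_classes="button")
    audio_out = gr.Audio(label="Generated audio", elem_classes="audio")
    error_out = gr.Textbox(label="Errors")

    generate.click(
        fn=synthesize_speech,
        inputs=[text_input, voice, speed],
        outputs=[audio_out, error_out],
    )

demo.launch()
```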