import os
import re

import torch
import soundfile as sf
import gradio as gr
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier

# Define paths and device
model_path = "HAMMALE/speecht5-darija"  # fine-tuned model on the HF Hub
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the TTS model, its processor, and the HiFi-GAN vocoder
processor = SpeechT5Processor.from_pretrained(model_path)
model = SpeechT5ForTextToSpeech.from_pretrained(model_path).to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Load the x-vector speaker encoder (used to compute speaker embeddings)
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir=os.path.join("/tmp", "spkrec-xvect-voxceleb"),
)

# Load pre-computed speaker embeddings; fall back to random vectors (an
# arbitrary, unrealistic voice) when the .pt files are missing
male_embedding = torch.load("male_embedding.pt") if os.path.exists("male_embedding.pt") else torch.randn(1, 512)
female_embedding = torch.load("female_embedding.pt") if os.path.exists("female_embedding.pt") else torch.randn(1, 512)
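
# Optional helper (a sketch, not part of the original pipeline): derive an
# x-vector from a reference clip with the speaker model loaded above, instead
# of shipping pre-computed .pt files. The function name and the wav path are
# illustrative; any ~16 kHz mono recording of the target speaker should work.
def compute_speaker_embedding(wav_path):
    import torchaudio
    signal, sr = torchaudio.load(wav_path)
    if sr != 16000:
        # The x-vector model expects 16 kHz input
        signal = torchaudio.functional.resample(signal, sr, 16000)
    with torch.no_grad():
        embedding = speaker_model.encode_batch(signal)  # shape (1, 1, 512)
        embedding = torch.nn.functional.normalize(embedding, dim=2)
    return embedding.squeeze(0)  # shape (1, 512), as SpeechT5 expects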

# Text normalization function
def normalize_text(text):
    """Normalize text for TTS processing."""
    text = text.lower()
    # Keep word characters, whitespace, apostrophes, and Arabic script (U+0600-U+06FF)
    text = re.sub(r'[^\w\s\'\u0600-\u06FF]', '', text)
    # Collapse runs of whitespace into single spaces
    text = ' '.join(text.split())
    return text
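
# For example (expected behavior of the regex above):
#     normalize_text("Salam! Kifach nta?")  ->  "salam kifach nta"
# Note that \w already matches Arabic letters under Python 3's Unicode-aware
# regex, so the explicit \u0600-\u06FF range is a belt-and-braces addition.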

# Function to synthesize speech
def synthesize_speech(text, voice_type="male", speed=1.0):
    """Generate speech from text using the specified voice type."""
    try:
        # Select the speaker embedding for the requested voice
        if voice_type == "male":
            speaker_embeddings = male_embedding.to(device)
        else:
            speaker_embeddings = female_embedding.to(device)
        # Normalize and tokenize the input text
        normalized_text = normalize_text(text)
        inputs = processor(text=normalized_text, return_tensors="pt").to(device)
        # Generate a mel spectrogram and vocode it to a waveform
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings,
                vocoder=vocoder,
            )
        speech_np = speech.cpu().numpy()
        # Adjust speed by plain resampling. This is a simple approach that also
        # shifts pitch; for production, use a proper time-stretching library.
        if speed != 1.0:
            from scipy import signal
            new_length = int(len(speech_np) / speed)
            speech_np = signal.resample(speech_np, new_length)
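            # Pitch-preserving alternative (a sketch, assuming the optional
            # librosa dependency is installed):
            #     import librosa
            #     speech_np = librosa.effects.time_stretch(speech_np, rate=speed)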
        # Write the waveform to a temporary file at the model's 16 kHz rate
        output_file = "output_speech.wav"
        sf.write(output_file, speech_np, 16000)
        return output_file, None
    except Exception as e:
        return None, f"Error generating speech: {e}"
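
# Quick sanity check outside the UI (a sketch; writes output_speech.wav):
#     wav_path, err = synthesize_speech("Salam, kifach nta?", voice_type="female")
#     print(wav_path or err)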

# Custom CSS for a full-screen, modern design
custom_css = """
body, html {
    margin: 0;
    padding: 0;
    height: 100%;
    width: 100%;
    overflow-x: hidden;
}
.gradio-container {
    font-family: 'Montserrat', 'Arial', sans-serif !important;
    height: 100vh;
    width: 100vw;
    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
    display: flex;
    flex-direction: column;
    padding: 0;
    margin: 0;
    overflow-y: auto;
}
.main-header {
    background: linear-gradient(90deg, #d32f2f, #1976d2);
    color: white;
    padding: 2em;
    text-align: center;
    box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15);
    border-bottom: 4px solid #ffffff33;
}
.main-header h1 {
    font-size: 2.8em;
    margin: 0;
    font-weight: 700;
    letter-spacing: 1px;
    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
}
.main-header p {
    font-size: 1.2em;
    margin: 0.5em 0 0;
    opacity: 0.9;
    font-weight: 300;
}
.container {
    max-width: 1200px;
    margin: 2em auto;
    padding: 0 1em;
    flex: 1;
}
.row {
    display: flex;
    gap: 2em;
    background: white;
    border-radius: 15px;
    padding: 2em;
    box-shadow: 0 8px 16px rgba(0, 0, 0, 0.1);
    margin-bottom: 2em;
}
.column {
    flex: 1;
    padding: 1em;
}
.info-box {
    background: #fef6f6;
    border-left: 5px solid #d32f2f;
    padding: 1.5em;
    border-radius: 8px;
    margin-bottom: 1.5em;
    font-size: 1em;
    line-height: 1.6;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
}
.textbox textarea {
    border: 2px solid #e0e0e0 !important;
    border-radius: 10px !important;
    padding: 1em !important;
    font-size: 1.1em !important;
    transition: border-color 0.3s ease !important;
}
.textbox textarea:focus {
    border-color: #d32f2f !important;
    box-shadow: 0 0 8px rgba(211, 47, 47, 0.2) !important;
}
.radio {
    display: flex;
    justify-content: center;
    gap: 1.5em;
    margin: 1em 0;
}
.radio label {
    background: #f5f5f5;
    padding: 0.8em 1.5em;
    border-radius: 25px;
    border: 2px solid #e0e0e0;
    cursor: pointer;
    transition: all 0.3s ease;
}
.radio input:checked + label {
    background: #d32f2f;
    color: white;
    border-color: #d32f2f;
    box-shadow: 0 4px 8px rgba(211, 47, 47, 0.2);
}
.slider {
    margin: 1.5em 0;
}
.slider input {
    accent-color: #d32f2f !important;
}
.button {
    background: linear-gradient(90deg, #d32f2f, #1976d2) !important;
    color: white !important;
    padding: 1em 2em !important;
    border-radius: 25px !important;
    border: none !important;
    font-size: 1.1em !important;
    font-weight: 600 !important;
    transition: transform 0.2s ease, box-shadow 0.3s ease !important;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15) !important;
}
.button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 6px 12px rgba(0, 0, 0, 0.25) !important;
}
.audio {
    margin-top: 1em;
}
.audio audio {
    width: 100%;
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
.example-header {
    font-weight: 600;
    color: #d32f2f;
    margin: 1.5em 0 0.5em;
    font-size: 1.2em;
}
ul {
    padding-left: 1.5em;
    color: #333;
}
li {
    margin: 0.5em 0;
    font-size: 1em;
}
.examples {
    margin-top: 1.5em;
    padding: 1em;
    background: #f9f9f9;
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.05);
}
footer {
    text-align: center;
    padding: 1.5em;
    background: #ffffff;
    color: #666;
    font-size: 0.95em;
    border-top: 1px solid #e0e0e0;
    margin-top: auto;
}
.flag-icon {
    width: 30px;
    height: 30px;
    vertical-align: middle;
    margin-right: 10px;
}
"""

# Create the Gradio interface with the enhanced design
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(
        """
        <div class="main-header">
            <h1>🇲🇦 Moroccan Darija Text-to-Speech 🎙️</h1>
            <p>Transform your Darija text into lifelike speech with ease</p>
        </div>
        """
    )
    with gr.Row(elem_classes="row"):
        with gr.Column(elem_classes="column"):
            gr.HTML(
                """
                <div class="info-box">
                    <p>Experience high-quality Darija speech synthesis powered by the SpeechT5 model, fine-tuned on the DODa audio dataset. Customize the voice and speed to suit your needs.</p>
                </div>
                """
            )
            text_input = gr.Textbox(
                label="Enter Darija Text",
                placeholder="Kteb chi jomla b darija hna, bhal 'Salam, kifach nta?'...",
                lines=3,
                elem_classes="textbox"
            )
            with gr.Row(elem_classes="radio"):
                voice_type = gr.Radio(
                    ["male", "female"],
                    label="Voice Type",
                    value="male"
                )
            speed = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Speech Speed",
                elem_classes="slider"
            )
            generate_btn = gr.Button("Generate Speech", variant="primary", elem_classes="button")
            gr.HTML(
                """
                <div class="example-header">Try These Phrases:</div>
                <ul>
                    <li>"Ana Nadi Bezzaaf hhh"</li>
                    <li>"Lyoum ajwaa zwina bezzaaf."</li>
                    <li>"Lmaghrib ahssan blad fi l3alam"</li>
                    <li>"Chukran bzzaf 3la lmosanada!"</li>
                </ul>
                """
            )
        with gr.Column(elem_classes="column"):
            audio_output = gr.Audio(label="Generated Speech", elem_classes="audio")
            # Hidden textbox that receives any error message from synthesize_speech
            error_output = gr.Textbox(label="Error (if any)", visible=False)
            gr.Examples(
                examples=[
                    ["Ana Nadi Bezzaaf hhh", "male", 1.0],
                    ["Lyoum ajwaa zwina bezzaaf.", "female", 1.0],
                    ["Lmaghrib ahssan blad fi l3alam", "male", 1.0],
                    ["Filistine horaa mina lbari ila lbarri", "female", 0.8],
                ],
                inputs=[text_input, voice_type, speed],
                outputs=[audio_output, error_output],
                fn=synthesize_speech
            )
    gr.HTML(
        """
        <footer>
            <p>Developed by HAMMALE | Data: DODa Audio Dataset</p>
        </footer>
        """
    )

    # Wire the button to the synthesis function (must be inside the Blocks context)
    generate_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_type, speed],
        outputs=[audio_output, error_output]
    )

# Launch the demo
if __name__ == "__main__":
    demo.launch()