Update app.py

app.py CHANGED
@@ -8,13 +8,45 @@ import gradio as gr
 import os
 import matplotlib.pyplot as plt
 from huggingface_hub import hf_hub_download
+from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
+from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+import soundfile as sf
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")
 
+# Load speech-to-text model
+try:
+    speech_recognizer = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr").to(device)
+    speech_processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
+    print("Speech recognition model loaded successfully!")
+except Exception as e:
+    print(f"Error loading speech recognition model: {e}")
+    speech_recognizer = None
+    speech_processor = None
+
+# Load text-to-speech models
+try:
+    # Load processor and model
+    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
+    tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
+    # Load speaker embeddings
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
+    print("Text-to-speech models loaded successfully!")
+except Exception as e:
+    print(f"Error loading text-to-speech models: {e}")
+    tts_processor = None
+    tts_model = None
+    tts_vocoder = None
+    speaker_embeddings = None
+
 # CNN model
 class modele_CNN(nn.Module):
-    def __init__(self, num_classes=
+    def __init__(self, num_classes=7, dropout=0.3):
         super(modele_CNN, self).__init__()
         self.conv1 = nn.Conv2d(1, 16, 3, padding=1)
         self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
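The `speaker_embeddings` line above pins the voice to row 7306 of `Matthijs/cmu-arctic-xvectors`, the x-vector used in the stock SpeechT5 examples. A minimal sketch of choosing a different voice, assuming only the dataset schema the commit already relies on (the index 0 here is arbitrary and illustrative):

    from datasets import load_dataset
    import torch

    # Each validation row carries a 512-dim "xvector" speaker embedding.
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    print(len(embeddings_dataset))  # how many voices are available

    # SpeechT5's generate_speech expects a batch dimension: shape (1, 512).
    speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
    print(speaker_embeddings.shape)  # torch.Size([1, 512])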
@@ -90,14 +122,68 @@ class AudioProcessor:
 
         return mfcc_tensor
 
+# Speech recognition function
+def recognize_speech(audio_path):
+    if speech_recognizer is None or speech_processor is None:
+        return "Speech recognition model not available"
+
+    try:
+        # Read audio file
+        audio_data, sr = sf.read(audio_path)
+
+        # Resample to 16kHz if needed
+        if sr != 16000:
+            audio_data = np.interp(
+                np.linspace(0, len(audio_data), int(16000 * len(audio_data) / sr)),
+                np.arange(len(audio_data)),
+                audio_data
+            )
+            sr = 16000
+
+        # Process audio
+        inputs = speech_processor(audio_data, sampling_rate=sr, return_tensors="pt")
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        # Generate transcription
+        generated_ids = speech_recognizer.generate(**inputs)
+        transcription = speech_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        return transcription
+    except Exception as e:
+        return f"Speech recognition error: {str(e)}"
+
+# Speech synthesis function
+def synthesize_speech(text):
+    if tts_processor is None or tts_model is None or tts_vocoder is None or speaker_embeddings is None:
+        return None
+
+    try:
+        # Preprocess text
+        inputs = tts_processor(text=text, return_tensors="pt").to(device)
+
+        # Generate speech with speaker embeddings
+        spectrogram = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings)
+
+        # Convert to waveform
+        with torch.no_grad():
+            speech = tts_vocoder(spectrogram)
+
+        # Convert to numpy array and normalize
+        speech = speech.cpu().numpy()
+        speech = speech / np.max(np.abs(speech))
+
+        return (16000, speech.squeeze())
+    except Exception as e:
+        print(f"Speech synthesis error: {str(e)}")
+        return None
+
 # Prediction function
 def predict_speaker(audio, model, processor):
     if audio is None:
-        return "Aucun audio détecté.", None
+        return "Aucun audio détecté.", None, None, None
 
     try:
-
-        audio_data, sr = sf.read(audio)  # <- here you read the audio directly
+        audio_data, sr = sf.read(audio)
         input_tensor = processor.process_audio(audio_data, sr)
 
         device = next(model.parameters()).device
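Taken together, the two new helpers form a transcribe-then-resynthesize round trip: `recognize_speech` returns a transcription (or an error string) and `synthesize_speech` returns a `(16000, waveform)` tuple (or None when its models failed to load). A minimal smoke test under those assumptions; the `sample.wav` path is hypothetical:

    # Hypothetical smoke test, run at module scope once the models are loaded.
    text = recognize_speech("sample.wav")  # transcription or error string
    print(text)

    result = synthesize_speech(text)
    if result is not None:
        rate, waveform = result
        sf.write("roundtrip.wav", waveform, rate)  # write the synthesized audio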
@@ -116,10 +202,13 @@ def predict_speaker(audio, model, processor):
 
         probs_dict = {speakers[i]: float(probs) for i, probs in enumerate(probabilities[0].cpu().numpy())}
 
-        return result, probs_dict
+        # Recognize speech
+        recognized_text = recognize_speech(audio)
+
+        return result, probs_dict, recognized_text, predicted_speaker
 
     except Exception as e:
-        return f"Erreur : {str(e)}", None
+        return f"Erreur : {str(e)}", None, None, None
 
 # Load model
 def load_model(model_id="nareauow/my_speech_recognition", model_filename="model_3.pth"):
@@ -155,10 +244,14 @@ def create_interface():
         with gr.Column():
             result_text = gr.Textbox(label="Résultat")
             plot_output = gr.Plot(label="Confiance par locuteur")
+            recognized_text = gr.Textbox(label="Texte reconnu")
+            audio_output = gr.Audio(label="Synthèse vocale", type="numpy")
 
     def recognize(audio, selected_model):
-        model = load_model(model_filename=selected_model)
-        res, probs = predict_speaker(audio, model, processor)
+        model = load_model(model_filename=selected_model)
+        res, probs, text, locuteur = predict_speaker(audio, model, processor)
+
+        # Generate plot
         fig = None
         if probs:
             fig, ax = plt.subplots()
@@ -167,9 +260,18 @@ def create_interface():
             ax.set_ylabel("Confiance")
             ax.set_xlabel("Locuteurs")
             plt.xticks(rotation=45)
-        return res, fig
+
+        # Generate speech synthesis if text was recognized
+        synth_audio = None
+        if text and "error" not in text.lower():
+            synth_text = f"{locuteur} said : {text}"
+            synth_audio = synthesize_speech(synth_text)
+
+        return res, fig, text, synth_audio
 
-    record_btn.click(fn=recognize,
+    record_btn.click(fn=recognize,
+                     inputs=[audio_input, model_selector],
+                     outputs=[result_text, plot_output, recognized_text, audio_output])
 
     gr.Markdown("""### Comment utiliser ?
     - Choisissez le modèle.
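The four values returned by `recognize` map positionally onto `outputs=[result_text, plot_output, recognized_text, audio_output]`, and `gr.Audio(type="numpy")` accepts exactly the `(sample_rate, ndarray)` tuple that `synthesize_speech` produces. A quick way to sanity-check that wiring without launching the interface, assuming `recognize` is temporarily lifted to module scope and using a hypothetical recording:

    # Hypothetical direct call of the callback, bypassing the Gradio event loop.
    res, fig, text, synth_audio = recognize("sample.wav", "model_3.pth")
    print(res, text)
    if synth_audio is not None:
        rate, waveform = synth_audio
        print(f"synthesized {len(waveform) / rate:.2f}s at {rate} Hz")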
@@ -182,4 +284,4 @@ def create_interface():
 # Launch
 if __name__ == "__main__":
     app = create_interface()
-    app.launch(share=True)
+    app.launch(share=True)