nareauow committed
Commit 09e9dae · verified · 1 parent: d3d626b

Update app.py

Files changed (1)
  1. app.py +113 -11
app.py CHANGED
@@ -8,13 +8,45 @@ import gradio as gr
import os
import matplotlib.pyplot as plt
from huggingface_hub import hf_hub_download
+ from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
+ from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from datasets import load_dataset
+ import soundfile as sf

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

+ # Load speech-to-text model
+ try:
+     speech_recognizer = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr").to(device)
+     speech_processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
+     print("Speech recognition model loaded successfully!")
+ except Exception as e:
+     print(f"Error loading speech recognition model: {e}")
+     speech_recognizer = None
+     speech_processor = None
+
+ # Load text-to-speech models
+ try:
+     # Load processor and model
+     tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+     tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
+     tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
+     # Load speaker embeddings
+     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+     speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
+     print("Text-to-speech models loaded successfully!")
+ except Exception as e:
+     print(f"Error loading text-to-speech models: {e}")
+     tts_processor = None
+     tts_model = None
+     tts_vocoder = None
+     speaker_embeddings = None
+
# CNN model
class modele_CNN(nn.Module):
-     def __init__(self, num_classes=8, dropout=0.3):
+     def __init__(self, num_classes=7, dropout=0.3):
        super(modele_CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, 3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
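Note: the SpeechT5 stack loaded above can be smoke-tested outside Gradio before wiring it into the app. A minimal sketch using the same checkpoints and speaker x-vector as this commit ("speecht5_demo.wav" is a placeholder output path):

import soundfile as sf
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# The same 512-dim x-vector the app uses as the speaker identity
embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0)

inputs = processor(text="Hello from SpeechT5.", return_tensors="pt")
# Passing the vocoder makes generate_speech return a waveform instead of a spectrogram
speech = model.generate_speech(inputs["input_ids"], speaker, vocoder=vocoder)
sf.write("speecht5_demo.wav", speech.numpy(), samplerate=16000)  # SpeechT5 outputs 16 kHz audio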
 
@@ -90,14 +122,68 @@ class AudioProcessor:

        return mfcc_tensor

+ # Speech recognition function
+ def recognize_speech(audio_path):
+     if speech_recognizer is None or speech_processor is None:
+         return "Speech recognition model not available"
+
+     try:
+         # Read audio file
+         audio_data, sr = sf.read(audio_path)
+
+         # Resample to 16 kHz if needed
+         if sr != 16000:
+             audio_data = np.interp(
+                 np.linspace(0, len(audio_data), int(16000 * len(audio_data) / sr)),
+                 np.arange(len(audio_data)),
+                 audio_data
+             )
+             sr = 16000
+
+         # Process audio
+         inputs = speech_processor(audio_data, sampling_rate=sr, return_tensors="pt")
+         inputs = {k: v.to(device) for k, v in inputs.items()}
+
+         # Generate transcription
+         generated_ids = speech_recognizer.generate(**inputs)
+         transcription = speech_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+         return transcription
+     except Exception as e:
+         return f"Speech recognition error: {str(e)}"
+
+ # Speech synthesis function
+ def synthesize_speech(text):
+     if tts_processor is None or tts_model is None or tts_vocoder is None or speaker_embeddings is None:
+         return None
+
+     try:
+         # Preprocess text
+         inputs = tts_processor(text=text, return_tensors="pt").to(device)
+
+         # Generate speech with speaker embeddings
+         spectrogram = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings)
+
+         # Convert to waveform
+         with torch.no_grad():
+             speech = tts_vocoder(spectrogram)
+
+         # Convert to numpy array and normalize
+         speech = speech.cpu().numpy()
+         speech = speech / np.max(np.abs(speech))
+
+         return (16000, speech.squeeze())
+     except Exception as e:
+         print(f"Speech synthesis error: {str(e)}")
+         return None
+
# Prediction function
def predict_speaker(audio, model, processor):
    if audio is None:
-         return "Aucun audio détecté.", None
+         return "Aucun audio détecté.", None, None, None

    try:
-         import soundfile as sf
-         audio_data, sr = sf.read(audio)  # read the audio file directly here
+         audio_data, sr = sf.read(audio)
        input_tensor = processor.process_audio(audio_data, sr)

        device = next(model.parameters()).device
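Note: the resampling in recognize_speech is plain linear interpolation via np.interp. That keeps the dependency list short, but it applies no anti-aliasing filter, so downsampling from common rates such as 44.1 kHz can alias. A hedged alternative sketch, assuming scipy were added as a dependency (app.py does not import it today):

from math import gcd

import numpy as np
from scipy.signal import resample_poly

def resample_to_16k(audio_data: np.ndarray, sr: int) -> np.ndarray:
    """Polyphase resampling to 16 kHz; low-pass filters before decimating."""
    if sr == 16000:
        return audio_data
    g = gcd(16000, sr)
    return resample_poly(audio_data, up=16000 // g, down=sr // g)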
 
@@ -116,10 +202,13 @@ def predict_speaker(audio, model, processor):

        probs_dict = {speakers[i]: float(probs) for i, probs in enumerate(probabilities[0].cpu().numpy())}

-         return result, probs_dict
+         # Recognize speech
+         recognized_text = recognize_speech(audio)
+
+         return result, probs_dict, recognized_text, predicted_speaker

    except Exception as e:
-         return f"Erreur : {str(e)}", None
+         return f"Erreur : {str(e)}", None, None, None

# Load model
def load_model(model_id="nareauow/my_speech_recognition", model_filename="model_3.pth"):
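Note: with this hunk every return path of predict_speaker yields a 4-tuple (result, probabilities, recognized text, speaker), matching what the recognize callback below unpacks. A quick contract check, assuming a loaded model and the AudioProcessor instance from app.py:

# The no-audio path must have the same arity as the success path, otherwise
# `res, probs, text, locuteur = predict_speaker(...)` raises ValueError.
out = predict_speaker(None, model, processor)
assert isinstance(out, tuple) and len(out) == 4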
 
@@ -155,10 +244,14 @@ def create_interface():
        with gr.Column():
            result_text = gr.Textbox(label="Résultat")
            plot_output = gr.Plot(label="Confiance par locuteur")
+             recognized_text = gr.Textbox(label="Texte reconnu")
+             audio_output = gr.Audio(label="Synthèse vocale", type="numpy")

        def recognize(audio, selected_model):
-             model = load_model(model_filename=selected_model)  # load the selected model
-             res, probs = predict_speaker(audio, model, processor)
+             model = load_model(model_filename=selected_model)
+             res, probs, text, locuteur = predict_speaker(audio, model, processor)
+
+             # Generate plot
            fig = None
            if probs:
                fig, ax = plt.subplots()
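Note: gr.Audio(type="numpy") expects a (sample_rate, waveform) pair, which is exactly what synthesize_speech returns on success. Illustrative value only:

import numpy as np

# One second of silence in the (sample_rate, ndarray) format that
# gr.Audio(type="numpy") renders and synthesize_speech produces.
synth_audio = (16000, np.zeros(16000, dtype=np.float32))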
 
@@ -167,9 +260,18 @@ def create_interface():
                ax.set_ylabel("Confiance")
                ax.set_xlabel("Locuteurs")
                plt.xticks(rotation=45)
-             return res, fig
+
+             # Generate speech synthesis if text was recognized
+             synth_audio = None
+             if text and "error" not in text.lower():
+                 synth_text = f"{locuteur} said: {text}"
+                 synth_audio = synthesize_speech(synth_text)
+
+             return res, fig, text, synth_audio

-         record_btn.click(fn=recognize, inputs=[audio_input, model_selector], outputs=[result_text, plot_output])
+         record_btn.click(fn=recognize,
+                          inputs=[audio_input, model_selector],
+                          outputs=[result_text, plot_output, recognized_text, audio_output])

        gr.Markdown("""### Comment utiliser ?
        - Choisissez le modèle.
 
@@ -182,4 +284,4 @@ def create_interface():
# Launch
if __name__ == "__main__":
    app = create_interface()
-     app.launch(share=True)
+     app.launch(share=True)