Update app.py

app.py CHANGED
@@ -8,13 +8,45 @@ import gradio as gr
 import os
 import matplotlib.pyplot as plt
 from huggingface_hub import hf_hub_download
+from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
+from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+import soundfile as sf
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")
 
+# Load speech-to-text model
+try:
+    speech_recognizer = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr").to(device)
+    speech_processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
+    print("Speech recognition model loaded successfully!")
+except Exception as e:
+    print(f"Error loading speech recognition model: {e}")
+    speech_recognizer = None
+    speech_processor = None
+
+# Load text-to-speech models
+try:
+    # Load processor and model
+    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
+    tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
+    # Load speaker embeddings
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
+    print("Text-to-speech models loaded successfully!")
+except Exception as e:
+    print(f"Error loading text-to-speech models: {e}")
+    tts_processor = None
+    tts_model = None
+    tts_vocoder = None
+    speaker_embeddings = None
+
 # CNN model
 class modele_CNN(nn.Module):
-    def __init__(self, num_classes=
+    def __init__(self, num_classes=7, dropout=0.3):
         super(modele_CNN, self).__init__()
         self.conv1 = nn.Conv2d(1, 16, 3, padding=1)
         self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
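The `speaker_embeddings` line above pins the voice to row 7306 of `Matthijs/cmu-arctic-xvectors`, the x-vector used in the stock SpeechT5 examples. A minimal sketch of choosing a different voice, assuming only the dataset schema the commit already relies on (the index 0 here is arbitrary and illustrative):

    from datasets import load_dataset
    import torch

    # Each validation row carries a 512-dim "xvector" speaker embedding.
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    print(len(embeddings_dataset))  # how many voices are available

    # SpeechT5's generate_speech expects a batch dimension: shape (1, 512).
    speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
    print(speaker_embeddings.shape)  # torch.Size([1, 512])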
@@ -90,14 +122,68 @@ class AudioProcessor:
 
         return mfcc_tensor
 
+# Speech recognition function
+def recognize_speech(audio_path):
+    if speech_recognizer is None or speech_processor is None:
+        return "Speech recognition model not available"
+
+    try:
+        # Read audio file
+        audio_data, sr = sf.read(audio_path)
+
+        # Resample to 16kHz if needed
+        if sr != 16000:
+            audio_data = np.interp(
+                np.linspace(0, len(audio_data), int(16000 * len(audio_data) / sr)),
+                np.arange(len(audio_data)),
+                audio_data
+            )
+            sr = 16000
+
+        # Process audio
+        inputs = speech_processor(audio_data, sampling_rate=sr, return_tensors="pt")
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        # Generate transcription
+        generated_ids = speech_recognizer.generate(**inputs)
+        transcription = speech_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        return transcription
+    except Exception as e:
+        return f"Speech recognition error: {str(e)}"
+
+# Speech synthesis function
+def synthesize_speech(text):
+    if tts_processor is None or tts_model is None or tts_vocoder is None or speaker_embeddings is None:
+        return None
+
+    try:
+        # Preprocess text
+        inputs = tts_processor(text=text, return_tensors="pt").to(device)
+
+        # Generate speech with speaker embeddings
+        spectrogram = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings)
+
+        # Convert to waveform
+        with torch.no_grad():
+            speech = tts_vocoder(spectrogram)
+
+        # Convert to numpy array and normalize
+        speech = speech.cpu().numpy()
+        speech = speech / np.max(np.abs(speech))
+
+        return (16000, speech.squeeze())
+    except Exception as e:
+        print(f"Speech synthesis error: {str(e)}")
+        return None
+
 # Prediction function
 def predict_speaker(audio, model, processor):
     if audio is None:
-        return "Aucun audio détecté.", None
+        return "Aucun audio détecté.", None, None, None
 
     try:
-
-        audio_data, sr = sf.read(audio)  # <- here you read the audio directly
+        audio_data, sr = sf.read(audio)
         input_tensor = processor.process_audio(audio_data, sr)
 
         device = next(model.parameters()).device
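Taken together, the two new helpers form a transcribe-then-resynthesize round trip: `recognize_speech` returns a transcription (or an error string) and `synthesize_speech` returns a `(16000, waveform)` tuple (or None when its models failed to load). A minimal smoke test under those assumptions; the `sample.wav` path is hypothetical:

    # Hypothetical smoke test, run at module scope once the models are loaded.
    text = recognize_speech("sample.wav")  # transcription or error string
    print(text)

    result = synthesize_speech(text)
    if result is not None:
        rate, waveform = result
        sf.write("roundtrip.wav", waveform, rate)  # write the synthesized audio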
@@ -116,10 +202,13 @@ def predict_speaker(audio, model, processor):
 
         probs_dict = {speakers[i]: float(probs) for i, probs in enumerate(probabilities[0].cpu().numpy())}
 
-        return result, probs_dict
+        # Recognize speech
+        recognized_text = recognize_speech(audio)
+
+        return result, probs_dict, recognized_text, predicted_speaker
 
     except Exception as e:
-        return f"Erreur : {str(e)}", None
+        return f"Erreur : {str(e)}", None, None, None
 
 # Load model
 def load_model(model_id="nareauow/my_speech_recognition", model_filename="model_3.pth"):
@@ -155,10 +244,14 @@ def create_interface():
         with gr.Column():
             result_text = gr.Textbox(label="Résultat")
             plot_output = gr.Plot(label="Confiance par locuteur")
+            recognized_text = gr.Textbox(label="Texte reconnu")
+            audio_output = gr.Audio(label="Synthèse vocale", type="numpy")
 
     def recognize(audio, selected_model):
-        model = load_model(model_filename=selected_model)
-        res, probs = predict_speaker(audio, model, processor)
+        model = load_model(model_filename=selected_model)
+        res, probs, text, locuteur = predict_speaker(audio, model, processor)
+
+        # Generate plot
         fig = None
         if probs:
             fig, ax = plt.subplots()
@@ -167,9 +260,18 @@ def create_interface():
             ax.set_ylabel("Confiance")
             ax.set_xlabel("Locuteurs")
             plt.xticks(rotation=45)
-        return res, fig
+
+        # Generate speech synthesis if text was recognized
+        synth_audio = None
+        if text and "error" not in text.lower():
+            synth_text = f"{locuteur} said : {text}"
+            synth_audio = synthesize_speech(synth_text)
+
+        return res, fig, text, synth_audio
 
-    record_btn.click(fn=recognize,
+    record_btn.click(fn=recognize,
+                     inputs=[audio_input, model_selector],
+                     outputs=[result_text, plot_output, recognized_text, audio_output])
 
     gr.Markdown("""### Comment utiliser ?
     - Choisissez le modèle.
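The four values returned by `recognize` map positionally onto `outputs=[result_text, plot_output, recognized_text, audio_output]`, and `gr.Audio(type="numpy")` accepts exactly the `(sample_rate, ndarray)` tuple that `synthesize_speech` produces. A quick way to sanity-check that wiring without launching the interface, assuming `recognize` is temporarily lifted to module scope and using a hypothetical recording:

    # Hypothetical direct call of the callback, bypassing the Gradio event loop.
    res, fig, text, synth_audio = recognize("sample.wav", "model_3.pth")
    print(res, text)
    if synth_audio is not None:
        rate, waveform = synth_audio
        print(f"synthesized {len(waveform) / rate:.2f}s at {rate} Hz")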
@@ -182,4 +284,4 @@ def create_interface():
 # Launch
 if __name__ == "__main__":
     app = create_interface()
-    app.launch(share=True)
+    app.launch(share=True)