Spaces:

nareauow
/

speaker-recognition

Restarting

App Files Community

nareauow committed 5 days ago

Commit

d698901

verified ·

1 Parent(s): 2fdcc5e

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -287

app.py CHANGED Viewed

@@ -1,293 +1,37 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import numpy as np
-import scipy.io.wavfile as wav
-from scipy.fftpack import idct
-import gradio as gr
-import os
-import matplotlib.pyplot as plt
-from huggingface_hub import hf_hub_download
-from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
-from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-from datasets import load_dataset
-import soundfile as sf
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-print(f"Using device: {device}")
-# Load speech-to-text model
-try:
-    speech_recognizer = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr").to(device)
-    speech_processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
-    print("Speech recognition model loaded successfully!")
-except Exception as e:
-    print(f"Error loading speech recognition model: {e}")
-    speech_recognizer = None
-    speech_processor = None
-# Load text-to-speech models
-try:
-    # Load processor and model
-    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
-    tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-    # Load speaker embeddings
-    speaker_embeddings = torch.load("./speaker_embedding.pt").to(device)
-except Exception as e:
-    print(f"Error loading text-to-speech models: {e}")
-    tts_processor = None
-    tts_model = None
-    tts_vocoder = None
-    speaker_embeddings = None
-# Modele CNN
-class modele_CNN(nn.Module):
-    def __init__(self, num_classes=7, dropout=0.3):
-        super(modele_CNN, self).__init__()
-        self.conv1 = nn.Conv2d(1, 16, 3, padding=1)
-        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
-        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
-        self.pool = nn.MaxPool2d(2, 2)
-        self.fc1 = nn.Linear(64 * 1 * 62, 128)
-        self.fc2 = nn.Linear(128, num_classes)
-        self.dropout = nn.Dropout(dropout)
-    def forward(self, x):
-        x = self.pool(F.relu(self.conv1(x)))
-        x = self.pool(F.relu(self.conv2(x)))
-        x = self.pool(F.relu(self.conv3(x)))
-        x = x.view(x.size(0), -1)
-        x = self.dropout(F.relu(self.fc1(x)))
-        x = self.fc2(x)
-        return x
-# Audio processor
-class AudioProcessor:
-    def Mel2Hz(self, mel): return 700 * (np.power(10, mel/2595)-1)
-    def Hz2Mel(self, freq): return 2595 * np.log10(1+freq/700)
-    def Hz2Ind(self, freq, fs, Tfft): return (freq*Tfft/fs).astype(int)
-    def hamming(self, T):
-        if T <= 1:
-            return np.ones(T)
-        return 0.54-0.46*np.cos(2*np.pi*np.arange(T)/(T-1))
-    def FiltresMel(self, fs, nf=36, Tfft=512, fmin=100, fmax=8000):
-        Indices = self.Hz2Ind(self.Mel2Hz(np.linspace(self.Hz2Mel(fmin), self.Hz2Mel(min(fmax, fs/2)), nf+2)), fs, Tfft)
-        filtres = np.zeros((int(Tfft/2), nf))
-        for i in range(nf): filtres[Indices[i]:Indices[i+2], i] = self.hamming(Indices[i+2]-Indices[i])
-        return filtres
-    def spectrogram(self, x, T, p, Tfft):
-        S = []
-        for i in range(0, len(x)-T, p): S.append(x[i:i+T]*self.hamming(T))
-        S = np.fft.fft(S, Tfft)
-        return np.abs(S), np.angle(S)
-    def mfcc(self, data, filtres, nc=13, T=256, p=64, Tfft=512):
-        data = (data[1]-np.mean(data[1]))/np.std(data[1])
-        amp, ph = self.spectrogram(data, T, p, Tfft)
-        amp_f = np.log10(np.dot(amp[:, :int(Tfft/2)], filtres)+1)
-        return idct(amp_f, n=nc, norm='ortho')
-    def process_audio(self, audio_data, sr, audio_length=32000):
-        if sr != 16000:
-            audio_resampled = np.interp(
-                np.linspace(0, len(audio_data), int(16000 * len(audio_data) / sr)),
-                np.arange(len(audio_data)),
-                audio_data
             )
-            sgn = audio_resampled
-            fs = 16000
-        else:
-            sgn = audio_data
-            fs = sr
-        sgn = np.array(sgn, dtype=np.float32)
-        if len(sgn) > audio_length:
-            sgn = sgn[:audio_length]
-        else:
-            sgn = np.pad(sgn, (0, audio_length - len(sgn)), mode='constant')
-        filtres = self.FiltresMel(fs)
-        sgn_features = self.mfcc([fs, sgn], filtres)
-        mfcc_tensor = torch.tensor(sgn_features.T, dtype=torch.float32)
-        mfcc_tensor = mfcc_tensor.unsqueeze(0).unsqueeze(0)
-        return mfcc_tensor
-# Speech recognition function
-def recognize_speech(audio_path):
-    if speech_recognizer is None or speech_processor is None:
-        return "Speech recognition model not available"
-    try:
-        # Read audio file
-        audio_data, sr = sf.read(audio_path)
-        # Resample to 16kHz if needed
-        if sr != 16000:
-            audio_data = np.interp(
-                np.linspace(0, len(audio_data), int(16000 * len(audio_data) / sr)),
-                np.arange(len(audio_data)),
-                audio_data
-            )
-            sr = 16000
-        # Process audio
-        inputs = speech_processor(audio_data, sampling_rate=sr, return_tensors="pt")
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-        # Generate transcription
-        generated_ids = speech_recognizer.generate(**inputs)
-        transcription = speech_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        return transcription
-    except Exception as e:
-        return f"Speech recognition error: {str(e)}"
-# Speech synthesis function
-def synthesize_speech(text):
-    if tts_processor is None or tts_model is None or tts_vocoder is None or speaker_embeddings is None:
-        return None
-    try:
-        # Preprocess text
-        inputs = tts_processor(text=text, return_tensors="pt").to(device)
-        # Generate speech with speaker embeddings
-        spectrogram = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings)
-        # Convert to waveform
-        with torch.no_grad():
-            speech = tts_vocoder(spectrogram)
-        # Convert to numpy array and normalize
-        speech = speech.cpu().numpy()
-        speech = speech / np.max(np.abs(speech))
-        return (16000, speech.squeeze())
-    except Exception as e:
-        print(f"Speech synthesis error: {str(e)}")
-        return None
-# ... (keep all previous imports and class definitions)
-# Updated predict_speaker function to return consistent values
-def predict_speaker(audio, model, processor):
-    if audio is None:
-        return "Aucun audio détecté.", {}, "Aucun texte reconnu", "Inconnu"  # Now returns 4 values
-    try:
-        audio_data, sr = sf.read(audio)
-        input_tensor = processor.process_audio(audio_data, sr)
-        device = next(model.parameters()).device
-        input_tensor = input_tensor.to(device)
-        with torch.no_grad():
-            output = model(input_tensor)
-            print(output)  # Debug output
-            probabilities = F.softmax(output, dim=1)
-            confidence, predicted_class = torch.max(probabilities, 1)
-        speakers = ["George", "Jackson", "Lucas", "Nicolas", "Theo", "Yweweler", "Narimene"]
-        predicted_speaker = speakers[predicted_class.item()]
-        result = f"Locuteur reconnu : {predicted_speaker} (confiance : {confidence.item()*100:.2f}%)"
-        probs_dict = {speakers[i]: float(probs) for i, probs in enumerate(probabilities[0].cpu().numpy())}
-        # Recognize speech
-        recognized_text = recognize_speech(audio) if speech_recognizer else "Modèle de reconnaissance vocale non disponible"
-        return result, probs_dict, recognized_text, predicted_speaker  # Now returns 4 values
-    except Exception as e:
-        return f"Erreur : {str(e)}", {}, "Erreur de reconnaissance", "Inconnu"
-# Updated recognize function
-def recognize(audio, selected_model):
-    model = load_model(model_filename=selected_model)
-    if model is None:
-        return "Erreur: Modèle non chargé", None, "Erreur", None
-    res, probs, text, speaker = predict_speaker(audio, model, processor)  # Now expects 4 values
-    # Generate plot
-    fig = None
-    if probs:
-        fig, ax = plt.subplots(figsize=(10, 6))
-        ax.bar(probs.keys(), probs.values(), color='skyblue')
-        ax.set_ylim([0, 1])
-        ax.set_ylabel("Confiance")
-        ax.set_xlabel("Locuteurs")
-        ax.set_title("Probabilités de reconnaissance")
-        plt.xticks(rotation=45)
-        plt.tight_layout()
-    # Generate speech synthesis if text was recognized
-    synth_audio = None
-    if synthesizer is not None and text and "erreur" not in text.lower():
-        try:
-            synth_text = f"Le locuteur {speaker} a dit : {text}" if speaker else f"Le locuteur a dit : {text}"
-            synth_audio = synthesize_speech(synth_text)
-        except Exception as e:
-            print(f"Erreur de synthèse vocale: {e}")
-    return res, fig, text, synth_audio if synth_audio else None
-# Updated interface creation
-def create_interface():
-    processor = AudioProcessor()
-    with gr.Blocks(title="Reconnaissance de Locuteur") as interface:
-        gr.Markdown("# 🗣️ Reconnaissance de Locuteur")
-        gr.Markdown("Enregistrez votre voix pendant 2 secondes pour identifier qui parle.")
-        with gr.Row():
-            with gr.Column():
-                # Dropdown pour sélectionner le modèle
-                model_selector = gr.Dropdown(
-                    choices=["model_1.pth", "model_2.pth", "model_3.pth"],
-                    value="model_3.pth",
-                    label="Choisissez le modèle"
-                )
-                # Créer des onglets pour Microphone et Upload Audio
-                with gr.Tab("Microphone"):
-                    mic_input = gr.Audio(sources=["microphone"], type="filepath", label="🎙️ Enregistrer depuis le microphone")
-                with gr.Tab("Upload Audio"):
-                    file_input = gr.Audio(sources=["upload"], type="filepath", label="📁 Télécharger un fichier audio")
-                # Bouton pour démarrer la reconnaissance
-                record_btn = gr.Button("Reconnaître")
-            with gr.Column():
-                # Résultat, graphique et texte reconnu
-                result_text = gr.Textbox(label="Résultat")
-                plot_output = gr.Plot(label="Confiance par locuteur")
-                recognized_text = gr.Textbox(label="Texte reconnu")
-                audio_output = gr.Audio(label="Synthèse vocale", visible=False)
-            # Fonction de clique pour la reconnaissance
-            def recognize(audio, selected_model):
-                # Traitement audio et modèle à charger...
-                pass  # Remplace ici avec ton code de traitement
         # Lier le bouton "Reconnaître" à la fonction
         record_btn.click(
             fn=recognize,
             inputs=[mic_input, file_input, model_selector],  # Remplacer Union par les deux inputs distincts
             outputs=[result_text, plot_output, recognized_text, audio_output]
-        )
-    return interface
-if __name__ == "__main__":
-    app = create_interface()
-    app.launch(share=True)

+    with gr.Row():
+        with gr.Column():
+            # Dropdown pour sélectionner le modèle
+            model_selector = gr.Dropdown(
+                choices=["model_1.pth", "model_2.pth", "model_3.pth"],
+                value="model_3.pth",
+                label="Choisissez le modèle"
             )
+            # Créer des onglets pour Microphone et Upload Audio
+            with gr.Tab("Microphone"):
+                mic_input = gr.Audio(sources=["microphone"], type="filepath", label="🎙️ Enregistrer depuis le microphone")
+            with gr.Tab("Upload Audio"):
+                file_input = gr.Audio(sources=["upload"], type="filepath", label="📁 Télécharger un fichier audio")
+            # Bouton pour démarrer la reconnaissance
+            record_btn = gr.Button("Reconnaître")
+        with gr.Column():
+            # Résultat, graphique et texte reconnu
+            result_text = gr.Textbox(label="Résultat")
+            plot_output = gr.Plot(label="Confiance par locuteur")
+            recognized_text = gr.Textbox(label="Texte reconnu")
+            audio_output = gr.Audio(label="Synthèse vocale", visible=False)
+        # Fonction de clique pour la reconnaissance
+        def recognize(audio, selected_model):
+            # Traitement audio et modèle à charger...
+            pass  # Remplace ici avec ton code de traitement
         # Lier le bouton "Reconnaître" à la fonction
         record_btn.click(
             fn=recognize,
             inputs=[mic_input, file_input, model_selector],  # Remplacer Union par les deux inputs distincts
             outputs=[result_text, plot_output, recognized_text, audio_output]
+        )