import numpy as np
import torch
import torchaudio
import gradio as gr
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

MODEL_NAME = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(device)

label2id = {"female": 0, "male": 1}
id2label = {0: "Female", 1: "Male"}


def preprocess_audio(audio):
    """Convert stereo to mono, normalize, resample to 16 kHz, and pad short clips."""
    if audio is None:
        return None

    sr, audio_data = audio
    if audio_data is None:
        return None

    # Gradio delivers stereo audio as (samples, channels); average over the channel axis.
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)

    # Gradio returns int16 PCM; scale to float32 in [-1, 1].
    audio_data = audio_data.astype(np.float32)
    if np.abs(audio_data).max() > 1.0:
        audio_data /= 32768.0

    # Resample to the 16 kHz rate the wav2vec2 model expects.
    audio_tensor = torch.tensor(audio_data, dtype=torch.float32)
    resampler = torchaudio.transforms.Resample(sr, 16000)
    audio_data_resampled = resampler(audio_tensor).numpy()

    # Pad clips shorter than one second (16,000 samples) with silence.
    min_length = 16000
    if audio_data_resampled.shape[0] < min_length:
        padding = np.zeros(min_length - audio_data_resampled.shape[0], dtype=audio_data_resampled.dtype)
        audio_data_resampled = np.concatenate([audio_data_resampled, padding])

    return audio_data_resampled


def predict_gender(audio):
    if audio is None:
        return {"Error": "No audio provided."}

    audio_data = preprocess_audio(audio)
    if audio_data is None:
        return {"Error": "Invalid audio input."}

    inputs = feature_extractor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)
    # Move each tensor in the inputs dictionary to the model's device.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    scores = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
    return {
        id2label[0]: scores[0],
        id2label[1]: scores[1],
    }


demo = gr.Interface(
    fn=predict_gender,
    inputs=gr.Audio(type="numpy"),
    outputs=gr.Label(num_top_classes=2),
    title="Voice Gender Detection",
    description="Use the microphone option and speak into the microphone to predict gender from voice in real time.",
)

demo.launch(debug=False, share=True)
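# Optional local sanity check (a sketch, not part of the original app): uncomment and place
# before demo.launch() to exercise predict_gender on one second of synthetic 44.1 kHz audio
# without the Gradio UI. The tone frequency and sample rate below are arbitrary illustrative
# choices, not values required by the model.
# sr = 44100
# t = np.linspace(0, 1, sr, endpoint=False)
# test_wave = (0.5 * np.sin(2 * np.pi * 220 * t) * 32767).astype(np.int16)
# print(predict_gender((sr, test_wave)))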