import torch
import torchaudio
import gradio as gr
import numpy as np
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
# wav2vec 2.0 checkpoint fine-tuned for speaker gender classification on LibriSpeech.
MODEL_NAME = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(device)
model.eval()  # inference only; disables dropout

# Label mapping used by the checkpoint.
label2id = {"female": 0, "male": 1}
id2label = {0: "Female", 1: "Male"}

def preprocess_audio(audio):
    """Convert stereo to mono, normalize, resample to 16 kHz, and pad short clips."""
    if audio is None:
        return None
    sr, audio_data = audio
    if audio_data is None:
        return None

    # Gradio's numpy audio is shaped (samples,) or (samples, channels);
    # average over the channel axis (the last one) to get mono.
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)

    # Gradio delivers integer PCM (typically int16); normalize to float32 in [-1, 1].
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    else:
        audio_data = audio_data.astype(np.float32)

    audio_tensor = torch.tensor(audio_data, dtype=torch.float32)
    resampler = torchaudio.transforms.Resample(sr, 16000)
    audio_data_resampled = resampler(audio_tensor).numpy()

    # Pad clips shorter than one second (16000 samples) with silence.
    min_length = 16000
    if audio_data_resampled.shape[0] < min_length:
        padding = np.zeros(min_length - audio_data_resampled.shape[0], dtype=audio_data_resampled.dtype)
        audio_data_resampled = np.concatenate([audio_data_resampled, padding])
    return audio_data_resampled
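
# Illustrative example of the preprocessing contract (hypothetical input values):
# a 1-second stereo int16 clip at 44.1 kHz arrives as (44100, 2) and leaves as a
# mono float32 vector of 16000 samples in [-1, 1].
#   out = preprocess_audio((44100, np.zeros((44100, 2), dtype=np.int16)))
#   assert out.shape[0] >= 16000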

def predict_gender(audio):
    if audio is None:
        return {"Error": "No audio provided."}
    audio_data = preprocess_audio(audio)
    if audio_data is None:
        return {"Error": "Invalid audio input."}
    inputs = feature_extractor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)
    # Move each tensor in the inputs dictionary to the model's device.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    # Softmax over the two classes gives per-label probabilities.
    scores = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
    return {id2label[0]: scores[0], id2label[1]: scores[1]}
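
# Quick local sanity check (not part of the Space UI; synthetic tone, purely
# illustrative): feed a 1-second 440 Hz sine wave at 16 kHz through the pipeline.
#   sr = 16000
#   tone = (np.sin(2 * np.pi * 440 * np.arange(sr) / sr) * 32767).astype(np.int16)
#   print(predict_gender((sr, tone)))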

demo = gr.Interface(
    fn=predict_gender,
    inputs=gr.Audio(type="numpy"),
    outputs=gr.Label(num_top_classes=2),
    title="Voice Gender Detection",
    description="Use the microphone option and speak to predict gender from your voice in real time.",
)

demo.launch(debug=False, share=True)