# VoiceGenX / app.py
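"""Gradio app that predicts a speaker's gender from a short voice recording.

Uses the `alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech`
audio-classification model from the Hugging Face Hub.
"""
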
import os
import torch
import torchaudio
import gradio as gr
import numpy as np
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
MODEL_NAME = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
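
# Load the feature extractor and classification model once at startup and move
# the model to the GPU when one is available.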
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(device)
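
# Class-index mappings for the model's two labels (0 = female, 1 = male).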
label2id = {"female": 0, "male": 1}
id2label = {0: "Female", 1: "Male"}

def preprocess_audio(audio):
    """Convert stereo to mono, normalize, resample to 16 kHz, and pad audio if needed."""
    if audio is None:
        return None
    sr, audio_data = audio
    if audio_data is None:
        return None

    # Gradio returns stereo clips with shape (samples, channels); average the
    # channels to get a mono signal.
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)

    # Gradio delivers integer PCM samples; normalize to float32 in [-1, 1].
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    else:
        audio_data = audio_data.astype(np.float32)

    # Resample to the 16 kHz rate expected by the wav2vec2 feature extractor.
    audio_tensor = torch.tensor(audio_data, dtype=torch.float32)
    resampler = torchaudio.transforms.Resample(sr, 16000)
    audio_data_resampled = resampler(audio_tensor).numpy()

    # Pad very short clips to at least one second so the model gets enough context.
    min_length = 16000
    if audio_data_resampled.shape[0] < min_length:
        padding = np.zeros(min_length - audio_data_resampled.shape[0], dtype=audio_data_resampled.dtype)
        audio_data_resampled = np.concatenate([audio_data_resampled, padding])

    return audio_data_resampled

def predict_gender(audio):
    """Run the classifier on a recorded clip and return per-class probabilities."""
    if audio is None:
        return "Error: no audio provided."
    audio_data = preprocess_audio(audio)
    if audio_data is None:
        return "Error: invalid audio input."

    inputs = feature_extractor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)
    # Move each tensor in the inputs dictionary to the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
    scores = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
    return {id2label[0]: scores[0], id2label[1]: scores[1]}

demo = gr.Interface(
    fn=predict_gender,
    inputs=gr.Audio(type="numpy"),
    outputs=gr.Label(num_top_classes=2),
    title="Voice Gender Detection",
    description="Use the microphone input and speak to get a real-time gender prediction from your voice.",
)

demo.launch(debug=False, share=True)