import torch
import torchaudio
import gradio as gr
import numpy as np
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification


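# Wav2Vec2 XLSR-53 checkpoint fine-tuned for speaker gender recognition on LibriSpeech.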
MODEL_NAME = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(device)

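# Class-index mappings for the model's two output classes; id2label controls the UI display names.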
label2id = {"female": 0, "male": 1}
id2label = {0: "Female", 1: "Male"}


def preprocess_audio(audio):
    """Convert stereo to mono, normalize, resample to 16 kHz, and pad short clips."""
    if audio is None:
        return None
    sr, audio_data = audio
    if audio_data is None:
        return None

    # Microphone input arrives as int16; scale integer samples to float32 in [-1, 1].
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    else:
        audio_data = audio_data.astype(np.float32)

    # Gradio's numpy audio is (samples,) for mono or (samples, channels) for
    # stereo, so average over the channel axis (axis=1), not the sample axis.
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)

    # Resample to the 16 kHz rate the model expects.
    if sr != 16000:
        audio_data = torchaudio.transforms.Resample(sr, 16000)(torch.from_numpy(audio_data)).numpy()

    # Pad recordings shorter than one second (16,000 samples) with silence.
    min_length = 16000
    if audio_data.shape[0] < min_length:
        padding = np.zeros(min_length - audio_data.shape[0], dtype=audio_data.dtype)
        audio_data = np.concatenate([audio_data, padding])

    return audio_data

def predict_gender(audio):
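    """Classify a Gradio audio clip and return a probability for each gender label."""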

    if audio is None:
        return {"Error": "No audio provided."}
    audio_data = preprocess_audio(audio)
    if audio_data is None:
        return {"Error": "Invalid audio input."}
    
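    # Build model-ready input features from the raw 16 kHz waveform.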
    inputs = feature_extractor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)
    # Move each tensor in the inputs dictionary to the desired device.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        logits = model(**inputs).logits
        scores = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
    
    return {id2label[0]: scores[0], id2label[1]: scores[1]}


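# Simple Gradio UI: raw audio in, two-class probability label out.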
demo = gr.Interface(
    fn=predict_gender,
    inputs=gr.Audio(type="numpy"),
    outputs=gr.Label(num_top_classes=2),
    title="Voice Gender Detection",
    description="Use the microphone option and speak to predict gender from your voice in real time."
)

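# share=True serves a temporary public link in addition to the local server.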
demo.launch(debug=False, share=True)