# Spaces: Sleeping  (appears to be page-status residue from the hosting page, not part of the script)
# Single-file inference check: load an audio-classification checkpoint from a
# local directory, run it on one WAV file, and print the predicted class index.
import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

MODEL_PATH = "D:/SER MiniProj/wav2vec2_model/"
TARGET_SAMPLE_RATE = 16000  # Model requires 16kHz audio

# Load feature extractor and model from the local checkpoint directory.
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_PATH)
model = AutoModelForAudioClassification.from_pretrained(MODEL_PATH)
print("Feature extractor and model loaded successfully!")

# Load an audio file; the first axis of `waveform` is treated as channels below.
audio_file = "D:/SER MiniProj/temp_audio.wav"
waveform, sample_rate = torchaudio.load(audio_file)

# Convert to mono if needed by averaging across the channel axis.
if waveform.shape[0] > 1:
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# Resample if the sample rate is not 16kHz.
if sample_rate != TARGET_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(
        orig_freq=sample_rate,
        new_freq=TARGET_SAMPLE_RATE,
    )
    waveform = resampler(waveform)
    sample_rate = TARGET_SAMPLE_RATE  # Keep the rate in sync with the data.

# Process the audio for the model. The channel axis is dropped and the tensor
# is handed over as a NumPy array, which is the input type the feature
# extractor documents (rather than relying on implicit tensor conversion).
mono_samples = waveform.squeeze(0).numpy()
inputs = feature_extractor(
    mono_samples,
    sampling_rate=sample_rate,
    return_tensors="pt",
)

# Perform inference without tracking gradients.
with torch.no_grad():
    logits = model(**inputs).logits

# Index of the highest-scoring class.
# NOTE(review): this is a numeric class id; `model.config.id2label` presumably
# maps it to an emotion name if the checkpoint defines labels — confirm.
predicted_label = torch.argmax(logits, dim=-1).item()

# Print the output.
print(f"Predicted Emotion Class: {predicted_label}")