import streamlit as st
import torch
import torchaudio
import whisper
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import altair as alt

# Constants
MODEL_PATH = "D:/SER MiniProj/wav2vec2_model/"
TARGET_SAMPLE_RATE = 16000  # Required sample rate for Wav2Vec2
AUDIO_SAVE_PATH = "temp_audio.wav"
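
# NOTE: MODEL_PATH above is the project's local Windows checkpoint directory; point it
# at your own fine-tuned Wav2Vec2 emotion classifier before running.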
emotion_labels = {
    0: "Neutral",
    1: "Calm",
    2: "Happy",
    3: "Sad",
    4: "Angry",
    5: "Fearful",
    6: "Disgust",
    7: "Surprised",
}
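
# This 8-class mapping follows the RAVDESS emotion scheme; it is assumed to match
# the id2label ordering of the fine-tuned checkpoint loaded below.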
# Load models with caching
@st.cache_resource
def load_models():
    feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_PATH)
    ser_model = AutoModelForAudioClassification.from_pretrained(MODEL_PATH)
    whisper_model = whisper.load_model("base")
    summarizer = pipeline("summarization", model="t5-base", framework="pt")
    return feature_extractor, ser_model, whisper_model, summarizer
# UI Layout: st.set_page_config must be the first Streamlit call, so it runs
# before the spinner that st.cache_resource shows while the models load.
st.set_page_config(page_title="Speech Analysis App", layout="wide")

feature_extractor, ser_model, whisper_model, summarizer = load_models()

st.title("Speech Emotion Recognition & Summarization")
st.markdown("Upload an audio file to analyze emotions, transcribe speech, and get a concise summary.")
uploaded_file = st.file_uploader("Upload Audio File", type=["wav", "mp3", "ogg"])
if uploaded_file:
    with open(AUDIO_SAVE_PATH, "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.audio(AUDIO_SAVE_PATH, format="audio/wav")

    waveform, sample_rate = torchaudio.load(AUDIO_SAVE_PATH)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sample_rate != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
        waveform = resampler(waveform)
    y = waveform.squeeze().numpy()

    # Audio Visualization
    st.subheader("Audio Visualizations")

    # Waveform
    st.markdown("**Waveform**")
    fig_wave, ax_wave = plt.subplots(figsize=(8, 1.8))  # Reduced vertical height
    ax_wave.plot(y, linewidth=0.8)
    ax_wave.set_xlabel("Samples")
    ax_wave.set_ylabel("Amplitude")
    ax_wave.set_title("Waveform", fontsize=10)
    ax_wave.tick_params(labelsize=8)
    fig_wave.tight_layout(pad=0.3)
    st.pyplot(fig_wave)
    col1, col2 = st.columns([1, 1])
    with col1:
        st.markdown("**Spectrogram**")
        fig, ax = plt.subplots(figsize=(6, 3))
        # Take the magnitude of the STFT before converting to dB
        D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
        img = librosa.display.specshow(D, sr=TARGET_SAMPLE_RATE, x_axis='time', y_axis='log', ax=ax)
        fig.colorbar(img, ax=ax, format="%+2.0f dB")
        fig.tight_layout(pad=0.5)
        st.pyplot(fig)

    with col2:
        st.markdown("**MFCCs**")
        fig2, ax2 = plt.subplots(figsize=(6, 3))
        mfccs = librosa.feature.mfcc(y=y, sr=TARGET_SAMPLE_RATE, n_mfcc=13)
        img2 = librosa.display.specshow(mfccs, x_axis='time', ax=ax2)
        fig2.colorbar(img2, ax=ax2)
        fig2.tight_layout(pad=0.5)
        st.pyplot(fig2)
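
    # Release the matplotlib figures once Streamlit has rendered them, so figures
    # do not accumulate across reruns.
    plt.close(fig_wave)
    plt.close(fig)
    plt.close(fig2)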
    # Emotion Prediction
    st.subheader("Emotion Recognition")
    inputs = feature_extractor(y, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt")
    with torch.no_grad():
        logits = ser_model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=-1).item()
    predicted_emotion = emotion_labels[predicted_class]
    st.success(f"Predicted Emotion: {predicted_emotion}")
    # Transcription & Summarization
    st.subheader("Speech Transcription & Summary")
    transcription = whisper_model.transcribe(AUDIO_SAVE_PATH)["text"]
    st.info(f"Transcription: {transcription}")
    summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
    st.success(f"Summary: {summary}")
    # Playback speed
    st.subheader("Playback Options")
    speed = st.select_slider("Playback Speed", options=[0.5, 0.75, 1.0, 1.25, 1.5], value=1.0)
    st.markdown(f"Playback speed set to {speed}x (use an external player to preview the adjusted audio)")
    # Audio Info
    st.markdown("**Audio Metadata**")
    st.write(f"Duration: {round(len(y) / TARGET_SAMPLE_RATE, 2)} seconds")
    st.write(f"Sample Rate: {TARGET_SAMPLE_RATE} Hz")