##########################################
# Step 0: Import required libraries
##########################################
import streamlit as st # For web interface
from transformers import (
    pipeline,                  # For loading pre-trained models
    SpeechT5Processor,         # For text-to-speech processing
    SpeechT5ForTextToSpeech,   # TTS model
    SpeechT5HifiGan,           # Vocoder for generating audio waveforms
    AutoModelForCausalLM,      # For text generation
    AutoTokenizer              # For tokenizing input text
)  # AI model components
from datasets import load_dataset # To load voice embeddings
import torch # For tensor computations
import soundfile as sf # For handling audio files
import re # For regular expressions in text processing
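# A sketch of the dependencies these imports imply (assumption: the Space's
# actual requirements.txt is not shown; `sentencepiece` is typically needed
# by the SpeechT5 tokenizer):
#   pip install streamlit transformers datasets torch soundfile sentencepiece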
##########################################
# Initial configuration
##########################################
st.set_page_config(
    page_title="Just Comment",          # Title of the web app
    page_icon="💬",                     # Icon displayed in the browser tab
    layout="centered",                  # Center the layout of the app
    initial_sidebar_state="collapsed"   # Start with sidebar collapsed
)
##########################################
# Global model loading with caching
##########################################
@st.cache_resource(show_spinner=False)  # Cache the models for performance
def _load_models():
    """Load and cache all ML models with optimized settings"""
    return {
        # Emotion classification pipeline
        'emotion': pipeline(
            "text-classification",                         # Specify task type
            model="Thea231/jhartmann_emotion_finetuning",  # Load the model
            truncation=True                                # Enable text truncation for long inputs
        ),
        # Text generation components
        'textgen_tokenizer': AutoTokenizer.from_pretrained(
            "Qwen/Qwen1.5-0.5B",  # Load tokenizer
            use_fast=True         # Enable fast tokenization
        ),
        'textgen_model': AutoModelForCausalLM.from_pretrained(
            "Qwen/Qwen1.5-0.5B",        # Load text generation model
            torch_dtype=torch.float16   # Use half precision for a smaller memory footprint
        ),
        # Text-to-speech components
        'tts_processor': SpeechT5Processor.from_pretrained("microsoft/speecht5_tts"),       # Load TTS processor
        'tts_model': SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts"),     # Load TTS model
        'tts_vocoder': SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan"),       # Load vocoder
        # Preloaded speaker embeddings
        'speaker_embeddings': torch.tensor(
            load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]  # Load one xvector
        ).unsqueeze(0)  # Add a batch dimension
    }
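# For reference, a sketch of what _load_models() returns (the embedding size
# comes from the upstream cmu-arctic-xvectors dataset; usage is illustrative):
#   models = _load_models()
#   models['speaker_embeddings'].shape  # torch.Size([1, 512]) — one 512-dim xvector
#   # st.cache_resource builds this dict once per process, not on every rerun.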
##########################################
# UI Components
##########################################
def _display_interface():
    """Render user interface elements"""
    st.title("Just Comment")  # Set the main title of the app
    st.markdown("### I'm listening to you, my friend～")  # Subheading for user interaction
    return st.text_area(
        "📝 Enter your comment:",                  # Label for the text area
        placeholder="Type your message here...",   # Placeholder text
        height=150,                                # Height of the text area
        key="user_input"                           # Unique key for the text area
    )
##########################################
# Core Processing Functions
##########################################
def _analyze_emotion(text, classifier):
    """Identify the dominant emotion among the supported labels"""
    results = classifier(text, return_all_scores=True)[0]  # Scores for every label
    valid_emotions = {'sadness', 'joy', 'love', 'anger', 'fear', 'surprise'}  # Supported emotions
    filtered = [e for e in results if e['label'].lower() in valid_emotions]   # Keep only supported labels
    return max(filtered, key=lambda x: x['score'])  # Return the emotion with the highest score
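# A minimal sketch of the classifier output this function consumes (run in a
# plain Python session; labels and scores are illustrative, not real output):
#   results = classifier("The parcel arrived two weeks late!", return_all_scores=True)[0]
#   # -> [{'label': 'anger', 'score': 0.91}, {'label': 'sadness', 'score': 0.05}, ...]
#   # max(...) above would then return {'label': 'anger', 'score': 0.91}.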
def _generate_prompt(text, emotion):
    """Create structured prompts for all emotion types"""
    prompt_templates = {
        "sadness": (
            "Sadness detected: {input}\n"
            "Required response structure:\n"
            "1. Empathetic acknowledgment\n2. Support offer\n3. Solution proposal\n"
            "Response:"
        ),
        "joy": (
            "Joy detected: {input}\n"
            "Required response structure:\n"
            "1. Enthusiastic thanks\n2. Positive reinforcement\n3. Future engagement\n"
            "Response:"
        ),
        "love": (
            "Affection detected: {input}\n"
            "Required response structure:\n"
            "1. Warm appreciation\n2. Community focus\n3. Exclusive benefit\n"
            "Response:"
        ),
        "anger": (
            "Anger detected: {input}\n"
            "Required response structure:\n"
            "1. Sincere apology\n2. Action steps\n3. Compensation\n"
            "Response:"
        ),
        "fear": (
            "Concern detected: {input}\n"
            "Required response structure:\n"
            "1. Reassurance\n2. Safety measures\n3. Support options\n"
            "Response:"
        ),
        "surprise": (
            "Surprise detected: {input}\n"
            "Required response structure:\n"
            "1. Acknowledge uniqueness\n2. Creative solution\n3. Follow-up\n"
            "Response:"
        )
    }
    return prompt_templates.get(emotion.lower(), "").format(input=text)  # Format and return the appropriate prompt
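# Example of the prompt produced for an angry comment (illustrative input):
#   _generate_prompt("My order never arrived.", "anger")
#   # -> "Anger detected: My order never arrived.\n"
#   #    "Required response structure:\n"
#   #    "1. Sincere apology\n2. Action steps\n3. Compensation\n"
#   #    "Response:"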
def _process_response(raw_text):
    """Clean and format the generated response"""
    # Extract the text after the last "Response:" marker
    processed = raw_text.split("Response:")[-1].strip()
    # Drop any trailing incomplete sentence
    if '.' in processed:
        processed = processed.rsplit('.', 1)[0] + '.'  # Ensure the response ends with a period
    # Cap at 200 characters; fall back to a stock reply if 50 characters or fewer
    return processed[:200].strip() if len(processed) > 50 else "Thank you for your feedback. We value your input and will respond shortly."
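# Illustrative before/after for the cleanup above (hypothetical generated text):
#   raw = "Anger detected: ...\nResponse: We sincerely apologize for the delay. A full refund has been issued to your card. Also we"
#   _process_response(raw)
#   # -> "We sincerely apologize for the delay. A full refund has been issued to your card."
#   # (the trailing fragment "Also we" is dropped at the last period)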
def _generate_text_response(input_text, models):
    """Generate a text response: classify emotion, build a prompt, then sample"""
    # Emotion analysis
    emotion = _analyze_emotion(input_text, models['emotion'])  # Analyze the emotion of the user input
    # Prompt engineering
    prompt = _generate_prompt(input_text, emotion['label'])    # Build a prompt for the detected emotion
    # Text generation with bounded length and sampling
    inputs = models['textgen_tokenizer'](prompt, return_tensors="pt").to('cpu')  # Tokenize the prompt
    outputs = models['textgen_model'].generate(
        inputs.input_ids,    # Input token IDs
        max_new_tokens=100,  # Strict token limit for response length
        temperature=0.7,     # Control randomness in text generation
        top_p=0.9,           # Nucleus sampling for diversity
        do_sample=True,      # Enable sampling to generate varied responses
        pad_token_id=models['textgen_tokenizer'].eos_token_id  # Use the end-of-sequence token for padding
    )
    return _process_response(
        models['textgen_tokenizer'].decode(outputs[0], skip_special_tokens=True)  # Decode and clean the response
    )
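# Note: for a decoder-only model like Qwen, decode(outputs[0]) returns the
# prompt followed by the continuation, which is why _process_response() splits
# on the final "Response:" marker to recover only the generated reply.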
def _generate_audio_response(text, models):
    """Convert text to speech with performance optimizations"""
    # Process the text input for TTS
    inputs = models['tts_processor'](text=text, return_tensors="pt")  # Tokenize input text for TTS
    # Generate a spectrogram conditioned on the speaker embedding
    spectrogram = models['tts_model'].generate_speech(
        inputs["input_ids"],          # Input token IDs for TTS
        models['speaker_embeddings']  # Use the preloaded speaker embeddings
    )
    # Generate the waveform
    with torch.no_grad():  # Disable gradient calculation for inference
        waveform = models['tts_vocoder'](spectrogram)  # Generate an audio waveform from the spectrogram
    # Save the audio file
    sf.write("response.wav", waveform.numpy(), samplerate=16000)  # SpeechT5 produces 16 kHz audio
    return "response.wav"  # Return the path to the saved audio file
##########################################
# Main Application Flow
##########################################
def main():
    """Primary execution flow"""
    # Load models once
    ml_models = _load_models()  # Load all models and cache them
    # Display interface
    user_input = _display_interface()  # Show the user input interface
    if user_input:  # Proceed only when the user has entered text
        # Text generation stage
        with st.spinner("🔍 Analyzing emotions and generating response..."):  # Show loading spinner
            text_response = _generate_text_response(user_input, ml_models)   # Generate text response
        # Display results
        st.subheader("📄 Generated Response")      # Subheader for the response section
        st.markdown(f"```\n{text_response}\n```")  # Display the generated response
        # Audio generation stage
        with st.spinner("🔊 Converting to speech..."):  # Show loading spinner
            audio_file = _generate_audio_response(text_response, ml_models)  # Generate audio response
        st.audio(audio_file, format="audio/wav")  # Play the audio file in the app


if __name__ == "__main__":
    main()  # Execute the main function when the script is run