File size: 9,352 Bytes
152d61c
d7ef86b
152d61c
d7ef86b
 
317475a
 
 
 
 
 
d7ef86b
317475a
 
 
 
 
c39c802
0e85ac7
37b0c44
0e85ac7
d7ef86b
317475a
 
 
 
0e85ac7
 
152d61c
d7ef86b
152d61c
317475a
d7ef86b
 
 
 
 
317475a
 
d7ef86b
 
 
 
 
317475a
d7ef86b
 
 
317475a
d7ef86b
 
 
 
317475a
 
 
d7ef86b
 
 
317475a
 
7abe73c
3970052
152d61c
d7ef86b
152d61c
d7ef86b
 
317475a
 
d7ef86b
 
317475a
 
 
 
0a4b920
e4cf4e2
152d61c
d7ef86b
7abe73c
d7ef86b
 
317475a
 
 
 
0a4b920
d7ef86b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1dada0
317475a
0a4b920
d7ef86b
317475a
d7ef86b
 
 
 
 
317475a
d7ef86b
 
 
 
 
 
 
317475a
d7ef86b
 
317475a
d7ef86b
 
317475a
d7ef86b
317475a
 
 
 
 
 
d7ef86b
 
 
317475a
0a4b920
 
d7ef86b
 
317475a
 
d7ef86b
 
 
317475a
 
d7ef86b
 
 
317475a
 
d7ef86b
 
317475a
 
152d61c
 
d7ef86b
152d61c
 
d7ef86b
 
317475a
d7ef86b
 
317475a
d7ef86b
317475a
d7ef86b
317475a
 
d7ef86b
 
317475a
 
d7ef86b
 
317475a
 
 
c39c802
152d61c
317475a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
##########################################
# Step 0: Import required libraries
##########################################
import streamlit as st  # For web interface
from transformers import (
    pipeline,  # For loading pre-trained models
    SpeechT5Processor,  # For text-to-speech processing
    SpeechT5ForTextToSpeech,  # TTS model
    SpeechT5HifiGan,  # Vocoder for generating audio waveforms
    AutoModelForCausalLM,  # For text generation
    AutoTokenizer  # For tokenizing input text
)  # AI model components

from datasets import load_dataset  # To load voice embeddings
import torch  # For tensor computations
import soundfile as sf  # For handling audio files
import re  # For regular expressions in text processing

##########################################
# Initial configuration
##########################################
# NOTE: st.set_page_config must be the FIRST Streamlit command executed in the
# script, before any other st.* call, or Streamlit raises an error at runtime.
st.set_page_config(
    page_title="Just Comment",  # Browser tab / window title
    page_icon="๐Ÿ’ฌ",  # Favicon shown in the browser tab
    layout="centered",  # Constrain content to a centered column
    initial_sidebar_state="collapsed"  # Hide the sidebar on first load
)

##########################################
# Global model loading with caching
##########################################
@st.cache_resource(show_spinner=False)  # Loaded once per server process, reused across reruns
def _load_models():
    """Build and cache every ML component used by the app.

    Returns a dict with keys: 'emotion', 'textgen_tokenizer', 'textgen_model',
    'tts_processor', 'tts_model', 'tts_vocoder', 'speaker_embeddings'.
    """
    models = {}

    # Emotion classifier: maps a user comment to an emotion label + score.
    models['emotion'] = pipeline(
        "text-classification",
        model="Thea231/jhartmann_emotion_finetuning",
        truncation=True,  # avoid errors on inputs longer than the model limit
    )

    # Causal LM used to draft the reply text.
    models['textgen_tokenizer'] = AutoTokenizer.from_pretrained(
        "Qwen/Qwen1.5-0.5B",
        use_fast=True,
    )
    models['textgen_model'] = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen1.5-0.5B",
        torch_dtype=torch.float16,  # half precision for faster inference
    )

    # SpeechT5 text-to-speech stack: processor -> acoustic model -> vocoder.
    models['tts_processor'] = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    models['tts_model'] = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    models['tts_vocoder'] = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Fixed speaker voice: x-vector #7306 from the CMU Arctic embedding set.
    xvector = load_dataset(
        "Matthijs/cmu-arctic-xvectors", split="validation"
    )[7306]["xvector"]
    models['speaker_embeddings'] = torch.tensor(xvector).unsqueeze(0)  # add batch dim

    return models

##########################################
# UI Components
##########################################
def _display_interface():
    """Render the page header and return the raw comment typed by the user."""
    st.title("Just Comment")  # Main page heading
    st.markdown("### I'm listening to you, my friend๏ฝž")  # Friendly subtitle

    user_text = st.text_area(
        "๐Ÿ“ Enter your comment:",  # Field label
        placeholder="Type your message here...",  # Hint shown when empty
        height=150,  # Pixel height of the input box
        key="user_input",  # Stable widget key across reruns
    )
    return user_text

##########################################
# Core Processing Functions
##########################################
def _analyze_emotion(text, classifier):
    """Identify dominant emotion with confidence threshold"""
    results = classifier(text, return_all_scores=True)[0]  # Get emotion scores
    valid_emotions = {'sadness', 'joy', 'love', 'anger', 'fear', 'surprise'}  # Define valid emotions
    filtered = [e for e in results if e['label'].lower() in valid_emotions]  # Filter results by valid emotions
    return max(filtered, key=lambda x: x['score'])  # Return the emotion with the highest score

def _generate_prompt(text, emotion):
    """Create structured prompts for all emotion types"""
    prompt_templates = {
        "sadness": (
            "Sadness detected: {input}\n"
            "Required response structure:\n"
            "1. Empathetic acknowledgment\n2. Support offer\n3. Solution proposal\n"
            "Response:"
        ),
        "joy": (
            "Joy detected: {input}\n"
            "Required response structure:\n"
            "1. Enthusiastic thanks\n2. Positive reinforcement\n3. Future engagement\n"
            "Response:"
        ),
        "love": (
            "Affection detected: {input}\n"
            "Required response structure:\n"
            "1. Warm appreciation\n2. Community focus\n3. Exclusive benefit\n"
            "Response:"
        ),
        "anger": (
            "Anger detected: {input}\n"
            "Required response structure:\n"
            "1. Sincere apology\n2. Action steps\n3. Compensation\n"
            "Response:"
        ),
        "fear": (
            "Concern detected: {input}\n"
            "Required response structure:\n"
            "1. Reassurance\n2. Safety measures\n3. Support options\n"
            "Response:"
        ),
        "surprise": (
            "Surprise detected: {input}\n"
            "Required response structure:\n"
            "1. Acknowledge uniqueness\n2. Creative solution\n3. Follow-up\n"
            "Response:"
        )
    }
    return prompt_templates.get(emotion.lower(), "").format(input=text)  # Format and return the appropriate prompt

def _process_response(raw_text):
    """Clean and format the generated response"""
    # Extract text after last "Response:" marker
    processed = raw_text.split("Response:")[-1].strip()
    
    # Remove incomplete sentences
    if '.' in processed:
        processed = processed.rsplit('.', 1)[0] + '.'  # Ensure the response ends with a period
    
    # Ensure length between 50-200 characters
    return processed[:200].strip() if len(processed) > 50 else "Thank you for your feedback. We value your input and will respond shortly."

def _generate_text_response(input_text, models):
    """Produce a cleaned reply for *input_text* using the cached models dict."""
    tokenizer = models['textgen_tokenizer']

    # 1) Detect the dominant emotion, 2) build the matching structured prompt.
    detected = _analyze_emotion(input_text, models['emotion'])
    prompt = _generate_prompt(input_text, detected['label'])

    # 3) Sample a completion with bounded length and moderate randomness.
    encoded = tokenizer(prompt, return_tensors="pt").to('cpu')
    generated = models['textgen_model'].generate(
        encoded.input_ids,
        max_new_tokens=100,  # hard limit on reply length
        temperature=0.7,  # moderate randomness
        top_p=0.9,  # nucleus sampling
        do_sample=True,  # sampling (not greedy) for varied replies
        pad_token_id=tokenizer.eos_token_id,  # pad with EOS token
    )

    # 4) Decode and post-process into the final user-facing text.
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return _process_response(decoded)

def _generate_audio_response(text, models):
    """Synthesize *text* into a 16 kHz WAV file and return the file path."""
    # Tokenize the reply text for the TTS acoustic model.
    tts_inputs = models['tts_processor'](text=text, return_tensors="pt")

    # Acoustic model: token IDs + fixed speaker embedding -> spectrogram.
    mel = models['tts_model'].generate_speech(
        tts_inputs["input_ids"],
        models['speaker_embeddings'],
    )

    # Vocoder: spectrogram -> waveform; inference only, no gradients needed.
    with torch.no_grad():
        audio = models['tts_vocoder'](mel)

    out_path = "response.wav"
    sf.write(out_path, audio.numpy(), samplerate=16000)  # SpeechT5 outputs 16 kHz
    return out_path

##########################################
# Main Application Flow
##########################################
def main():
    """Wire the Streamlit UI to the emotion -> text -> speech pipeline."""
    ml_models = _load_models()  # cached; cheap after the first run
    user_input = _display_interface()

    if not user_input:
        return  # nothing typed yet; wait for the next rerun

    # Stage 1: emotion analysis + text generation.
    with st.spinner("๐Ÿ” Analyzing emotions and generating response..."):
        text_response = _generate_text_response(user_input, ml_models)

    st.subheader("๐Ÿ“„ Generated Response")
    st.markdown(f"```\n{text_response}\n```")  # monospace block for the reply

    # Stage 2: text-to-speech playback.
    with st.spinner("๐Ÿ”Š Converting to speech..."):
        audio_file = _generate_audio_response(text_response, ml_models)
        st.audio(audio_file, format="audio/wav")

if __name__ == "__main__":
    main()  # Run the app only when executed as a script, not when imported