Spaces:

joey1101
/

Comment_Reply

Running

App Files Files Community

joey1101 committed on Mar 28

Commit

befe307

verified ·

1 Parent(s): f551227

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -82

app.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # Step 0: Essential imports
 ##########################################
 import streamlit as st  # Web interface
-from transformers import (  # AI components: emotion analysis, TTS, and text generation
     pipeline,
     SpeechT5Processor,
     SpeechT5ForTextToSpeech,
@@ -10,15 +10,14 @@ from transformers import (  # AI components: emotion analysis, TTS, and text gen
     AutoModelForCausalLM,
     AutoTokenizer
 )
-from datasets import load_dataset  # To load speaker embeddings dataset
-import torch  # For tensor operations
-import soundfile as sf  # For writing audio files
-import sentencepiece  # Required for SpeechT5Processor tokenization
 ##########################################
 # Initial configuration (MUST BE FIRST)
 ##########################################
-st.set_page_config(  # Configure the web page
     page_title="Just Comment",
     page_icon="💬",
     layout="centered"
@@ -29,25 +28,25 @@ st.set_page_config(  # Configure the web page
 ##########################################
 @st.cache_resource(show_spinner=False)
 def _load_components():
-    """Load and cache all models with hardware optimization."""
-    device = "cuda" if torch.cuda.is_available() else "cpu"  # Detect available device
-    # Emotion classifier (fast and truncated)
     emotion_pipe = pipeline(
         "text-classification",
         model="Thea231/jhartmann_emotion_finetuning",
         device=device,
         truncation=True
     )
-    # Text generator (optimized with FP16 and auto device mapping)
     text_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
     text_model = AutoModelForCausalLM.from_pretrained(
         "Qwen/Qwen1.5-0.5B",
         torch_dtype=torch.float16,
         device_map="auto"
     )
     # TTS system (accelerated)
     tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
     tts_model = SpeechT5ForTextToSpeech.from_pretrained(
@@ -58,12 +57,12 @@ def _load_components():
         "microsoft/speecht5_hifigan",
         torch_dtype=torch.float16
     ).to(device)
-    # Preloaded voice profile for TTS
     speaker_emb = torch.tensor(
         load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
     ).unsqueeze(0).to(device)
     return {
         "emotion": emotion_pipe,
         "text_model": text_model,
@@ -79,10 +78,10 @@ def _load_components():
 # User interface components
 ##########################################
 def _show_interface():
-    """Render the input interface"""
-    st.title("🚀 Just Comment")  # Display the title with a rocket icon
-    st.markdown("### I'm listening to you, my friend～")  # Display the friendly subtitle
-    return st.text_area(  # Return user's comment input
         "📝 Enter your comment:",
         placeholder="Share your thoughts...",
         height=150,
@@ -93,39 +92,28 @@ def _show_interface():
 # Core processing functions
 ##########################################
 def _fast_emotion(text, analyzer):
-    """Rapid emotion detection with input length limit."""
-    result = analyzer(text[:256], return_all_scores=True)[0]  # Analyze only the first 256 characters for speed
-    valid_emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
-    # Select the emotion from valid ones or default to neutral
     return max(
-        (e for e in result if e['label'].lower() in valid_emotions),
         key=lambda x: x['score'],
         default={'label': 'neutral', 'score': 0}
     )
 def _build_prompt(text, emotion):
-    """Template-based prompt engineering in continuous prose (no bullet points)."""
-    templates = {
-        "sadness": "I sensed sadness in your comment: {text}. We are truly sorry and are here to support you.",
-        "joy": "Your comment radiates joy: {text}. Thank you for your bright feedback; we look forward to serving you even better.",
-        "love": "Your message exudes love: {text}. We appreciate your heartfelt words and cherish our connection with you.",
-        "anger": "I understand your comment reflects anger: {text}. Please accept our sincere apologies as we work to resolve your concerns.",
-        "fear": "It seems you feel fear in your comment: {text}. We want to reassure you that your safety and satisfaction are our priority.",
-        "surprise": "Your comment conveys surprise: {text}. We are delighted by your experience and will strive to exceed your expectations.",
-        "neutral": "Thank you for your comment: {text}. We remain committed to providing you with outstanding service."
-    }
-    # Build and return a continuous prompt with the user comment truncated to 200 characters
-    return templates.get(emotion.lower(), templates["neutral"]).format(text=text[:200])
 def _generate_response(text, models):
-    """Optimized text generation pipeline using the detected emotion."""
-    # Detect the dominant emotion quickly
-    detected = _fast_emotion(text, models["emotion"])
-    # Build prompt based on detected emotion (continuous sentences)
-    prompt = _build_prompt(text, detected["label"])
-    print(f"Generated prompt: {prompt}")  # Print prompt using f-string for debugging
-    # Generate text using the Qwen model
     inputs = models["text_tokenizer"](
         prompt,
         return_tensors="pt",
@@ -133,63 +121,54 @@ def _generate_response(text, models):
         truncation=True
     ).to(models["device"])
-    # Generate the response ensuring balanced length (approximately 50-200 tokens)
     output = models["text_model"].generate(
         inputs.input_ids,
-        max_new_tokens=120,  # Upper bound tokens for answer
-        min_length=50,       # Lower bound to ensure completeness
         temperature=0.7,
         top_p=0.9,
         do_sample=True,
         pad_token_id=models["text_tokenizer"].eos_token_id
     )
-    input_len = inputs.input_ids.shape[1]  # Determine the length of the prompt tokens
-    full_text = models["text_tokenizer"].decode(output[0], skip_special_tokens=True)
-    # Extract only the generated portion after "Response:" if present
-    response = full_text.split("Response:")[-1].strip()
-    print(f"Generated response: {response}")  # Debug print using f-string
-    # Return response ensuring it is within 50-200 words (approximation by character length here)
-    return response[:200]  # Truncate to 200 characters as an approximation
 def _text_to_speech(text, models):
-    """Efficiently synthesize speech for the given text."""
-    inputs = models["tts_processor"](
-        text=text[:150],  # Limit text length for TTS to 150 characters
-        return_tensors="pt"
-    ).to(models["device"])
-    with torch.inference_mode():  # Fast, no-grad inference
-        spectrogram = models["tts_model"].generate_speech(
-            inputs["input_ids"],
-            models["speaker_emb"]
-        )
         audio = models["tts_vocoder"](spectrogram)
-    sf.write("output.wav", audio.cpu().numpy(), 16000)  # Save generated audio as .wav at 16kHz
     return "output.wav"
 ##########################################
 # Main application flow
 ##########################################
 def main():
-    """Primary execution controller."""
-    components = _load_components()  # Load all models and components
-    user_input = _show_interface()  # Render input interface and capture user comment
-    if user_input:  # If a comment is provided
-        with st.spinner("🔍 Generating response..."):
-            generated_response = _generate_response(user_input, components)  # Generate response based on emotion and text
         st.subheader("📄 Response")
-        st.markdown(
-            f"<p style='color:#3498DB; font-size:20px;'>{generated_response}</p>",
-            unsafe_allow_html=True
-        )  # Display the generated response in styled format
-        with st.spinner("🔊 Synthesizing audio..."):
-            audio_file = _text_to_speech(generated_response, components)  # Convert response to speech
-            st.audio(audio_file, format="audio/wav", start_time=0)  # Embed auto-playing audio player
-        print(f"Final generated response: {generated_response}")  # Debug output using f-string
-# Run the main function when the script is executed
 if __name__ == "__main__":
-    main()  # Call the main function

 # Step 0: Essential imports
 ##########################################
 import streamlit as st  # Web interface
+from transformers import (  # AI components
     pipeline,
     SpeechT5Processor,
     SpeechT5ForTextToSpeech,
     AutoModelForCausalLM,
     AutoTokenizer
 )
+from datasets import load_dataset  # Voice data
+import torch  # Tensor operations
+import soundfile as sf  # Audio processing
 ##########################################
 # Initial configuration (MUST BE FIRST)
 ##########################################
+st.set_page_config(  # Set page config first
     page_title="Just Comment",
     page_icon="💬",
     layout="centered"
 ##########################################
 @st.cache_resource(show_spinner=False)
 def _load_components():
+    """Load and cache all models with hardware optimization"""
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    # Emotion classifier (fast)
     emotion_pipe = pipeline(
         "text-classification",
         model="Thea231/jhartmann_emotion_finetuning",
         device=device,
         truncation=True
     )
+    # Text generator (optimized)
     text_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
     text_model = AutoModelForCausalLM.from_pretrained(
         "Qwen/Qwen1.5-0.5B",
         torch_dtype=torch.float16,
         device_map="auto"
     )
     # TTS system (accelerated)
     tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
     tts_model = SpeechT5ForTextToSpeech.from_pretrained(
         "microsoft/speecht5_hifigan",
         torch_dtype=torch.float16
     ).to(device)
+    # Preloaded voice profile
     speaker_emb = torch.tensor(
         load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
     ).unsqueeze(0).to(device)
     return {
         "emotion": emotion_pipe,
         "text_model": text_model,
 # User interface components
 ##########################################
 def _show_interface():
+    """Render input interface"""
+    st.title("Just Comment")
+    st.markdown("### I'm listening to you, my friend～")
+    return st.text_area(  # Input field
         "📝 Enter your comment:",
         placeholder="Share your thoughts...",
         height=150,
 # Core processing functions
 ##########################################
 def _fast_emotion(text, analyzer):
+    """Rapid emotion detection with input limits"""
+    result = analyzer(text[:256], return_all_scores=True)[0]  # Limit input length
+    emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
     return max(
+        (e for e in result if e['label'].lower() in emotions),
         key=lambda x: x['score'],
         default={'label': 'neutral', 'score': 0}
     )
 def _build_prompt(text, emotion):
+    """Template-based prompt engineering for response generation"""
+    return f"{emotion.capitalize()} detected: {text}\nRespond with a coherent and supportive response."
 def _generate_response(text, models):
     """Generate a short reply to *text* with the cached language model.

     The comment's emotion is detected, a prompt is built from it, and a sampled
     continuation is decoded. Only the newly generated tokens are decoded, so
     the prompt itself is not echoed back into the reply.

     Args:
         text: The user's comment.
         models: Mapping read under the keys "emotion", "text_tokenizer",
             "text_model" and "device".

     Returns:
         The generated continuation, stripped and cut to 200 characters, or
         "Thank you for your feedback." when the continuation is empty.
     """
     emotion = _fast_emotion(text, models["emotion"])
     prompt = _build_prompt(text, emotion["label"])
     tokenizer = models["text_tokenizer"]
     # NOTE(review): this listing comes from a diff whose hunk boundary falls
     # inside the tokenizer call, so one argument line may be hidden between
     # `return_tensors` and `truncation` - confirm against the real app.py.
     inputs = tokenizer(
         prompt,
         return_tensors="pt",
         truncation=True
     ).to(models["device"])
     # Unpacking the encoding forwards the attention mask along with input_ids.
     output = models["text_model"].generate(
         **inputs,
         max_new_tokens=100,  # Balanced length for response
         temperature=0.7,
         top_p=0.9,
         do_sample=True,
         pad_token_id=tokenizer.eos_token_id
     )
     # generate() returns the prompt tokens followed by the new ones; skip the
     # prompt so the 200-character budget is spent on the reply itself.
     prompt_len = inputs["input_ids"].shape[1]
     response = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
     return response.strip()[:200] or "Thank you for your feedback."
 def _text_to_speech(text, models):
     """Synthesize speech for *text* and save it as ``output.wav``.

     Only the first 150 characters of *text* are synthesized.

     Args:
         text: The reply to speak.
         models: Mapping read under the keys "tts_processor", "tts_model",
             "tts_vocoder", "speaker_emb" and "device".

     Returns:
         The path of the written file, always "output.wav".
     """
     inputs = models["tts_processor"](text=text[:150], return_tensors="pt").to(models["device"])
     with torch.inference_mode():  # No autograd bookkeeping during synthesis
         spectrogram = models["tts_model"].generate_speech(inputs["input_ids"], models["speaker_emb"])
         audio = models["tts_vocoder"](spectrogram)
     # The vocoder appears to be loaded in float16 elsewhere in this file, and
     # soundfile only writes float32/float64/int16/int32 arrays, so upcast first.
     waveform = audio.to(torch.float32).cpu().numpy()
     # NOTE(review): the fixed file name is shared by every session of the app;
     # concurrent users would overwrite each other's audio.
     sf.write("output.wav", waveform, 16000)
     return "output.wav"
 ##########################################
 # Main application flow
 ##########################################
 def main():
     """Run one pass of the app: load models, read the comment, answer in text and audio."""
     models = _load_components()
     comment = _show_interface()
     # Nothing entered yet: leave only the input widgets on the page.
     if not comment:
         return

     with st.spinner("🔍 Analyzing..."):
         reply = _generate_response(comment, models)

     st.subheader("📄 Response")
     fenced_reply = f"```\n{reply}\n```"
     st.markdown(fenced_reply)

     with st.spinner("🔊 Synthesizing..."):
         wav_path = _text_to_speech(reply, models)
         st.audio(wav_path, format="audio/wav")


 # Script entry point
 if __name__ == "__main__":
     main()