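"""Just Comment: detect the emotion of a user comment, draft a templated
reply with Qwen1.5-0.5B, and read the reply aloud via Microsoft SpeechT5."""
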
##########################################
# Step 0: Essential imports
##########################################
import streamlit as st  # Web interface
from transformers import (  # AI components
    pipeline,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    AutoModelForCausalLM,
    AutoTokenizer
)
from datasets import load_dataset  # Speaker voice data
import torch  # Tensor operations
import soundfile as sf  # Audio file writing

##########################################
# Initial configuration (MUST BE FIRST)
##########################################
st.set_page_config(  # set_page_config must be the first Streamlit call
    page_title="Just Comment",
    page_icon="💬",
    layout="centered"
)

##########################################
# Optimized model loader with caching
##########################################
@st.cache_resource(show_spinner=False)
def _load_components():
    """Load and cache all models with hardware-aware optimization"""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # float16 only pays off on GPU; CPU inference is more reliable in float32
    dtype = torch.float16 if device == "cuda" else torch.float32
    # Emotion classifier (fast)
    emotion_pipe = pipeline(
        "text-classification",
        model="Thea231/jhartmann_emotion_finetuning",
        device=device,
        truncation=True
    )
    # Text generator (optimized)
    text_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
    text_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen1.5-0.5B",
        torch_dtype=dtype,
        device_map="auto"
    )
    # TTS system (accelerated)
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "microsoft/speecht5_tts",
        torch_dtype=dtype
    ).to(device)
    tts_vocoder = SpeechT5HifiGan.from_pretrained(
        "microsoft/speecht5_hifigan",
        torch_dtype=dtype
    ).to(device)
    # Preloaded voice profile: one speaker x-vector from the CMU Arctic set
    speaker_emb = torch.tensor(
        load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
    ).unsqueeze(0).to(device=device, dtype=dtype)  # must match the TTS model's dtype
    return {
        "emotion": emotion_pipe,
        "text_model": text_model,
        "text_tokenizer": text_tokenizer,
        "tts_processor": tts_processor,
        "tts_model": tts_model,
        "tts_vocoder": tts_vocoder,
        "speaker_emb": speaker_emb,
        "device": device
    }
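
# st.cache_resource keeps one copy of these models per server process, so the
# heavy downloads above run once rather than on every Streamlit rerun.
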
##########################################
# User interface components
##########################################
def _show_interface():
    """Render the input interface and return the user's comment"""
    st.title("Just Comment")
    st.markdown("### I'm listening to you, my friend~")
    return st.text_area(  # Input field
        "📝 Enter your comment:",
        placeholder="Share your thoughts...",
        height=150,
        key="input"
    )

##########################################
# Core processing functions
##########################################
def _fast_emotion(text, analyzer):
    """Rapid emotion detection with input limits"""
    result = analyzer(text[:256], return_all_scores=True)[0]  # Cap input length for speed
    emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
    # Pick the highest-scoring label we have a template for; fall back to neutral
    return max(
        (e for e in result if e['label'].lower() in emotions),
        key=lambda x: x['score'],
        default={'label': 'neutral', 'score': 0.0}
    )

def _build_prompt(text, emotion):
    """Template-based prompt engineering keyed on the detected emotion"""
    templates = {
        "sadness": "Sadness detected: {text}\nRespond with: 1. Empathy 2. Support 3. Solution\nResponse:",
        "joy": "Joy detected: {text}\nRespond with: 1. Thanks 2. Praise 3. Engagement\nResponse:",
        "love": "Love detected: {text}\nRespond with: 1. Appreciation 2. Connection 3. Offer\nResponse:",
        "anger": "Anger detected: {text}\nRespond with: 1. Apology 2. Action 3. Compensation\nResponse:",
        "fear": "Fear detected: {text}\nRespond with: 1. Reassurance 2. Safety 3. Support\nResponse:",
        "surprise": "Surprise detected: {text}\nRespond with: 1. Acknowledgement 2. Solution 3. Follow-up\nResponse:",
        "neutral": "Feedback: {text}\nProfessional response:\n1. Acknowledgement\n2. Assistance\n3. Next steps\nResponse:"
    }
    return templates[emotion.lower()].format(text=text[:200])  # Truncate input before formatting
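
# Illustrative example (hypothetical input):
#   _build_prompt("The parcel arrived broken", "anger")
# returns:
#   "Anger detected: The parcel arrived broken\n"
#   "Respond with: 1. Apology 2. Action 3. Compensation\nResponse:"
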
def _generate_response(text, models):
    """Optimized text generation pipeline: classify, prompt, generate, post-process"""
    # Emotion detection
    emotion = _fast_emotion(text, models["emotion"])
    # Prompt construction
    prompt = _build_prompt(text, emotion["label"])
    # Generate text
    inputs = models["text_tokenizer"](
        prompt,
        return_tensors="pt",
        max_length=100,
        truncation=True
    ).to(models["device"])
    output = models["text_model"].generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # Avoid pad/eos ambiguity warnings
        max_new_tokens=120,  # Balanced length
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=models["text_tokenizer"].eos_token_id
    )
    # Keep only the text after the final "Response:" marker
    full_text = models["text_tokenizer"].decode(output[0], skip_special_tokens=True)
    response = full_text.split("Response:")[-1].strip()
    # Trim any trailing half-finished sentence
    if "." in response:
        response = response.rsplit(".", 1)[0] + "."
    return response[:200] or "Thank you for your feedback. We'll respond shortly."

def _text_to_speech(text, models):
    """High-speed audio synthesis with SpeechT5 and the HiFi-GAN vocoder"""
    inputs = models["tts_processor"](
        text=text[:150],  # Limit text length for responsiveness
        return_tensors="pt"
    ).to(models["device"])
    with torch.inference_mode():  # Accelerated inference
        spectrogram = models["tts_model"].generate_speech(
            inputs["input_ids"],
            models["speaker_emb"]
        )
        audio = models["tts_vocoder"](spectrogram)
    # Cast to float32 before writing; soundfile does not accept float16 arrays
    sf.write("output.wav", audio.to(torch.float32).cpu().numpy(), 16000)
    return "output.wav"
##########################################
# Main application flow
##########################################
def main():
    """Primary execution controller"""
    # Load components (cached after the first run)
    components = _load_components()
    # Show interface
    user_input = _show_interface()
    if user_input:
        # Text generation
        with st.spinner("🔍 Analyzing..."):
            response = _generate_response(user_input, components)
        # Display result
        st.subheader("📄 Response")
        st.markdown(f"```\n{response}\n```")  # Render the reply as a fenced block
        # Audio generation
        with st.spinner("🔊 Synthesizing..."):
            audio_path = _text_to_speech(response, components)
        st.audio(audio_path, format="audio/wav")

if __name__ == "__main__":
    main()
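
# To run the app locally (assuming streamlit, transformers, datasets, torch,
# and soundfile are installed):
#   streamlit run app.py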