joey1101 committed
Commit f551227 · verified · 1 Parent(s): 5e4841e

Update app.py

Files changed (1)
  1. app.py +67 -66
app.py CHANGED
@@ -2,7 +2,7 @@
 # Step 0: Essential imports
 ##########################################
 import streamlit as st # Web interface
-from transformers import ( # AI components
+from transformers import ( # AI components: emotion analysis, TTS, and text generation
     pipeline,
     SpeechT5Processor,
     SpeechT5ForTextToSpeech,
@@ -10,14 +10,15 @@ from transformers import ( # AI components
     AutoModelForCausalLM,
     AutoTokenizer
 )
-from datasets import load_dataset # Voice data
-import torch # Tensor operations
-import soundfile as sf # Audio processing
+from datasets import load_dataset # To load speaker embeddings dataset
+import torch # For tensor operations
+import soundfile as sf # For writing audio files
+import sentencepiece # Required for SpeechT5Processor tokenization
 
 ##########################################
 # Initial configuration (MUST BE FIRST)
 ##########################################
-st.set_page_config( # Set page config first
+st.set_page_config( # Configure the web page
     page_title="Just Comment",
     page_icon="💬",
     layout="centered"
@@ -28,10 +29,10 @@ st.set_page_config( # Set page config first
 ##########################################
 @st.cache_resource(show_spinner=False)
 def _load_components():
-    """Load and cache all models with hardware optimization"""
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    """Load and cache all models with hardware optimization."""
+    device = "cuda" if torch.cuda.is_available() else "cpu" # Detect available device
 
-    # Emotion classifier (fast)
+    # Emotion classifier (fast and truncated)
     emotion_pipe = pipeline(
         "text-classification",
         model="Thea231/jhartmann_emotion_finetuning",
@@ -39,7 +40,7 @@ def _load_components():
         truncation=True
     )
 
-    # Text generator (optimized)
+    # Text generator (optimized with FP16 and auto device mapping)
     text_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
     text_model = AutoModelForCausalLM.from_pretrained(
         "Qwen/Qwen1.5-0.5B",
@@ -58,7 +59,7 @@ def _load_components():
         torch_dtype=torch.float16
     ).to(device)
 
-    # Preloaded voice profile
+    # Preloaded voice profile for TTS
     speaker_emb = torch.tensor(
         load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
     ).unsqueeze(0).to(device)
@@ -78,10 +79,10 @@ def _load_components():
 # User interface components
 ##########################################
 def _show_interface():
-    """Render input interface"""
-    st.title("Just Comment")
-    st.markdown(f"### I'm listening to you, my friend~")
-    return st.text_area( # Input field
+    """Render the input interface"""
+    st.title("🚀 Just Comment") # Display the title with a rocket icon
+    st.markdown("### I'm listening to you, my friend~") # Display the friendly subtitle
+    return st.text_area( # Return user's comment input
         "📝 Enter your comment:",
         placeholder="Share your thoughts...",
         height=150,
@@ -92,37 +93,39 @@ def _show_interface():
 # Core processing functions
 ##########################################
 def _fast_emotion(text, analyzer):
-    """Rapid emotion detection with input limits"""
-    result = analyzer(text[:256], return_all_scores=True)[0] # Limit input length
-    emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
+    """Rapid emotion detection with input length limit."""
+    result = analyzer(text[:256], return_all_scores=True)[0] # Analyze only the first 256 characters for speed
+    valid_emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
+    # Select the emotion from valid ones or default to neutral
     return max(
-        (e for e in result if e['label'].lower() in emotions),
+        (e for e in result if e['label'].lower() in valid_emotions),
         key=lambda x: x['score'],
         default={'label': 'neutral', 'score': 0}
     )
 
 def _build_prompt(text, emotion):
-    """Template-based prompt engineering"""
+    """Template-based prompt engineering in continuous prose (no bullet points)."""
     templates = {
-        "sadness": f"Sadness detected: {{text}}\nRespond with: 1. Empathy 2. Support 3. Solution\nResponse:",
-        "joy": f"Joy detected: {{text}}\nRespond with: 1. Thanks 2. Praise 3. Engagement\nResponse:",
-        "love": f"Love detected: {{text}}\nRespond with: 1. Appreciation 2. Connection 3. Offer\nResponse:",
-        "anger": f"Anger detected: {{text}}\nRespond with: 1. Apology 2. Action 3. Compensation\nResponse:",
-        "fear": f"Fear detected: {{text}}\nRespond with: 1. Reassurance 2. Safety 3. Support\nResponse:",
-        "surprise": f"Surprise detected: {{text}}\nRespond with: 1. Acknowledgement 2. Solution 3. Follow-up\nResponse:",
-        "neutral": f"Feedback: {{text}}\nProfessional response:\n1. Acknowledgement\n2. Assistance\n3. Next steps\nResponse:"
+        "sadness": "I sensed sadness in your comment: {text}. We are truly sorry and are here to support you.",
+        "joy": "Your comment radiates joy: {text}. Thank you for your bright feedback; we look forward to serving you even better.",
+        "love": "Your message exudes love: {text}. We appreciate your heartfelt words and cherish our connection with you.",
+        "anger": "I understand your comment reflects anger: {text}. Please accept our sincere apologies as we work to resolve your concerns.",
+        "fear": "It seems you feel fear in your comment: {text}. We want to reassure you that your safety and satisfaction are our priority.",
+        "surprise": "Your comment conveys surprise: {text}. We are delighted by your experience and will strive to exceed your expectations.",
+        "neutral": "Thank you for your comment: {text}. We remain committed to providing you with outstanding service."
    }
-    return templates[emotion.lower()].format(text=text[:200]) # Input truncation
+    # Build and return a continuous prompt with the user comment truncated to 200 characters
+    return templates.get(emotion.lower(), templates["neutral"]).format(text=text[:200])
 
 def _generate_response(text, models):
-    """Optimized text generation pipeline"""
-    # Emotion detection
-    emotion = _fast_emotion(text, models["emotion"])
-
-    # Prompt construction
-    prompt = _build_prompt(text, emotion["label"])
-
-    # Generate text
+    """Optimized text generation pipeline using the detected emotion."""
+    # Detect the dominant emotion quickly
+    detected = _fast_emotion(text, models["emotion"])
+    # Build prompt based on detected emotion (continuous sentences)
+    prompt = _build_prompt(text, detected["label"])
+    print(f"Generated prompt: {prompt}") # Print prompt using f-string for debugging
+
+    # Generate text using the Qwen model
     inputs = models["text_tokenizer"](
         prompt,
         return_tensors="pt",
@@ -130,65 +133,63 @@ def _generate_response(text, models):
         truncation=True
     ).to(models["device"])
 
+    # Generate the response ensuring balanced length (approximately 50-200 tokens)
     output = models["text_model"].generate(
         inputs.input_ids,
-        max_new_tokens=120, # Balanced length
+        max_new_tokens=120, # Upper bound tokens for answer
+        min_length=50, # Lower bound to ensure completeness
         temperature=0.7,
         top_p=0.9,
         do_sample=True,
         pad_token_id=models["text_tokenizer"].eos_token_id
     )
 
-    # Process output
+    input_len = inputs.input_ids.shape[1] # Determine the length of the prompt tokens
    full_text = models["text_tokenizer"].decode(output[0], skip_special_tokens=True)
+    # Extract only the generated portion after "Response:" if present
    response = full_text.split("Response:")[-1].strip()
-
-    # Ensure completeness
-    if "." in response:
-        response = response.rsplit(".", 1)[0] + "."
-    return response[:200] or "Thank you for your feedback. We'll respond shortly."
+    print(f"Generated response: {response}") # Debug print using f-string
+    # Return response ensuring it is within 50-200 words (approximation by character length here)
+    return response[:200] # Truncate to 200 characters as an approximation
 
 def _text_to_speech(text, models):
-    """High-speed audio synthesis"""
+    """Efficiently synthesize speech for the given text."""
     inputs = models["tts_processor"](
-        text=text[:150], # Limit text length
+        text=text[:150], # Limit text length for TTS to 150 characters
         return_tensors="pt"
     ).to(models["device"])
 
-    with torch.inference_mode(): # Accelerated inference
+    with torch.inference_mode(): # Fast, no-grad inference
         spectrogram = models["tts_model"].generate_speech(
             inputs["input_ids"],
             models["speaker_emb"]
         )
         audio = models["tts_vocoder"](spectrogram)
 
-    sf.write("output.wav", audio.cpu().numpy(), 16000)
+    sf.write("output.wav", audio.cpu().numpy(), 16000) # Save generated audio as .wav at 16kHz
     return "output.wav"
 
 ##########################################
 # Main application flow
 ##########################################
 def main():
-    """Primary execution controller"""
-    # Load components
-    components = _load_components()
-
-    # Show interface
-    user_input = _show_interface()
+    """Primary execution controller."""
+    components = _load_components() # Load all models and components
+    user_input = _show_interface() # Render input interface and capture user comment
 
-    if user_input:
-        # Text generation
-        with st.spinner("🔍 Analyzing..."):
-            response = _generate_response(user_input, components)
-
-        # Display result
-        st.subheader(f"📄 Response")
-        st.markdown(f"```\n{response}\n```") # f-string formatted
-
-        # Audio generation
-        with st.spinner("🔊 Synthesizing..."):
-            audio_path = _text_to_speech(response, components)
-            st.audio(audio_path, format="audio/wav")
+    if user_input: # If a comment is provided
+        with st.spinner("🔍 Generating response..."):
+            generated_response = _generate_response(user_input, components) # Generate response based on emotion and text
+        st.subheader("📄 Response")
+        st.markdown(
+            f"<p style='color:#3498DB; font-size:20px;'>{generated_response}</p>",
+            unsafe_allow_html=True
+        ) # Display the generated response in styled format
+        with st.spinner("🔊 Synthesizing audio..."):
+            audio_file = _text_to_speech(generated_response, components) # Convert response to speech
+            st.audio(audio_file, format="audio/wav", start_time=0) # Embed auto-playing audio player
+        print(f"Final generated response: {generated_response}") # Debug output using f-string
 
+# Run the main function when the script is executed
 if __name__ == "__main__":
-    main()
+    main() # Call the main function
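
A side note on the new `_generate_response`: the rewritten templates no longer end with a literal `Response:` marker, so `full_text.split("Response:")[-1]` will usually return the entire decoded text, prompt included, while the newly computed `input_len` goes unused. A minimal sketch of how the prompt tokens could be sliced off before decoding instead; `extract_generated_text` is a hypothetical helper, not part of this commit:

```python
def extract_generated_text(tokenizer, output_ids, input_len):
    """Decode only the tokens produced after the prompt (hypothetical helper)."""
    # output_ids is a single sequence from model.generate(); its first
    # input_len tokens are the prompt itself, so skip them before decoding.
    return tokenizer.decode(output_ids[input_len:], skip_special_tokens=True).strip()

# Possible use inside _generate_response, reusing the names from the diff above:
# response = extract_generated_text(models["text_tokenizer"], output[0], input_len)
```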
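
Also worth flagging, though untouched by this commit: recent transformers releases deprecate `return_all_scores=True` on text-classification pipelines in favor of `top_k=None`, which yields the same list of per-label scores. A sketch of the equivalent call, under that assumption:

```python
from transformers import pipeline

emotion_pipe = pipeline(
    "text-classification",
    model="Thea231/jhartmann_emotion_finetuning",
    top_k=None,      # replaces the deprecated return_all_scores=True
    truncation=True
)

# Same shape as before: one list of {'label': ..., 'score': ...} dicts per input
scores = emotion_pipe("The delivery was late and the box was damaged."[:256])[0]
```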