nuojohnchen committed
Commit a386c90 · verified · 1 Parent(s): 74a5c4a

Update app.py

Files changed (1):
  1. app.py +66 -31
app.py CHANGED
@@ -13,21 +13,15 @@ DESCRIPTION = '''
 <div>
 <h1 style="text-align: center;">JudgeLRM</h1>
 <p>This Space demonstrates the <a href="https://huggingface.co/nuojohnchen/JudgeLRM-7B"><b>JudgeLRM</b></a> model, designed to evaluate the quality of two AI assistant responses. JudgeLRM is a family of judgment-oriented LLMs trained using reinforcement learning (RL) with judge-wise, outcome-driven rewards. JudgeLRM models consistently outperform both SFT-tuned and state-of-the-art reasoning models. Notably, JudgeLRM-3B surpasses GPT-4, and JudgeLRM-7B outperforms DeepSeek-R1 by 2.79\% in F1 score, particularly excelling in judge tasks requiring deep reasoning.</p>
-<p>Enter an instruction and two responses, and the model will score them on a scale of 1-10 (higher is better).</p>
+<p>Enter an instruction and two responses, and the model will think, reason and score them on a scale of 1-10 (higher is better).</p>
 <p>You can also select Hugging Face models to automatically generate responses for evaluation.</p>
 </div>
 '''
 
 LICENSE = """
-<div style="font-family: monospace; white-space: pre; margin-top: 20px; line-height: 1.2;">
-@misc{XtraGPT,
-      title = {JudgeLRM},
-      url = {https://huggingface.co/nuojohnchen/JudgeLRM-7B},
-      author = {Nuo Chen, Zhiyuan Hu, Qingyun Zou, Jiaying Wu, Qian Wang, Bryan Hooi, Bingsheng He},
-      month = {March},
-      year = {2025}
-}
-</div>
+<p/>
+---
+Built on Qwen2.5 architecture
 """
 
 PLACEHOLDER = """
@@ -60,12 +54,8 @@ MODEL_PATHS = {
 POPULAR_MODELS = [
     "Qwen/Qwen2.5-7B-Instruct",
     "01-ai/Yi-6B-Chat",
-    "microsoft/phi-2",
-    "FreedomIntelligence/Apollo-7B",
-    "tiiuae/falcon-7b-instruct",
-    "HuggingFaceH4/zephyr-7b-beta",
-    "stabilityai/stablelm-3b-4e1t",
-    "openchat/openchat-3.5-0106"
+    "openchat/openchat-3.5-0106",
+    "FreedomIntelligence/Apollo-7B"
 ]
 
 # Global variables for model and tokenizer
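For orientation: the trimmed POPULAR_MODELS list feeds the model-picker dropdowns, and the partially visible get_model_path helper (its two return statements appear in the next hunk) lets a free-text entry override the dropdown. A minimal sketch of that pattern; the get_model_path body here is a plausible reconstruction, not verbatim from app.py:

import gradio as gr

POPULAR_MODELS = [
    "Qwen/Qwen2.5-7B-Instruct",
    "01-ai/Yi-6B-Chat",
    "openchat/openchat-3.5-0106",
    "FreedomIntelligence/Apollo-7B"
]

def get_model_path(dropdown_value, custom_value):
    # Assumed behavior, consistent with the two returns visible below:
    # prefer a non-empty custom entry over the dropdown selection
    if custom_value and custom_value.strip():
        return custom_value.strip()
    return dropdown_value

with gr.Blocks() as demo:
    dropdown = gr.Dropdown(choices=POPULAR_MODELS, value=POPULAR_MODELS[0], label="Popular models")
    custom = gr.Textbox(label="Or enter any Hugging Face model ID")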
@@ -92,6 +82,31 @@ def get_model_path(dropdown_value, custom_value):
         return custom_value.strip()
     return dropdown_value
 
+# Function to clean model response
+def clean_response_text(text):
+    """Remove conversation markers and other artifacts from model response"""
+    # Remove any <|user|> or <|assistant|> markers and subsequent conversations
+    user_pattern = r'<\|user\|>.*'
+    assistant_pattern = r'<\|assistant\|>.*'
+
+    # Try to clean with regex first (using re.DOTALL to match across lines)
+    cleaned = re.sub(user_pattern, '', text, flags=re.DOTALL)
+    cleaned = re.sub(assistant_pattern, '', cleaned, flags=re.DOTALL)
+
+    # If that didn't work well, try a more aggressive approach
+    if '<|user|>' in cleaned or '<|assistant|>' in cleaned:
+        parts = text.split('<|user|>')
+        if len(parts) > 0:
+            cleaned = parts[0].strip()
+
+    # Remove other common markers
+    markers = ['<user>', '</user>', '<assistant>', '</assistant>',
+               'User:', 'Assistant:', 'Human:', 'AI:']
+    for marker in markers:
+        cleaned = cleaned.replace(marker, '')
+
+    return cleaned.strip()
+
 # Function to generate response from a model
 def generate_response(instruction, model_path, progress=gr.Progress()):
     """Generate a response from a specified model"""
@@ -115,7 +130,7 @@ def generate_response(instruction, model_path, progress=gr.Progress()):
             do_sample=True
         )
 
-        # Decode response and clean it
+        # Decode response
        full_response = response_tokenizer.decode(output[0], skip_special_tokens=True)
 
         # Remove the prompt part from the response
@@ -131,6 +146,9 @@ def generate_response(instruction, model_path, progress=gr.Progress()):
         for token in ["<|assistant|>", "<assistant>", "Assistant:", "A:"]:
             clean_response = clean_response.replace(token, "").strip()
 
+        # Apply additional cleaning to remove conversation markers
+        clean_response = clean_response_text(clean_response)
+
         # Clean up resources
         del response_model
         del response_tokenizer
@@ -141,7 +159,7 @@ def generate_response(instruction, model_path, progress=gr.Progress()):
     except Exception as e:
         return f"Error generating response: {str(e)}"
 
-@spaces.GPU(duration=200)
+@spaces.GPU(duration=120)
 def judge_responses(instruction, response1, response2, model_name, temperature=0.1, max_new_tokens=2048):
     """
     Evaluate the quality of two responses
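This decorator change (and the matching one on generate_and_judge below) cuts the requested GPU reservation from 200 to 120 seconds. On ZeroGPU Spaces, spaces.GPU allocates a GPU only for the decorated call, and duration caps how long the call may hold it; a minimal sketch of the pattern, with judge as a stand-in name:

import spaces

@spaces.GPU(duration=120)  # GPU is granted per call, for at most 120 seconds
def judge(prompt):
    # model inference would run here on the allocated GPU
    ...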
@@ -216,15 +234,15 @@ def judge_responses(instruction, response1, response2, model_name, temperature=0.1, max_new_tokens=2048):
 
         yield result
 
-@spaces.GPU(duration=200)
+@spaces.GPU(duration=120)
 def generate_and_judge(instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, judge_model_name, temperature=0.1, max_new_tokens=2048, progress=gr.Progress()):
     """Generate responses from two models and judge them"""
-    progress(0, desc="Starting generation process")
-
     # Determine which model paths to use
     model_path_1 = get_model_path(model_dropdown_1, custom_model_1)
     model_path_2 = get_model_path(model_dropdown_2, custom_model_2)
 
+    progress(0, desc="Starting generation process")
+
     # Generate responses from both models
     progress(0.1, desc=f"Generating response from {model_path_1}")
     response1 = generate_response(instruction, model_path_1, progress)
@@ -244,6 +262,14 @@ def generate_and_judge(instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, judge_model_name, temperature=0.1, max_new_tokens=2048, progress=gr.Progress()):
 
     return response1, response2, evaluation_results
 
+# Function to process examples for display
+def process_example_for_display(example):
+    """Process example data for display in the interface"""
+    instruction = example[0]
+    model1 = example[1]
+    model2 = example[2]
+    return f"**Question:** {instruction}\n\n**Model 1:** {model1}\n\n**Model 2:** {model2}"
+
 # Create Gradio interface
 with gr.Blocks(fill_height=True, css=css) as demo:
     gr.Markdown(DESCRIPTION)
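The new formatter flattens a three-element example (instruction, model 1, model 2) into a Markdown blurb. Applied to the first entry of auto_examples from the hunk below, it would print:

def process_example_for_display(example):
    """Process example data for display in the interface"""
    instruction = example[0]
    model1 = example[1]
    model2 = example[2]
    return f"**Question:** {instruction}\n\n**Model 1:** {model1}\n\n**Model 2:** {model2}"

example = ["Write a short poem about artificial intelligence",
           "Qwen/Qwen2.5-7B-Instruct", "01-ai/Yi-6B-Chat"]
print(process_example_for_display(example))
# **Question:** Write a short poem about artificial intelligence
#
# **Model 1:** Qwen/Qwen2.5-7B-Instruct
#
# **Model 2:** 01-ai/Yi-6B-Chat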
@@ -312,24 +338,33 @@ with gr.Blocks(fill_height=True, css=css) as demo:
             inputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, auto_model_dropdown, auto_temperature, auto_max_tokens],
             outputs=[auto_response1, auto_response2, auto_output]
         )
-        # Examples for auto-generation
+
+        # Examples for auto-generation with simplified display
         auto_examples = [
             ["Write a short poem about artificial intelligence",
              "Qwen/Qwen2.5-7B-Instruct",
-             "Qwen/Qwen2.5-7B-Instruct",
-             "01-ai/Yi-6B-Chat",
              "01-ai/Yi-6B-Chat"],
             ["我听说有些人有高血压却没有任何症状。这是真的吗?",
              "FreedomIntelligence/Apollo-7B",
-             "FreedomIntelligence/Apollo-7B",
-             "microsoft/phi-2",
-             "microsoft/phi-2"]
+             "openchat/openchat-3.5-0106"]
         ]
 
-        gr.Examples(
-            examples=auto_examples,
-            inputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2]
-        )
+        # Custom examples component with simplified display
+        with gr.Row():
+            gr.Markdown("### Examples")
+
+        for i, example in enumerate(auto_examples):
+            with gr.Row():
+                example_btn = gr.Button(f"Example {i+1}", scale=1)
+                example_display = gr.Markdown(process_example_for_display(example), scale=4)
+
+            # Set up click handler for this example
+            example_btn.click(
+                lambda instruction, model1, model2: [instruction, model1, "", model2, ""],
+                inputs=None,
+                outputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2],
+                _js=f"() => [{repr(example[0])}, {repr(example[1])}, '', {repr(example[2])}, '']"
+            )
 
         # Manual Evaluation tab (now second)
         with gr.TabItem("Manual Evaluation"):
 