Update app.py
app.py CHANGED
@@ -13,21 +13,15 @@ DESCRIPTION = '''
 <div>
 <h1 style="text-align: center;">JudgeLRM</h1>
 <p>This Space demonstrates the <a href="https://huggingface.co/nuojohnchen/JudgeLRM-7B"><b>JudgeLRM</b></a> model, designed to evaluate the quality of two AI assistant responses. JudgeLRM is a family of judgment-oriented LLMs trained using reinforcement learning (RL) with judge-wise, outcome-driven rewards. JudgeLRM models consistently outperform both SFT-tuned and state-of-the-art reasoning models. Notably, JudgeLRM-3B surpasses GPT-4, and JudgeLRM-7B outperforms DeepSeek-R1 by 2.79\% in F1 score, particularly excelling in judge tasks requiring deep reasoning.</p>
-<p>Enter an instruction and two responses, and the model will score them on a scale of 1-10 (higher is better).</p>
+<p>Enter an instruction and two responses, and the model will think, reason and score them on a scale of 1-10 (higher is better).</p>
 <p>You can also select Hugging Face models to automatically generate responses for evaluation.</p>
 </div>
 '''
 
 LICENSE = """
-<
-
-
-url = {https://huggingface.co/nuojohnchen/JudgeLRM-7B},
-author = {Nuo Chen, Zhiyuan Hu, Qingyun Zou, Jiaying Wu, Qian Wang, Bryan Hooi, Bingsheng He},
-month = {March},
-year = {2025}
-}
-</div>
+<p/>
+---
+Built on Qwen2.5 architecture
 """
 
 PLACEHOLDER = """
@@ -60,12 +54,8 @@ MODEL_PATHS = {
 POPULAR_MODELS = [
     "Qwen/Qwen2.5-7B-Instruct",
     "01-ai/Yi-6B-Chat",
-    "
-    "FreedomIntelligence/Apollo-7B"
-    "tiiuae/falcon-7b-instruct",
-    "HuggingFaceH4/zephyr-7b-beta",
-    "stabilityai/stablelm-3b-4e1t",
-    "openchat/openchat-3.5-0106"
+    "openchat/openchat-3.5-0106",
+    "FreedomIntelligence/Apollo-7B"
 ]
 
 # Global variables for model and tokenizer
@@ -92,6 +82,31 @@ def get_model_path(dropdown_value, custom_value):
         return custom_value.strip()
     return dropdown_value
 
+# Function to clean model response
+def clean_response_text(text):
+    """Remove conversation markers and other artifacts from model response"""
+    # Remove any <|user|> or <|assistant|> markers and subsequent conversations
+    user_pattern = r'<\|user\|>.*'
+    assistant_pattern = r'<\|assistant\|>.*'
+
+    # Try to clean with regex first (using re.DOTALL to match across lines)
+    cleaned = re.sub(user_pattern, '', text, flags=re.DOTALL)
+    cleaned = re.sub(assistant_pattern, '', cleaned, flags=re.DOTALL)
+
+    # If that didn't work well, try a more aggressive approach
+    if '<|user|>' in cleaned or '<|assistant|>' in cleaned:
+        parts = text.split('<|user|>')
+        if len(parts) > 0:
+            cleaned = parts[0].strip()
+
+    # Remove other common markers
+    markers = ['<user>', '</user>', '<assistant>', '</assistant>',
+               'User:', 'Assistant:', 'Human:', 'AI:']
+    for marker in markers:
+        cleaned = cleaned.replace(marker, '')
+
+    return cleaned.strip()
+
 # Function to generate response from a model
 def generate_response(instruction, model_path, progress=gr.Progress()):
     """Generate a response from a specified model"""
@@ -115,7 +130,7 @@ def generate_response(instruction, model_path, progress=gr.Progress()):
             do_sample=True
         )
 
-        # Decode response
+        # Decode response
         full_response = response_tokenizer.decode(output[0], skip_special_tokens=True)
 
         # Remove the prompt part from the response
@@ -131,6 +146,9 @@ def generate_response(instruction, model_path, progress=gr.Progress()):
         for token in ["<|assistant|>", "<assistant>", "Assistant:", "A:"]:
             clean_response = clean_response.replace(token, "").strip()
 
+        # Apply additional cleaning to remove conversation markers
+        clean_response = clean_response_text(clean_response)
+
         # Clean up resources
        del response_model
        del response_tokenizer
@@ -141,7 +159,7 @@ def generate_response(instruction, model_path, progress=gr.Progress()):
    except Exception as e:
        return f"Error generating response: {str(e)}"
 
-@spaces.GPU(duration=
+@spaces.GPU(duration=120)
 def judge_responses(instruction, response1, response2, model_name, temperature=0.1, max_new_tokens=2048):
     """
     Evaluate the quality of two responses
@@ -216,15 +234,15 @@ def judge_responses(instruction, response1, response2, model_name, temperature=0
 
         yield result
 
-@spaces.GPU(duration=
+@spaces.GPU(duration=120)
 def generate_and_judge(instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, judge_model_name, temperature=0.1, max_new_tokens=2048, progress=gr.Progress()):
     """Generate responses from two models and judge them"""
-    progress(0, desc="Starting generation process")
-
     # Determine which model paths to use
     model_path_1 = get_model_path(model_dropdown_1, custom_model_1)
     model_path_2 = get_model_path(model_dropdown_2, custom_model_2)
 
+    progress(0, desc="Starting generation process")
+
     # Generate responses from both models
     progress(0.1, desc=f"Generating response from {model_path_1}")
     response1 = generate_response(instruction, model_path_1, progress)
@@ -244,6 +262,14 @@ def generate_and_judge(instruction, model_dropdown_1, custom_model_1, model_drop
 
     return response1, response2, evaluation_results
 
+# Function to process examples for display
+def process_example_for_display(example):
+    """Process example data for display in the interface"""
+    instruction = example[0]
+    model1 = example[1]
+    model2 = example[2]
+    return f"**Question:** {instruction}\n\n**Model 1:** {model1}\n\n**Model 2:** {model2}"
+
 # Create Gradio interface
 with gr.Blocks(fill_height=True, css=css) as demo:
     gr.Markdown(DESCRIPTION)
@@ -312,24 +338,33 @@ with gr.Blocks(fill_height=True, css=css) as demo:
                 inputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2, auto_model_dropdown, auto_temperature, auto_max_tokens],
                 outputs=[auto_response1, auto_response2, auto_output]
             )
-
+
+            # Examples for auto-generation with simplified display
             auto_examples = [
                 ["Write a short poem about artificial intelligence",
                  "Qwen/Qwen2.5-7B-Instruct",
-                 "Qwen/Qwen2.5-7B-Instruct",
-                 "01-ai/Yi-6B-Chat",
                  "01-ai/Yi-6B-Chat"],
                 ["我听说有些人有高血压却没有任何症状。这是真的吗?",
                  "FreedomIntelligence/Apollo-7B",
-                 "
-                 "microsoft/phi-2",
-                 "microsoft/phi-2"]
+                 "openchat/openchat-3.5-0106"]
             ]
 
-
-
-
-
+            # Custom examples component with simplified display
+            with gr.Row():
+                gr.Markdown("### Examples")
+
+            for i, example in enumerate(auto_examples):
+                with gr.Row():
+                    example_btn = gr.Button(f"Example {i+1}", scale=1)
+                    example_display = gr.Markdown(process_example_for_display(example), scale=4)
+
+                # Set up click handler for this example
+                example_btn.click(
+                    lambda instruction, model1, model2: [instruction, model1, "", model2, ""],
+                    inputs=None,
+                    outputs=[auto_instruction, model_dropdown_1, custom_model_1, model_dropdown_2, custom_model_2],
+                    _js=f"() => [{repr(example[0])}, {repr(example[1])}, '', {repr(example[2])}, '']"
+                )
 
     # Manual Evaluation tab (now second)
     with gr.TabItem("Manual Evaluation"):