Spaces:

zhwang4ai
/

GenerativeReasoningBenchmark

Running

File size: 9,820 Bytes

import json
import re
from pathlib import Path

import gradio as gr
import pandas as pd

from texts import TITLE, DESCRIPTION, ABOUT
from process_data import load_average_data, load_hard_data, load_easy_data, load_detailed_success_rate_data, load_detailed_action_counts_data
from display import custom_css
# Benchmarks to leave out; currently empty and not referenced in the visible
# part of this file.
BENCHMARKS_TO_SKIP = []

# Hex colour per model-type label. Not referenced in the visible part of this
# file.
color_map = {
    "Pretrained": "#7497db",
    "RL": "#E8ECF2",
    "Finetuned": "#ffcd75",
    # "DPO": "#75809c",
}

# Lower-case model identifier -> "organisation/model" display name.
# Identifiers missing from this table are shown unchanged (see map_model_name).
model_name_map = {
    # Qwen 2.5 instruct family
    "qwen2.5-3b-instruct": "Qwen/Qwen2.5-3B-Instruct",
    "qwen2.5-7b-instruct": "Qwen/Qwen2.5-7B-Instruct",
    "qwen2.5-14b-instruct": "Qwen/Qwen2.5-14B-Instruct",
    "qwen2.5-32b-instruct": "Qwen/Qwen2.5-32B-Instruct",
    "qwen2.5-72b-instruct": "Qwen/Qwen2.5-72B-Instruct",
    # Llama 3.x instruct family
    "llama-3.1-8b-instruct": "Meta-Llama/Llama-3.1-8B-Instruct",
    "llama-3.1-70b-instruct": "Meta-Llama/Llama-3.1-70B-Instruct",
    "llama-3.2-3b-instruct": "Meta-Llama/Llama-3.2-3B-Instruct",
    "llama-3.3-70b-instruct": "Meta-Llama/Llama-3.3-70B-Instruct",
    # Other open-weight and hosted models
    "mistral-large-instruct-2411": "Mistral/Mistral-Large-2411",
    "gemma-2-27b-it": "google/gemma-2-27b-it",
    "gemma-2-9b-it": "google/gemma-2-9b-it",
    "deepseek-v3": "deepseek-ai/DeepSeek-V3",
    "deepseek-r1": "deepseek-ai/DeepSeek-R1",
    "qwq-32b": "Qwen/QwQ-32B",
    "yi-lightning": "Yi/Yi-Lightning",
    # OpenAI and Anthropic API models
    "gpt-3.5-turbo": "openai/gpt-3.5-turbo",
    "gpt-4o": "openai/gpt-4o",
    "gpt-4o-mini": "openai/gpt-4o-mini",
    "o1-mini": "openai/o1-mini",
    "claude-3.5-haiku": "anthropic/claude-3.5-haiku",
    "claude-3.5-sonnet": "anthropic/claude-3.5-sonnet",
}

def map_model_name(model_id):
    """Return the display name for *model_id*.

    Looks the identifier up in ``model_name_map``; identifiers that have no
    entry are returned unchanged.
    """
    return model_name_map.get(model_id, model_id)

# Turn a model name into a hyperlink (currently rendered as Markdown).
def model_hyperlink(link, model_name):
    """Return ``model_name`` as a Markdown link pointing at ``link``."""
    # HTML alternative, kept for reference:
    # return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
    markdown_link = "[{}]({})".format(model_name, link)
    return markdown_link

def make_clickable_model(model_name):
    """Return a Markdown link to ``model_name`` on huggingface.co.

    Args:
        model_name: Repository path that is appended to the site URL, e.g.
            ``"Qwen/QwQ-32B"``.

    Returns:
        The link text produced by ``model_hyperlink``, with ``model_name`` as
        the visible label.
    """
    # The host is "huggingface.co"; the URL previously carried a stray "."
    # between the host and the path ("huggingface.co./...").
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)

# Model identifiers that are labelled "RL" in the "Model Type" column.
rl_models = ['deepseek-r1', 'o1-mini']


def map_model_type(model_name):
    """Return ``"RL"`` for identifiers listed in ``rl_models``, else ``"Pretrained"``."""
    return "RL" if model_name in rl_models else "Pretrained"
    

def prep_leaderboard_df():
    """Assemble the main leaderboard table.

    Loads the average, hard and easy result frames, places them side by side
    in the order easy / hard / average, and prepends a "Model" column (display
    name) and a "Model Type" column, both derived from the index labels.
    The result is rounded to two decimals.
    """
    # Loaders are invoked in the same order as before: average, hard, easy.
    average_part, hard_part, easy_part = (
        load_average_data(),
        load_hard_data(),
        load_easy_data(),
    )
    board = pd.concat([easy_part, hard_part, average_part], axis=1)

    # "Model" becomes the first column, "Model Type" the second.
    display_names = list(map(map_model_name, board.index))
    board.insert(0, "Model", display_names)
    type_labels = list(map(map_model_type, board.index))
    board.insert(1, "Model Type", type_labels)

    # Turning the "Model" column into links is currently disabled:
    # board['Model'] = board['Model'].apply(make_clickable_model)
    return board.round(2)

def prep_detailed_success_rate_df():
    """Build the detailed success-rate table.

    Takes the frame returned by ``load_detailed_success_rate_data``, inserts a
    "Model" column (display name) and a "Model Type" column derived from the
    index labels, and rounds to two decimals. The columns are inserted into
    the loaded frame itself before rounding.
    """
    table = load_detailed_success_rate_data()
    # Disabled: transpose so that models are rows and metrics are columns.
    # table = table.T
    display_names = list(map(map_model_name, table.index))
    table.insert(0, "Model", display_names)
    type_labels = list(map(map_model_type, table.index))
    table.insert(1, "Model Type", type_labels)
    return table.round(2)

def prep_detailed_action_counts_df():
    """Build the detailed action-counts table.

    Takes the frame returned by ``load_detailed_action_counts_data``, inserts
    a "Model" column (display name) and a "Model Type" column derived from the
    index labels, and rounds to two decimals. The columns are inserted into
    the loaded frame itself before rounding.
    """
    table = load_detailed_action_counts_data()
    # Disabled: transpose so that models are rows and metrics are columns.
    # table = table.T
    display_names = list(map(map_model_name, table.index))
    table.insert(0, "Model", display_names)
    type_labels = list(map(map_model_type, table.index))
    table.insert(1, "Model Type", type_labels)
    return table.round(2)

# Build the three tables once, at import time. The UI below displays them and
# the filter callbacks read the two detailed frames as module-level globals.
leaderboard_df = prep_leaderboard_df()
detailed_success_rate_df = prep_detailed_success_rate_df()
detailed_action_counts_df = prep_detailed_action_counts_df()

# Callback that refreshes the success-rate table from the search box and the
# column checkboxes.
def filter_and_search_success_rate(cols: list[str], search_query: str, agg: str = "", *, table=None) -> pd.DataFrame:
    """Filter the detailed success-rate table by model name and selected columns.

    Args:
        cols: Metric columns to display. When empty, all columns are kept.
            Names that are not present in the (filtered) table are ignored.
        search_query: One or more search terms separated by ``;``. Matching is
            case-insensitive against the "Model" column and a row is kept when
            any term matches. Terms are treated as regular expressions; if the
            combined pattern is not a valid regex the terms are matched
            literally instead. Empty terms are ignored.
        agg: Unused. Kept for signature compatibility and optional because the
            UI only wires two inputs to this callback.
        table: Frame to filter. Defaults to the module-level
            ``detailed_success_rate_df``.

    Returns:
        The filtered frame. When ``cols`` selects at least one column, only
        "Model" plus those columns are returned, rows with no value in any
        selected column are removed, rows are sorted descending by the
        selected columns, and the selected columns are converted to strings.
        The input frame is not modified.
    """
    df = detailed_success_rate_df if table is None else table

    if search_query:
        stripped = (part.strip().lower() for part in search_query.split(";"))
        # An empty alternative ("gpt;" -> "gpt|") would match every row.
        terms = [term for term in stripped if term]
        if terms:
            pattern = "|".join(terms)
            try:
                re.compile(pattern)
            except re.error:
                # Free text such as "c++" or "(" is not a valid regex.
                pattern = "|".join(re.escape(term) for term in terms)
            matches = df["Model"].astype(str).str.lower().str.contains(pattern, regex=True)
            df = df[matches]
            # Drop columns that are entirely NaN for the matching rows. Skipped
            # for an empty result, where every column would count as all-NaN
            # and the table would lose its headers.
            if not df.empty:
                df = df.dropna(how="all", axis=1)

    selected = [c for c in (cols or []) if c in df.columns]
    if selected:
        df = df[["Model"] + selected].dropna(how="all", axis=0, subset=selected).copy()
        # Sort numerically, then convert to strings for display.
        df[selected] = df[selected].apply(pd.to_numeric, errors="coerce")
        df = df.sort_values(by=selected, ascending=False, na_position="last")
        df[selected] = df[selected].astype(str)
    return df

# Callback that refreshes the action-counts table from the search box and the
# column checkboxes.
def filter_and_search_action_counts(cols: list[str], search_query: str, agg: str = "", *, table=None) -> pd.DataFrame:
    """Filter the detailed action-counts table by model name and selected columns.

    Args:
        cols: Metric columns to display. When empty, all columns are kept.
            Names that are not present in the (filtered) table are ignored.
        search_query: One or more search terms separated by ``;``. Matching is
            case-insensitive against the "Model" column and a row is kept when
            any term matches. Terms are treated as regular expressions; if the
            combined pattern is not a valid regex the terms are matched
            literally instead. Empty terms are ignored.
        agg: Unused. Kept for signature compatibility and optional because the
            UI only wires two inputs to this callback.
        table: Frame to filter. Defaults to the module-level
            ``detailed_action_counts_df``.

    Returns:
        The filtered frame. When ``cols`` selects at least one column, only
        "Model" plus those columns are returned, rows with no value in any
        selected column are removed, rows are sorted descending by the
        selected columns, and the selected columns are converted to strings.
        The input frame is not modified.
    """
    df = detailed_action_counts_df if table is None else table

    if search_query:
        stripped = (part.strip().lower() for part in search_query.split(";"))
        # An empty alternative ("gpt;" -> "gpt|") would match every row.
        terms = [term for term in stripped if term]
        if terms:
            pattern = "|".join(terms)
            try:
                re.compile(pattern)
            except re.error:
                # Free text such as "c++" or "(" is not a valid regex.
                pattern = "|".join(re.escape(term) for term in terms)
            matches = df["Model"].astype(str).str.lower().str.contains(pattern, regex=True)
            df = df[matches]
            # Drop columns that are entirely NaN for the matching rows. Skipped
            # for an empty result, where every column would count as all-NaN
            # and the table would lose its headers.
            if not df.empty:
                df = df.dropna(how="all", axis=1)

    selected = [c for c in (cols or []) if c in df.columns]
    if selected:
        df = df[["Model"] + selected].dropna(how="all", axis=0, subset=selected).copy()
        # Sort numerically, then convert to strings for display.
        df[selected] = df[selected].apply(pd.to_numeric, errors="coerce")
        df = df.sort_values(by=selected, ascending=False, na_position="last")
        df[selected] = df[selected].astype(str)
    return df


# Top-level Gradio app: a title, a description, four tabs (leaderboard, two
# detailed tables with search/column filters, and an "About" page) and a
# collapsible citation box. The app is launched when this module is executed.
demo = gr.Blocks(css=custom_css)

with demo:
    gr.HTML(TITLE)
    with gr.Row():
        with gr.Column():
            gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Tab 1: the combined leaderboard, shown as a static table. The search
        # and column controls for this tab are commented out below.
        with gr.TabItem("🏆 Leaderboard"):
            with gr.Row():
                # search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)

                # cols_bar = gr.CheckboxGroup(
                #     choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
                #     show_label=False,
                #     # info="Select columns to display",
                # )
                with gr.Group():
                    leaderboard_table = gr.Dataframe(
                        value=leaderboard_df,
                        wrap=True,
                        # Fixed widths for "Model" and "Model Type"; the remaining
                        # widths grow with the length of the column name.
                        column_widths=[250, 120] + [(60 + len(c)) for c in leaderboard_df.columns[2:]],
                    )

                #cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
                # search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
        # Tab 2: detailed success rates with a search box and column checkboxes.
        with gr.TabItem("Success Rates - Detailed"):
            with gr.Column():
                with gr.Row():
                    search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)

                with gr.Row():
                    # Offer every column after "Model" and "Model Type", except "Average".
                    cols_bar = gr.CheckboxGroup(
                        choices=[c for c in detailed_success_rate_df.columns[2:] if c != "Average"],
                        show_label=False,
                        # info="Select columns to display",
                    )
                detailed_success_rate_table = gr.Dataframe(
                    value=detailed_success_rate_df,
                    wrap=True,
                    column_widths=[350, 120] + [(150 + len(c)) for c in detailed_success_rate_df.columns[2:]],
                )
                # Re-filter when the checkbox selection changes or the search box
                # is submitted. Two inputs are passed to the callback: the
                # selected columns and the search text.
                cols_bar.change(filter_and_search_success_rate, inputs=[cols_bar, search_bar], outputs=[detailed_success_rate_table])
                search_bar.submit(filter_and_search_success_rate, inputs=[cols_bar, search_bar], outputs=[detailed_success_rate_table])
        
        # Tab 3: detailed action counts; same controls as the success-rate tab,
        # bound to their own components.
        with gr.TabItem("Action Counts - Detailed"):
            with gr.Column():
                with gr.Row():
                    search_bar_1 = gr.Textbox(placeholder="Search for your model...", show_label=False)

                with gr.Row():
                    # Offer every column after "Model" and "Model Type", except "Average".
                    cols_bar_1 = gr.CheckboxGroup(
                        choices=[c for c in detailed_action_counts_df.columns[2:] if c != "Average"],
                        show_label=False,
                        # info="Select columns to display",
                    )
                detailed_action_counts_table = gr.Dataframe(
                    value=detailed_action_counts_df,
                    wrap=True,
                    column_widths=[350, 120] + [(100 + len(c)) for c in detailed_action_counts_df.columns[2:]],
                )
                # Two inputs are passed to the callback: the selected columns and
                # the search text.
                cols_bar_1.change(filter_and_search_action_counts, inputs=[cols_bar_1, search_bar_1], outputs=[detailed_action_counts_table])
                search_bar_1.submit(filter_and_search_action_counts, inputs=[cols_bar_1, search_bar_1], outputs=[detailed_action_counts_table])
        
        # Tab 4: static "About" text.
        with gr.TabItem("About"):
            gr.Markdown(ABOUT)


    # Collapsed-by-default accordion holding the BibTeX entry with a copy button.
    with gr.Row():
        with gr.Accordion("📚 Citation", open=False):
            citation_button = gr.Textbox(
                value=r"""@article{lin2025generative,
  title={Generative Evaluation of Complex Reasoning in Large Language Models},
  author={Lin, Haowei and Wang, Xiangyu and Yan, Ruilin and Huang, Baizhou and Ye, Haotian and Zhu, Jianhua and Wang, Zihao and Zou, James and Ma, Jianzhu and Liang, Yitao},
  journal={arXiv preprint arXiv:2504.02810},
  year={2025}
}""",
                lines=7,
                label="Copy the following to cite these results.",
                elem_id="citation-button",
                show_copy_button=True,
            )

# Start the web server; runs at module level, so executing or importing this
# file launches the app.
demo.launch()