File size: 9,820 Bytes
f49345e
 
 
39d0162
 
f49345e
70c8dc9
 
f49345e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b136fe4
 
f49345e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70c8dc9
f49345e
 
 
70c8dc9
 
a39bbd6
70c8dc9
 
 
 
 
 
 
a39bbd6
70c8dc9
 
 
 
f49345e
70c8dc9
 
 
f49345e
 
c8f6405
8a0123e
c8f6405
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f49345e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39d0162
 
 
f49345e
39d0162
 
70c8dc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3504b6a
70c8dc9
 
 
 
 
4733633
8a0647b
 
 
 
 
 
 
 
 
70c8dc9
 
 
55e3386
70c8dc9
c8f6405
 
70c8dc9
 
4733633
8a0647b
 
 
 
 
 
 
 
 
70c8dc9
 
 
7cba530
70c8dc9
c8f6405
 
70c8dc9
 
 
f49345e
39d0162
 
f49345e
39d0162
f49345e
 
 
 
 
 
 
 
39d0162
 
 
 
f49345e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
import json
import re
from pathlib import Path

import gradio as gr
import pandas as pd

from texts import TITLE, DESCRIPTION, ABOUT
from process_data import load_average_data, load_hard_data, load_easy_data, load_detailed_success_rate_data, load_detailed_action_counts_data
from display import custom_css
# NOTE(review): presumably a list of benchmark names to hide; it is empty and
# not referenced anywhere in the visible part of this file.
BENCHMARKS_TO_SKIP = []

# Hex color per "Model Type" label. map_model_type() only ever produces "RL"
# and "Pretrained"; "Finetuned" has no producer in the visible code.
# NOTE(review): color_map itself is not referenced in the visible code —
# presumably meant for table styling; confirm before relying on it.
color_map = {
    "Pretrained": "#7497db",
    "RL": "#E8ECF2",
    "Finetuned": "#ffcd75",
    # "DPO": "#75809c",
}

# Short model id (the index labels of the loaded tables) -> "organization/name"
# string shown in the "Model" column. Ids missing from this map are displayed
# unchanged (see map_model_name).
model_name_map = {
    "qwen2.5-3b-instruct": "Qwen/Qwen2.5-3B-Instruct",
    "qwen2.5-7b-instruct": "Qwen/Qwen2.5-7B-Instruct",
    "qwen2.5-14b-instruct": "Qwen/Qwen2.5-14B-Instruct",
    "qwen2.5-32b-instruct": "Qwen/Qwen2.5-32B-Instruct",
    "qwen2.5-72b-instruct": "Qwen/Qwen2.5-72B-Instruct",
    "llama-3.1-8b-instruct": "Meta-Llama/Llama-3.1-8B-Instruct",
    "llama-3.1-70b-instruct": "Meta-Llama/Llama-3.1-70B-Instruct",
    "llama-3.2-3b-instruct": "Meta-Llama/Llama-3.2-3B-Instruct",
    "llama-3.3-70b-instruct": "Meta-Llama/Llama-3.3-70B-Instruct",
    "mistral-large-instruct-2411": "Mistral/Mistral-Large-2411",
    "gemma-2-27b-it": "google/gemma-2-27b-it",
    "gemma-2-9b-it": "google/gemma-2-9b-it",
    "deepseek-v3": "deepseek-ai/DeepSeek-V3",
    "deepseek-r1": "deepseek-ai/DeepSeek-R1",
    "qwq-32b": "Qwen/QwQ-32B",
    "yi-lightning": "Yi/Yi-Lightning",
    'gpt-3.5-turbo': "openai/gpt-3.5-turbo",
    'gpt-4o': "openai/gpt-4o",
    'gpt-4o-mini': "openai/gpt-4o-mini",
    'o1-mini': "openai/o1-mini",
    'claude-3.5-haiku': "anthropic/claude-3.5-haiku",
    'claude-3.5-sonnet': "anthropic/claude-3.5-sonnet",
}

def map_model_name(model_id):
    """Return the display name registered for ``model_id``.

    Ids that have no entry in ``model_name_map`` are returned unchanged.
    """
    return model_name_map.get(model_id, model_id)

# Helpers that turn a model name into a clickable Markdown link.
def model_hyperlink(link, model_name):
    """Return a Markdown link that shows ``model_name`` and points to ``link``.

    Markdown is emitted rather than a raw HTML ``<a>`` tag.
    """
    return f"[{model_name}]({link})"


def make_clickable_model(model_name):
    """Return a Markdown link to ``model_name`` on the Hugging Face Hub.

    Args:
        model_name: Repository id in "organization/name" form.

    Returns:
        A Markdown link whose text is ``model_name`` and whose target is
        ``https://huggingface.co/<model_name>``.
    """
    # The host is exactly "huggingface.co"; the path follows directly after it.
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)

# Model ids that are labelled "RL"; every other id is labelled "Pretrained".
rl_models = ['deepseek-r1', 'o1-mini']


def map_model_type(model_name):
    """Return the "Model Type" label for a short model id."""
    return "RL" if model_name in rl_models else "Pretrained"
    

def prep_leaderboard_df():
    """Assemble the main leaderboard table.

    The easy, hard and average tables are placed side by side (in that
    order), a "Model" display-name column and a "Model Type" column are put
    in front, and the result is rounded to two decimals.
    """
    average_part = load_average_data()
    hard_part = load_hard_data()
    easy_part = load_easy_data()

    table = pd.concat([easy_part, hard_part, average_part], axis=1)

    # Both leading columns are derived from the index labels (short model ids).
    # Turning "Model" into a link via make_clickable_model is intentionally
    # not applied here.
    display_names = [map_model_name(model_id) for model_id in table.index]
    table.insert(0, "Model", display_names)
    type_labels = [map_model_type(model_id) for model_id in table.index]
    table.insert(1, "Model Type", type_labels)

    return table.round(2)

def prep_detailed_success_rate_df():
    """Build the detailed success-rate table.

    Loads the data, puts a "Model" display-name column and a "Model Type"
    column in front (both derived from the index labels), and rounds the
    result to two decimals. No transpose is applied to the loaded table.
    """
    table = load_detailed_success_rate_data()

    display_names = [map_model_name(model_id) for model_id in table.index]
    table.insert(0, "Model", display_names)
    type_labels = [map_model_type(model_id) for model_id in table.index]
    table.insert(1, "Model Type", type_labels)

    return table.round(2)

def prep_detailed_action_counts_df():
    """Build the detailed action-count table.

    Loads the data, puts a "Model" display-name column and a "Model Type"
    column in front (both derived from the index labels), and rounds the
    result to two decimals. No transpose is applied to the loaded table.
    """
    table = load_detailed_action_counts_data()

    display_names = [map_model_name(model_id) for model_id in table.index]
    table.insert(0, "Model", display_names)
    type_labels = [map_model_type(model_id) for model_id in table.index]
    table.insert(1, "Model Type", type_labels)

    return table.round(2)

# Build the three tables once, at import time. The filter callbacks and the
# UI defined below read these module-level frames.
leaderboard_df = prep_leaderboard_df()
detailed_success_rate_df = prep_detailed_success_rate_df()
detailed_action_counts_df = prep_detailed_action_counts_df()

def _filter_and_search(df, cols, search_query):
    """Filter a detailed table by model name and narrow it to chosen columns.

    Args:
        df: Table holding a "Model" column of display names plus metric
            columns. It is never modified in place.
        cols: Metric columns to show. When empty (or None) every column is
            kept.
        search_query: Search terms separated by ";". Each term is matched
            case-insensitively against "Model" as a regular expression and a
            row is kept if any term matches. Blank terms are ignored. If the
            combined pattern is not a valid regular expression, the terms are
            matched as literal substrings instead.

    Returns:
        The filtered table. When ``cols`` is given, only "Model" and those
        columns remain, rows with no value in any of them are removed, rows
        are sorted by those columns in descending order (missing values
        last), and the selected columns are converted to strings.
    """
    # Blank terms (e.g. from a trailing ";") must be discarded: an empty
    # alternative in the joined pattern would match every row.
    terms = [term.strip().lower() for term in (search_query or "").split(";")]
    terms = [term for term in terms if term]
    if terms:
        pattern = "|".join(terms)
        try:
            re.compile(pattern)
        except re.error:
            # User input such as "model(" is not a valid regex; search for it
            # literally rather than letting the callback raise.
            pattern = "|".join(re.escape(term) for term in terms)
        df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]

    if not cols:
        if terms:
            # Hide columns that hold no data for the matched models.
            df = df.dropna(how="all", axis=1)
        return df

    cols = list(cols)
    # Select from the full column set: dropping all-NaN columns first could
    # remove a requested column and make this selection raise KeyError.
    df = df[["Model"] + cols].dropna(how="all", axis=0, subset=cols).copy()
    df[cols] = df[cols].apply(pd.to_numeric, errors="coerce")
    df = df.sort_values(by=cols, ascending=False, na_position="last")
    df[cols] = df[cols].astype(str)
    return df


def filter_and_search_success_rate(cols: list[str], search_query: str, agg: str = ""):
    """Return the detailed success-rate table filtered by the UI controls.

    Args:
        cols: Columns ticked in the checkbox group.
        search_query: Text from the search box (";"-separated terms).
        agg: Unused. It has a default because the Gradio listeners wired to
            this function supply only the two inputs above.
    """
    return _filter_and_search(detailed_success_rate_df, cols, search_query)


def filter_and_search_action_counts(cols: list[str], search_query: str, agg: str = ""):
    """Return the detailed action-count table filtered by the UI controls.

    Args:
        cols: Columns ticked in the checkbox group.
        search_query: Text from the search box (";"-separated terms).
        agg: Unused. It has a default because the Gradio listeners wired to
            this function supply only the two inputs above.
    """
    return _filter_and_search(detailed_action_counts_df, cols, search_query)


# Top-level Gradio app; custom_css comes from the local display module.
demo = gr.Blocks(css=custom_css)

# Layout: title and description, then four tabs (leaderboard, two detailed
# tables, about), then a collapsible citation box.
with demo:
    gr.HTML(TITLE)
    with gr.Row():
        with gr.Column():
            gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Tab 1: static leaderboard table (search/column controls are disabled).
        with gr.TabItem("🏆 Leaderboard"):
            with gr.Row():
                # search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)

                # cols_bar = gr.CheckboxGroup(
                #     choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
                #     show_label=False,
                #     # info="Select columns to display",
                # )
                with gr.Group():
                    # Fixed widths for the two leading columns ("Model",
                    # "Model Type"); every other column gets a base width
                    # plus the length of its name.
                    leaderboard_table = gr.Dataframe(
                        value=leaderboard_df,
                        wrap=True,
                        column_widths=[250, 120] + [(60 + len(c)) for c in leaderboard_df.columns[2:]],
                    )

                #cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
                # search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
        # Tab 2: detailed success rates with a search box and column picker.
        with gr.TabItem("Success Rates - Detailed"):
            with gr.Column():
                with gr.Row():
                    search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)

                with gr.Row():
                    # Selectable columns: everything after "Model" and
                    # "Model Type", excluding any column named "Average".
                    cols_bar = gr.CheckboxGroup(
                        choices=[c for c in detailed_success_rate_df.columns[2:] if c != "Average"],
                        show_label=False,
                        # info="Select columns to display",
                    )
                detailed_success_rate_table = gr.Dataframe(
                    value=detailed_success_rate_df,
                    wrap=True,
                    column_widths=[350, 120] + [(150 + len(c)) for c in detailed_success_rate_df.columns[2:]],
                )
                # Re-filter when the column selection changes or the search
                # box is submitted; the callback receives (selected columns,
                # search text) and its result replaces the table.
                cols_bar.change(filter_and_search_success_rate, inputs=[cols_bar, search_bar], outputs=[detailed_success_rate_table])
                search_bar.submit(filter_and_search_success_rate, inputs=[cols_bar, search_bar], outputs=[detailed_success_rate_table])
        
        # Tab 3: detailed action counts, wired the same way as tab 2 but with
        # its own controls and callback.
        with gr.TabItem("Action Counts - Detailed"):
            with gr.Column():
                with gr.Row():
                    search_bar_1 = gr.Textbox(placeholder="Search for your model...", show_label=False)

                with gr.Row():
                    cols_bar_1 = gr.CheckboxGroup(
                        choices=[c for c in detailed_action_counts_df.columns[2:] if c != "Average"],
                        show_label=False,
                        # info="Select columns to display",
                    )
                detailed_action_counts_table = gr.Dataframe(
                    value=detailed_action_counts_df,
                    wrap=True,
                    column_widths=[350, 120] + [(100 + len(c)) for c in detailed_action_counts_df.columns[2:]],
                )
                cols_bar_1.change(filter_and_search_action_counts, inputs=[cols_bar_1, search_bar_1], outputs=[detailed_action_counts_table])
                search_bar_1.submit(filter_and_search_action_counts, inputs=[cols_bar_1, search_bar_1], outputs=[detailed_action_counts_table])
        
        # Tab 4: static "About" text.
        with gr.TabItem("About"):
            gr.Markdown(ABOUT)


    # Collapsed-by-default accordion holding a copyable BibTeX entry.
    with gr.Row():
        with gr.Accordion("📚 Citation", open=False):
            citation_button = gr.Textbox(
                value=r"""@article{lin2025generative,
  title={Generative Evaluation of Complex Reasoning in Large Language Models},
  author={Lin, Haowei and Wang, Xiangyu and Yan, Ruilin and Huang, Baizhou and Ye, Haotian and Zhu, Jianhua and Wang, Zihao and Zou, James and Ma, Jianzhu and Liang, Yitao},
  journal={arXiv preprint arXiv:2504.02810},
  year={2025}
}""",
                lines=7,
                label="Copy the following to cite these results.",
                elem_id="citation-button",
                show_copy_button=True,
            )

# Start the app; this runs whenever the module is executed or imported.
demo.launch()