import json
from pathlib import Path
import gradio as gr
import pandas as pd
from texts import TITLE, DESCRIPTION, ABOUT
from process_data import load_average_data, load_hard_data, load_easy_data, load_detailed_success_rate_data, load_detailed_action_counts_data
from display import custom_css
BENCHMARKS_TO_SKIP = []
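# Display colors associated with each model type.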
color_map = {
"Pretrained": "#7497db",
"RL": "#E8ECF2",
"Finetuned": "#ffcd75",
# "DPO": "#75809c",
}
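# Map short model ids used in the results data to fuller, Hub-style display names.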
model_name_map = {
"qwen2.5-3b-instruct": "Qwen/Qwen2.5-3B-Instruct",
"qwen2.5-7b-instruct": "Qwen/Qwen2.5-7B-Instruct",
"qwen2.5-14b-instruct": "Qwen/Qwen2.5-14B-Instruct",
"qwen2.5-32b-instruct": "Qwen/Qwen2.5-32B-Instruct",
"qwen2.5-72b-instruct": "Qwen/Qwen2.5-72B-Instruct",
"llama-3.1-8b-instruct": "Meta-Llama/Llama-3.1-8B-Instruct",
"llama-3.1-70b-instruct": "Meta-Llama/Llama-3.1-70B-Instruct",
"llama-3.2-3b-instruct": "Meta-Llama/Llama-3.2-3B-Instruct",
"llama-3.3-70b-instruct": "Meta-Llama/Llama-3.3-70B-Instruct",
"mistral-large-instruct-2411": "Mistral/Mistral-Large-2411",
"gemma-2-27b-it": "google/gemma-2-27b-it",
"gemma-2-9b-it": "google/gemma-2-9b-it",
"deepseek-v3": "deepseek-ai/DeepSeek-V3",
"deepseek-r1": "deepseek-ai/DeepSeek-R1",
"qwq-32b": "Qwen/QwQ-32B",
"yi-lightning": "Yi/Yi-Lightning",
'gpt-3.5-turbo': "openai/gpt-3.5-turbo",
'gpt-4o': "openai/gpt-4o",
'gpt-4o-mini': "openai/gpt-4o-mini",
'o1-mini': "openai/o1-mini",
'claude-3.5-haiku': "anthropic/claude-3.5-haiku",
'claude-3.5-sonnet': "anthropic/claude-3.5-sonnet",
}
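# Resolve a model id to its display name; ids without a mapping are shown unchanged.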
def map_model_name(model_id):
    return model_name_map.get(model_id, model_id)
# Convert a model name into a clickable link (Markdown format; an HTML variant is kept commented out below).
def model_hyperlink(link, model_name):
    # return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
    return f"[{model_name}]({link})"
def make_clickable_model(model_name):
link = f"https://huggingface.co./{model_name}"
return model_hyperlink(link, model_name)
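# Models labelled as "RL" in the Model Type column; everything else is labelled "Pretrained".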
rl_models = ['deepseek-r1', 'o1-mini']
def map_model_type(model_name):
    if model_name in rl_models:
        return "RL"
    else:
        return "Pretrained"
def prep_leaderboard_df():
    average_df = load_average_data()
    hard_df = load_hard_data()
    easy_df = load_easy_data()
    df = pd.concat([easy_df, hard_df, average_df], axis=1)
    # Insert "Model" and "Model Type" columns at the front of the table
    df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
    df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
    # Optionally turn entries of the "Model" column into clickable links
    # df['Model'] = df['Model'].apply(make_clickable_model)
    df = df.round(2)
    return df
def prep_detailed_success_rate_df():
    df = load_detailed_success_rate_data()
    # df = df.T  # transpose so that models are rows and metrics are columns
    df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
    df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
    df = df.round(2)
    return df
def prep_detailed_action_counts_df():
    df = load_detailed_action_counts_data()
    # df = df.T  # transpose so that models are rows and metrics are columns
    df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
    df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
    df = df.round(2)
    return df
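# Build all three tables once at startup; the filter callbacks below slice these cached DataFrames.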
leaderboard_df = prep_leaderboard_df()
detailed_success_rate_df = prep_detailed_success_rate_df()
detailed_action_counts_df = prep_detailed_action_counts_df()
# Function to update the table based on search query
def filter_and_search_success_rate(cols: list[str], search_query: str, agg: str = ""):
    # "agg" is currently unused; the default keeps the two-input Gradio callbacks working.
    df = detailed_success_rate_df
    if len(search_query) > 0:
        # Multiple search terms can be separated by ";"
        search_terms = [term.strip().lower() for term in search_query.split(";")]
        pattern = "|".join(search_terms)
        df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]
    # Drop any columns which are all NaN
    df = df.dropna(how="all", axis=1)
    if len(cols) > 0:
        # Keep the "Model" column plus the user-selected metric columns
        index_cols = list(df.columns[:1])
        new_cols = index_cols + cols
        df = df.copy()[new_cols]
        df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
        df[cols] = df[cols].apply(pd.to_numeric, errors="coerce")
        df = df.sort_values(by=cols, ascending=False, na_position="last")
        df[cols] = df[cols].astype(str)
    return df
# Function to update the table based on search query
def filter_and_search_action_counts(cols: list[str], search_query: str, agg: str = ""):
    # "agg" is currently unused; the default keeps the two-input Gradio callbacks working.
    df = detailed_action_counts_df
    if len(search_query) > 0:
        # Multiple search terms can be separated by ";"
        search_terms = [term.strip().lower() for term in search_query.split(";")]
        pattern = "|".join(search_terms)
        df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]
    # Drop any columns which are all NaN
    df = df.dropna(how="all", axis=1)
    if len(cols) > 0:
        # Keep the "Model" column plus the user-selected metric columns
        index_cols = list(df.columns[:1])
        new_cols = index_cols + cols
        df = df.copy()[new_cols]
        df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
        df[cols] = df[cols].apply(pd.to_numeric, errors="coerce")
        df = df.sort_values(by=cols, ascending=False, na_position="last")
        df[cols] = df[cols].astype(str)
    return df
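# Assemble the Gradio UI: the main leaderboard, two detailed tabs with search and column filters, an About tab, and a citation accordion.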
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    with gr.Row():
        with gr.Column():
            gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 Leaderboard"):
            with gr.Row():
                # search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
                # cols_bar = gr.CheckboxGroup(
                #     choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
                #     show_label=False,
                #     # info="Select columns to display",
                # )
                with gr.Group():
                    leaderboard_table = gr.Dataframe(
                        value=leaderboard_df,
                        wrap=True,
                        column_widths=[250, 120] + [(60 + len(c)) for c in leaderboard_df.columns[2:]],
                    )
                # cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
                # search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
        with gr.TabItem("Success Rates - Detailed"):
            with gr.Column():
                with gr.Row():
                    search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
                with gr.Row():
                    cols_bar = gr.CheckboxGroup(
                        choices=[c for c in detailed_success_rate_df.columns[2:] if c != "Average"],
                        show_label=False,
                        # info="Select columns to display",
                    )
                detailed_success_rate_table = gr.Dataframe(
                    value=detailed_success_rate_df,
                    wrap=True,
                    column_widths=[350, 120] + [(150 + len(c)) for c in detailed_success_rate_df.columns[2:]],
                )
                cols_bar.change(filter_and_search_success_rate, inputs=[cols_bar, search_bar], outputs=[detailed_success_rate_table])
                search_bar.submit(filter_and_search_success_rate, inputs=[cols_bar, search_bar], outputs=[detailed_success_rate_table])
        with gr.TabItem("Action Counts - Detailed"):
            with gr.Column():
                with gr.Row():
                    search_bar_1 = gr.Textbox(placeholder="Search for your model...", show_label=False)
                with gr.Row():
                    cols_bar_1 = gr.CheckboxGroup(
                        choices=[c for c in detailed_action_counts_df.columns[2:] if c != "Average"],
                        show_label=False,
                        # info="Select columns to display",
                    )
                detailed_action_counts_table = gr.Dataframe(
                    value=detailed_action_counts_df,
                    wrap=True,
                    column_widths=[350, 120] + [(100 + len(c)) for c in detailed_action_counts_df.columns[2:]],
                )
                cols_bar_1.change(filter_and_search_action_counts, inputs=[cols_bar_1, search_bar_1], outputs=[detailed_action_counts_table])
                search_bar_1.submit(filter_and_search_action_counts, inputs=[cols_bar_1, search_bar_1], outputs=[detailed_action_counts_table])
        with gr.TabItem("About"):
            gr.Markdown(ABOUT)
    with gr.Row():
        with gr.Accordion("📚 Citation", open=False):
            citation_button = gr.Textbox(
                value=r"""@article{lin2025generative,
  title={Generative Evaluation of Complex Reasoning in Large Language Models},
  author={Lin, Haowei and Wang, Xiangyu and Yan, Ruilin and Huang, Baizhou and Ye, Haotian and Zhu, Jianhua and Wang, Zihao and Zou, James and Ma, Jianzhu and Liang, Yitao},
  journal={arXiv preprint arXiv:2504.02810},
  year={2025}
}""",
                lines=7,
                label="Copy the following to cite these results.",
                elem_id="citation-button",
                show_copy_button=True,
            )
demo.launch()