update tabs
Browse files- app.py +57 -22
- process_data.py +14 -0
- texts.py +9 -0
app.py
CHANGED
@@ -4,8 +4,8 @@ from pathlib import Path
|
|
4 |
import gradio as gr
|
5 |
import pandas as pd
|
6 |
|
7 |
-
from texts import TITLE, DESCRIPTION
|
8 |
-
from process_data import load_average_data, load_hard_data, load_easy_data
|
9 |
from display import custom_css
|
10 |
BENCHMARKS_TO_SKIP = []
|
11 |
|
@@ -73,12 +73,27 @@ def prep_leaderboard_df():
|
|
73 |
df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
|
74 |
df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
|
75 |
# 对 Model 列应用函数,将模型名称转换为链接形式
|
76 |
-
df['Model'] = df['Model'].apply(make_clickable_model)
|
77 |
df = df.round(2)
|
78 |
return df
|
79 |
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
|
|
|
|
|
|
82 |
|
83 |
# Function to update the table based on search query
|
84 |
def filter_and_search(cols: list[str], search_query: str, agg: str):
|
@@ -109,25 +124,45 @@ demo = gr.Blocks(css=custom_css)
|
|
109 |
|
110 |
with demo:
|
111 |
gr.HTML(TITLE)
|
112 |
-
with gr.
|
113 |
-
gr.
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
#
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
-
cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
|
130 |
-
search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
|
131 |
|
132 |
with gr.Row():
|
133 |
with gr.Accordion("📚 Citation", open=False):
|
|
|
4 |
import gradio as gr
|
5 |
import pandas as pd
|
6 |
|
7 |
+
from texts import TITLE, DESCRIPTION, ABOUT
|
8 |
+
from process_data import load_average_data, load_hard_data, load_easy_data, load_detailed_success_rate_data, load_detailed_action_counts_data
|
9 |
from display import custom_css
|
10 |
BENCHMARKS_TO_SKIP = []
|
11 |
|
|
|
73 |
df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
|
74 |
df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
|
75 |
# 对 Model 列应用函数,将模型名称转换为链接形式
|
76 |
+
# df['Model'] = df['Model'].apply(make_clickable_model)
|
77 |
df = df.round(2)
|
78 |
return df
|
79 |
|
80 |
+
def prep_detailed_success_rate_df():
    """Build the detailed success-rate table for the leaderboard tab.

    Loads the per-model success-rate metrics, prepends display columns
    ("Model", "Model Type") derived from the DataFrame index, and rounds
    all numeric values to two decimals.
    """
    table = load_detailed_success_rate_data()
    # The index holds raw model identifiers; map them to display values.
    table.insert(0, "Model", [map_model_name(model_id) for model_id in table.index])
    table.insert(1, "Model Type", [map_model_type(model_id) for model_id in table.index])
    return table.round(2)
86 |
+
|
87 |
+
def prep_detailed_action_counts_df():
    """Build the detailed action-counts table for the leaderboard tab.

    Loads the per-model action-count metrics, prepends display columns
    ("Model", "Model Type") derived from the DataFrame index, and rounds
    all numeric values to two decimals.
    """
    table = load_detailed_action_counts_data()
    # The index holds raw model identifiers; map them to display values.
    table.insert(0, "Model", [map_model_name(model_id) for model_id in table.index])
    table.insert(1, "Model Type", [map_model_type(model_id) for model_id in table.index])
    return table.round(2)
93 |
|
94 |
+
# Materialize all three leaderboard tables once at module import time so the
# Gradio UI components below can reference them directly as static values.
leaderboard_df = prep_leaderboard_df()
detailed_success_rate_df = prep_detailed_success_rate_df()
detailed_action_counts_df = prep_detailed_action_counts_df()
97 |
|
98 |
# Function to update the table based on search query
|
99 |
def filter_and_search(cols: list[str], search_query: str, agg: str):
|
|
|
124 |
|
125 |
with demo:
|
126 |
gr.HTML(TITLE)
|
127 |
+
with gr.Row():
|
128 |
+
with gr.Column():
|
129 |
+
gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
|
130 |
+
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
131 |
+
with gr.TabItem("🏆 Leaderboard"):
|
132 |
+
with gr.Row():
|
133 |
+
# search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
|
134 |
+
|
135 |
+
# cols_bar = gr.CheckboxGroup(
|
136 |
+
# choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
|
137 |
+
# show_label=False,
|
138 |
+
# # info="Select columns to display",
|
139 |
+
# )
|
140 |
+
with gr.Group():
|
141 |
+
leaderboard_table = gr.Dataframe(
|
142 |
+
value=leaderboard_df,
|
143 |
+
wrap=True,
|
144 |
+
# column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
|
145 |
+
)
|
146 |
+
|
147 |
+
#cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
|
148 |
+
# search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
|
149 |
+
with gr.TabItem("Success Rates - Detailed"):
|
150 |
+
with gr.Row():
|
151 |
+
detailed_success_rate_table = gr.Dataframe(
|
152 |
+
value=detailed_success_rate_df,
|
153 |
+
wrap=True,
|
154 |
+
)
|
155 |
+
|
156 |
+
with gr.TabItem("Action Counts - Detailed"):
|
157 |
+
with gr.Row():
|
158 |
+
detailed_action_counts_table = gr.Dataframe(
|
159 |
+
value=detailed_action_counts_df,
|
160 |
+
wrap=True,
|
161 |
+
)
|
162 |
+
|
163 |
+
with gr.TabItem("About"):
|
164 |
+
gr.Markdown(ABOUT)
|
165 |
|
|
|
|
|
166 |
|
167 |
with gr.Row():
|
168 |
with gr.Accordion("📚 Citation", open=False):
|
process_data.py
CHANGED
@@ -20,6 +20,20 @@ def load_average_data():
|
|
20 |
})
|
21 |
return df
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
def load_hard_data():
|
24 |
with open(hard_data_path, 'r') as f:
|
25 |
hard_data = json.load(f)
|
|
|
20 |
})
|
21 |
return df
|
22 |
|
23 |
+
def load_detailed_success_rate_data():
    """Load the detailed success-rate metrics from disk.

    Reads the JSON file at ``average_success_rate_file_path`` and returns a
    DataFrame transposed so that each model is a row and each metric a column.
    """
    with open(average_success_rate_file_path, 'r') as fh:
        raw = json.load(fh)
    # Transpose: the raw frame has metrics as rows; we want models as rows.
    return pd.DataFrame(raw).T
29 |
+
|
30 |
+
def load_detailed_action_counts_data():
    """Load the detailed action-count metrics from disk.

    Reads the JSON file at ``average_action_count_file_path`` and returns a
    DataFrame transposed so that each model is a row and each metric a column.
    """
    with open(average_action_count_file_path, 'r') as fh:
        raw = json.load(fh)
    # Transpose: the raw frame has metrics as rows; we want models as rows.
    return pd.DataFrame(raw).T
36 |
+
|
37 |
def load_hard_data():
|
38 |
with open(hard_data_path, 'r') as f:
|
39 |
hard_data = json.load(f)
|
texts.py
CHANGED
@@ -8,3 +8,12 @@ DESCRIPTION = f"""
|
|
8 |
|
9 |
|
10 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
|
10 |
"""
|
11 |
+
|
12 |
+
ABOUT = """
|
13 |
+
|
14 |
+
## About KUMO Benchmark
|
15 |
+
|
16 |
+
KUMO is a novel benchmark designed to systematically evaluate the complex reasoning capabilities of Large Language Models (LLMs) through procedurally generated reasoning games. Explore the limits of LLM reasoning and track model performance on our interactive leaderboard.
|
17 |
+
|
18 |
+
|
19 |
+
"""
|