zhwang4ai committed on
Commit
70c8dc9
·
1 Parent(s): b136fe4

update tabs

Browse files
Files changed (3) hide show
  1. app.py +57 -22
  2. process_data.py +14 -0
  3. texts.py +9 -0
app.py CHANGED
@@ -4,8 +4,8 @@ from pathlib import Path
4
  import gradio as gr
5
  import pandas as pd
6
 
7
- from texts import TITLE, DESCRIPTION
8
- from process_data import load_average_data, load_hard_data, load_easy_data
9
  from display import custom_css
10
  BENCHMARKS_TO_SKIP = []
11
 
@@ -73,12 +73,27 @@ def prep_leaderboard_df():
73
  df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
74
  df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
75
  # 对 Model 列应用函数,将模型名称转换为链接形式
76
- df['Model'] = df['Model'].apply(make_clickable_model)
77
  df = df.round(2)
78
  return df
79
 
80
- leaderboard_df = prep_leaderboard_df()
 
 
 
 
 
 
 
 
 
 
 
 
81
 
 
 
 
82
 
83
  # Function to update the table based on search query
84
  def filter_and_search(cols: list[str], search_query: str, agg: str):
@@ -109,25 +124,45 @@ demo = gr.Blocks(css=custom_css)
109
 
110
  with demo:
111
  gr.HTML(TITLE)
112
- with gr.Column():
113
- gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
114
- with gr.Row():
115
- search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
116
-
117
- cols_bar = gr.CheckboxGroup(
118
- choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
119
- show_label=False,
120
- # info="Select columns to display",
121
- )
122
- with gr.Group():
123
- leaderboard_table = gr.Dataframe(
124
- value=leaderboard_df,
125
- wrap=True,
126
- # column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
127
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
- cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
130
- search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
131
 
132
  with gr.Row():
133
  with gr.Accordion("📚 Citation", open=False):
 
4
  import gradio as gr
5
  import pandas as pd
6
 
7
+ from texts import TITLE, DESCRIPTION, ABOUT
8
+ from process_data import load_average_data, load_hard_data, load_easy_data, load_detailed_success_rate_data, load_detailed_action_counts_data
9
  from display import custom_css
10
  BENCHMARKS_TO_SKIP = []
11
 
 
73
  df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
74
  df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
75
  # 对 Model 列应用函数,将模型名称转换为链接形式
76
+ # df['Model'] = df['Model'].apply(make_clickable_model)
77
  df = df.round(2)
78
  return df
79
 
80
+ def prep_detailed_success_rate_df():
81
+ df = load_detailed_success_rate_data()
82
+ df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
83
+ df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
84
+ df = df.round(2)
85
+ return df
86
+
87
+ def prep_detailed_action_counts_df():
88
+ df = load_detailed_action_counts_data()
89
+ df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
90
+ df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
91
+ df = df.round(2)
92
+ return df
93
 
94
+ leaderboard_df = prep_leaderboard_df()
95
+ detailed_success_rate_df = prep_detailed_success_rate_df()
96
+ detailed_action_counts_df = prep_detailed_action_counts_df()
97
 
98
  # Function to update the table based on search query
99
  def filter_and_search(cols: list[str], search_query: str, agg: str):
 
124
 
125
  with demo:
126
  gr.HTML(TITLE)
127
+ with gr.Row():
128
+ with gr.Column():
129
+ gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
130
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
131
+ with gr.TabItem("🏆 Leaderboard"):
132
+ with gr.Row():
133
+ # search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
134
+
135
+ # cols_bar = gr.CheckboxGroup(
136
+ # choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
137
+ # show_label=False,
138
+ # # info="Select columns to display",
139
+ # )
140
+ with gr.Group():
141
+ leaderboard_table = gr.Dataframe(
142
+ value=leaderboard_df,
143
+ wrap=True,
144
+ # column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
145
+ )
146
+
147
+ #cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
148
+ # search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
149
+ with gr.TabItem("Success Rates - Detailed"):
150
+ with gr.Row():
151
+ detailed_success_rate_table = gr.Dataframe(
152
+ value=detailed_success_rate_df,
153
+ wrap=True,
154
+ )
155
+
156
+ with gr.TabItem("Action Counts - Detailed"):
157
+ with gr.Row():
158
+ detailed_action_counts_table = gr.Dataframe(
159
+ value=detailed_action_counts_df,
160
+ wrap=True,
161
+ )
162
+
163
+ with gr.TabItem("About"):
164
+ gr.Markdown(ABOUT)
165
 
 
 
166
 
167
  with gr.Row():
168
  with gr.Accordion("📚 Citation", open=False):
process_data.py CHANGED
@@ -20,6 +20,20 @@ def load_average_data():
20
  })
21
  return df
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def load_hard_data():
24
  with open(hard_data_path, 'r') as f:
25
  hard_data = json.load(f)
 
20
  })
21
  return df
22
 
23
+ def load_detailed_success_rate_data():
24
+ with open(average_success_rate_file_path, 'r') as f:
25
+ detailed_success_rate_data = json.load(f)
26
+ df = pd.DataFrame(detailed_success_rate_data)
27
+ df = df.T # 转置为 model 是行,指标是列
28
+ return df
29
+
30
+ def load_detailed_action_counts_data():
31
+ with open(average_action_count_file_path, 'r') as f:
32
+ detailed_action_counts_data = json.load(f)
33
+ df = pd.DataFrame(detailed_action_counts_data)
34
+ df = df.T # 转置为 model 是行,指标是列
35
+ return df
36
+
37
  def load_hard_data():
38
  with open(hard_data_path, 'r') as f:
39
  hard_data = json.load(f)
texts.py CHANGED
@@ -8,3 +8,12 @@ DESCRIPTION = f"""
8
 
9
 
10
  """
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  """
11
+
12
+ ABOUT = """
13
+
14
+ ## About KUMO Benchmark
15
+
16
+ KUMO is a novel benchmark designed to systematically evaluate the complex reasoning capabilities of Large Language Models (LLMs) through procedurally generated reasoning games. Explore the limits of LLM reasoning and track model performance on our interactive leaderboard.
17
+
18
+
19
+ """