update tabs
Browse files- app.py +57 -22
- process_data.py +14 -0
- texts.py +9 -0
app.py
CHANGED
@@ -4,8 +4,8 @@ from pathlib import Path
|
|
4 |
import gradio as gr
|
5 |
import pandas as pd
|
6 |
|
7 |
-
from texts import TITLE, DESCRIPTION
|
8 |
-
from process_data import load_average_data, load_hard_data, load_easy_data
|
9 |
from display import custom_css
|
10 |
BENCHMARKS_TO_SKIP = []
|
11 |
|
@@ -73,12 +73,27 @@ def prep_leaderboard_df():
|
|
73 |
df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
|
74 |
df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
|
75 |
# 对 Model 列应用函数,将模型名称转换为链接形式
|
76 |
-
df['Model'] = df['Model'].apply(make_clickable_model)
|
77 |
df = df.round(2)
|
78 |
return df
|
79 |
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
|
|
|
|
|
|
82 |
|
83 |
# Function to update the table based on search query
|
84 |
def filter_and_search(cols: list[str], search_query: str, agg: str):
|
@@ -109,25 +124,45 @@ demo = gr.Blocks(css=custom_css)
|
|
109 |
|
110 |
with demo:
|
111 |
gr.HTML(TITLE)
|
112 |
-
with gr.
|
113 |
-
gr.
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
#
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
-
cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
|
130 |
-
search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
|
131 |
|
132 |
with gr.Row():
|
133 |
with gr.Accordion("📚 Citation", open=False):
|
|
|
4 |
import gradio as gr
|
5 |
import pandas as pd
|
6 |
|
7 |
+
from texts import TITLE, DESCRIPTION, ABOUT
|
8 |
+
from process_data import load_average_data, load_hard_data, load_easy_data, load_detailed_success_rate_data, load_detailed_action_counts_data
|
9 |
from display import custom_css
|
10 |
BENCHMARKS_TO_SKIP = []
|
11 |
|
|
|
73 |
df.insert(0, "Model", [map_model_name(idx) for idx in df.index])
|
74 |
df.insert(1, "Model Type", [map_model_type(idx) for idx in df.index])
|
75 |
# 对 Model 列应用函数,将模型名称转换为链接形式
|
76 |
+
# df['Model'] = df['Model'].apply(make_clickable_model)
|
77 |
df = df.round(2)
|
78 |
return df
|
79 |
|
80 |
+
def prep_detailed_success_rate_df():
    """Build the detailed success-rate table for the leaderboard tab.

    Loads the per-model success-rate metrics, prepends display columns
    ("Model", "Model Type") derived from the DataFrame index, and rounds
    all numeric values to two decimals.
    """
    table = load_detailed_success_rate_data()
    # The index holds raw model identifiers; map them to display values.
    table.insert(0, "Model", [map_model_name(model_id) for model_id in table.index])
    table.insert(1, "Model Type", [map_model_type(model_id) for model_id in table.index])
    return table.round(2)
86 |
+
|
87 |
+
def prep_detailed_action_counts_df():
    """Build the detailed action-counts table for the leaderboard tab.

    Loads the per-model action-count metrics, prepends display columns
    ("Model", "Model Type") derived from the DataFrame index, and rounds
    all numeric values to two decimals.
    """
    table = load_detailed_action_counts_data()
    # The index holds raw model identifiers; map them to display values.
    table.insert(0, "Model", [map_model_name(model_id) for model_id in table.index])
    table.insert(1, "Model Type", [map_model_type(model_id) for model_id in table.index])
    return table.round(2)
93 |
|
94 |
+
# Materialize all three leaderboard tables once at module import time so the
# Gradio UI components below can reference them directly as static values.
leaderboard_df = prep_leaderboard_df()
detailed_success_rate_df = prep_detailed_success_rate_df()
detailed_action_counts_df = prep_detailed_action_counts_df()
97 |
|
98 |
# Function to update the table based on search query
|
99 |
def filter_and_search(cols: list[str], search_query: str, agg: str):
|
|
|
124 |
|
125 |
with demo:
|
126 |
gr.HTML(TITLE)
|
127 |
+
with gr.Row():
|
128 |
+
with gr.Column():
|
129 |
+
gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
|
130 |
+
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
131 |
+
with gr.TabItem("🏆 Leaderboard"):
|
132 |
+
with gr.Row():
|
133 |
+
# search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
|
134 |
+
|
135 |
+
# cols_bar = gr.CheckboxGroup(
|
136 |
+
# choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
|
137 |
+
# show_label=False,
|
138 |
+
# # info="Select columns to display",
|
139 |
+
# )
|
140 |
+
with gr.Group():
|
141 |
+
leaderboard_table = gr.Dataframe(
|
142 |
+
value=leaderboard_df,
|
143 |
+
wrap=True,
|
144 |
+
# column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
|
145 |
+
)
|
146 |
+
|
147 |
+
#cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
|
148 |
+
# search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
|
149 |
+
with gr.TabItem("Success Rates - Detailed"):
|
150 |
+
with gr.Row():
|
151 |
+
detailed_success_rate_table = gr.Dataframe(
|
152 |
+
value=detailed_success_rate_df,
|
153 |
+
wrap=True,
|
154 |
+
)
|
155 |
+
|
156 |
+
with gr.TabItem("Action Counts - Detailed"):
|
157 |
+
with gr.Row():
|
158 |
+
detailed_action_counts_table = gr.Dataframe(
|
159 |
+
value=detailed_action_counts_df,
|
160 |
+
wrap=True,
|
161 |
+
)
|
162 |
+
|
163 |
+
with gr.TabItem("About"):
|
164 |
+
gr.Markdown(ABOUT)
|
165 |
|
|
|
|
|
166 |
|
167 |
with gr.Row():
|
168 |
with gr.Accordion("📚 Citation", open=False):
|
process_data.py
CHANGED
@@ -20,6 +20,20 @@ def load_average_data():
|
|
20 |
})
|
21 |
return df
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
def load_hard_data():
|
24 |
with open(hard_data_path, 'r') as f:
|
25 |
hard_data = json.load(f)
|
|
|
20 |
})
|
21 |
return df
|
22 |
|
23 |
+
def load_detailed_success_rate_data():
    """Load the detailed success-rate metrics from disk.

    Reads the JSON file at ``average_success_rate_file_path`` and returns a
    DataFrame transposed so that each model is a row and each metric a column.
    """
    with open(average_success_rate_file_path, 'r') as fh:
        raw = json.load(fh)
    # Transpose: the raw frame has metrics as rows; we want models as rows.
    return pd.DataFrame(raw).T
29 |
+
|
30 |
+
def load_detailed_action_counts_data():
    """Load the detailed action-count metrics from disk.

    Reads the JSON file at ``average_action_count_file_path`` and returns a
    DataFrame transposed so that each model is a row and each metric a column.
    """
    with open(average_action_count_file_path, 'r') as fh:
        raw = json.load(fh)
    # Transpose: the raw frame has metrics as rows; we want models as rows.
    return pd.DataFrame(raw).T
36 |
+
|
37 |
def load_hard_data():
|
38 |
with open(hard_data_path, 'r') as f:
|
39 |
hard_data = json.load(f)
|
texts.py
CHANGED
@@ -8,3 +8,12 @@ DESCRIPTION = f"""
|
|
8 |
|
9 |
|
10 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
|
10 |
"""
|
11 |
+
|
12 |
+
ABOUT = """
|
13 |
+
|
14 |
+
## About KUMO Benchmark
|
15 |
+
|
16 |
+
KUMO is a novel benchmark designed to systematically evaluate the complex reasoning capabilities of Large Language Models (LLMs) through procedurally generated reasoning games. Explore the limits of LLM reasoning and track model performance on our interactive leaderboard.
|
17 |
+
|
18 |
+
|
19 |
+
"""
|