Repoaner committed on
Commit 42f14e1 · verified · 1 Parent(s): bc6b7bc

Update app.py

Files changed (1)
  1. app.py +251 -169
app.py CHANGED
@@ -1,9 +1,14 @@
  import gradio as gr
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
  from apscheduler.schedulers.background import BackgroundScheduler
  from huggingface_hub import snapshot_download

  from src.about import (
      CITATION_BUTTON_LABEL,
      CITATION_BUTTON_TEXT,
@@ -11,194 +16,271 @@ from src.about import (
      INTRODUCTION_TEXT,
      LLM_BENCHMARKS_TEXT,
      TITLE,
  )
  from src.display.css_html_js import custom_css
- from src.display.utils import (
-     BENCHMARK_COLS,
-     COLS,
-     EVAL_COLS,
-     EVAL_TYPES,
-     AutoEvalColumn,
-     ModelType,
-     fields,
-     WeightType,
-     Precision
- )
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval

-
- def restart_space():
-     API.restart_space(repo_id=REPO_ID)
-
- ### Space initialisation
- try:
-     print(EVAL_REQUESTS_PATH)
-     snapshot_download(
-         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
  try:
      print(EVAL_RESULTS_PATH)
      snapshot_download(
          repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
      )
  except Exception:
-     restart_space()
-
-
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
- def init_leaderboard(dataframe):
-     if dataframe is None or dataframe.empty:
-         raise ValueError("Leaderboard DataFrame is empty or None.")
-     return Leaderboard(
-         value=dataframe,
-         datatype=[c.type for c in fields(AutoEvalColumn)],
-         select_columns=SelectColumns(
-             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-             label="Select Columns to Display:",
-         ),
-         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-         filter_columns=[
-             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-             ColumnFilter(
-                 AutoEvalColumn.params.name,
-                 type="slider",
-                 min=0.01,
-                 max=150,
-                 label="Select the number of parameters (B)",
-             ),
-             ColumnFilter(
-                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-             ),
-         ],
-         bool_checkboxgroup_label="Hide models",
-         interactive=False,
-     )

- demo = gr.Blocks(css=custom_css)
- with demo:
-     gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

-     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-             leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-             with gr.Column():
-                 with gr.Row():
-                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                 with gr.Column():
-                     with gr.Accordion(
-                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             finished_eval_table = gr.components.Dataframe(
-                                 value=finished_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-                     with gr.Accordion(
-                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             running_eval_table = gr.components.Dataframe(
-                                 value=running_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-
-                     with gr.Accordion(
-                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             pending_eval_table = gr.components.Dataframe(
-                                 value=pending_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-             with gr.Row():
-                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

-             with gr.Row():
-                 with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                     model_type = gr.Dropdown(
-                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                         label="Model type",
-                         multiselect=False,
-                         value=None,
-                         interactive=True,
-                     )
-
-                 with gr.Column():
-                     precision = gr.Dropdown(
-                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                         label="Precision",
-                         multiselect=False,
-                         value="float16",
-                         interactive=True,
-                     )
-                     weight_type = gr.Dropdown(
-                         choices=[i.value.name for i in WeightType],
-                         label="Weights type",
-                         multiselect=False,
-                         value="Original",
-                         interactive=True,
-                     )
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-             submit_button = gr.Button("Submit Eval")
-             submission_result = gr.Markdown()
-             submit_button.click(
-                 add_new_eval,
-                 [
-                     model_name_textbox,
-                     base_model_name_textbox,
-                     revision_name_textbox,
-                     precision,
-                     weight_type,
-                     model_type,
-                 ],
-                 submission_result,
-             )

      with gr.Row():
-         with gr.Accordion("📙 Citation", open=False):
              citation_button = gr.Textbox(
                  value=CITATION_BUTTON_TEXT,
-                 label=CITATION_BUTTON_LABEL,
-                 lines=20,
                  elem_id="citation-button",
                  show_copy_button=True,
              )
-
  scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
  scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()
 
 
+ import os
+ import json
  import gradio as gr
  import pandas as pd
+ import numpy as np
+
+ from pathlib import Path
  from apscheduler.schedulers.background import BackgroundScheduler
  from huggingface_hub import snapshot_download

+
  from src.about import (
      CITATION_BUTTON_LABEL,
      CITATION_BUTTON_TEXT,
 
      INTRODUCTION_TEXT,
      LLM_BENCHMARKS_TEXT,
      TITLE,
+     ABOUT_TEXT
  )
  from src.display.css_html_js import custom_css
+ # from src.display.utils import (
+ #     BENCHMARK_COLS,
+ #     COLS,
+ #     EVAL_COLS,
+ #     EVAL_TYPES,
+ #     NUMERIC_INTERVALS,
+ #     TYPES,
+ #     AutoEvalColumn,
+ #     ModelType,
+ #     fields,
+ #     WeightType,
+ #     Precision
+ # )
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN

  try:
      print(EVAL_RESULTS_PATH)
      snapshot_download(
          repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
      )
  except Exception:
+     pass
+     # restart_space()

+ SUBSET_COUNTS = {
+     "Alignment-Object": 250,
+     "Alignment-Attribute": 229,
+     "Alignment-Action": 115,
+     "Alignment-Count": 55,
+     "Alignment-Location": 75,
+     "Safety-Toxicity-Crime": 29,
+     "Safety-Toxicity-Shocking": 31,
+     "Safety-Toxicity-Disgust": 42,
+     "Safety-Nsfw-Evident": 197,
+     "Safety-Nsfw-Evasive": 177,
+     "Safety-Nsfw-Subtle": 98,
+     "Quality-Distortion-Human_face": 169,
+     "Quality-Distortion-Human_limb": 152,
+     "Quality-Distortion-Object": 100,
+     "Quality-Blurry-Defocused": 350,
+     "Quality-Blurry-Motion": 350,
+     "Bias-Age": 80,
+     "Bias-Gender": 140,
+     "Bias-Race": 140,
+     "Bias-Nationality": 120,
+     "Bias-Religion": 60,
+ }
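
+ # Per-perspective sample totals: each value is the sum of its SUBSET_COUNTS
+ # entries above (e.g. Alignment = 250 + 229 + 115 + 55 + 75 = 724).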
+ PERSPECTIVE_COUNTS = {
+     "Alignment": 724,
+     "Safety": 574,
+     "Quality": 1121,
+     "Bias": 540
+ }

+ META_DATA = ['Model']
+
+
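+ # Job target for the background scheduler configured at the end of the file.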
+ def restart_space():
+     API.restart_space(repo_id=REPO_ID)
+
+
+ # color_map = {
+ #     "Score Model": "#7497db",
+ #     "Opensource VLM": "#E8ECF2",
+ #     "Closesource VLM": "#ffcd75",
+ #     "Others": "#75809c",
+
+ #     # #7497db #E8ECF2 #ffcd75 #75809c
+ # }
+ # def color_model_type_column(df, color_map):
+ #     """
+ #     Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
+
+ #     Parameters:
+ #     df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
+ #     color_map (dict): A dictionary mapping model types to colors.
+
+ #     Returns:
+ #     pd.Styler: The styled DataFrame.
+ #     """
+ #     # Function to apply color based on the model type
+ #     def apply_color(val):
+ #         color = color_map.get(val, "default")  # Default color if not specified in color_map
+ #         return f'background-color: {color}'
+
+ #     # Format for different columns
+ #     format_dict = {col: "{:.1f}" for col in df.columns if col not in META_DATA}
+ #     format_dict['Overall Score'] = "{:.2f}"
+ #     format_dict[''] = "{:d}"
+
+ #     return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
+
+ def regex_table(dataframe, regex, filter_button, style=True):
+     """
+     Takes a comma-separated string of regexes and returns only the rows whose
+     model name matches at least one of them.
+     """
+     # Split regex statement by comma and trim whitespace around regexes
+     regex_list = [x.strip() for x in regex.split(",")]
+     # Join the list into a single regex pattern with '|' acting as OR
+     combined_regex = '|'.join(regex_list)
+
+     # Hide the model types that are unchecked in the filter
+     update_scores = False
+     if isinstance(filter_button, list) or isinstance(filter_button, str):
+         if "Integrated LVLM" not in filter_button:
+             dataframe = dataframe[~dataframe["Model Type"].str.contains("Integrated LVLM", case=False, na=False)]
+         if "Interleaved LVLM" not in filter_button:
+             dataframe = dataframe[~dataframe["Model Type"].str.contains("Interleaved LVLM", case=False, na=False)]
+     # Filter the dataframe such that 'Model' contains any of the regex patterns
+     data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
+
+     data.reset_index(drop=True, inplace=True)
+
+     # Use the unnamed first column as a 1-based rank
+     data.insert(0, '', range(1, 1 + len(data)))
+
+     # if style:
+     #     # apply color
+     #     data = color_model_type_column(data, color_map)
+
+     return data
+
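+ # e.g. regex_table(df.copy(), "GPT, Gemini", ["Integrated LVLM"]) keeps only the
+ # Integrated LVLM rows whose model name contains "GPT" or "Gemini".
+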
+ def get_leaderboard_results(results_path):
+     """Concatenate every JSON results file under results_path into one DataFrame."""
+     data_dir = Path(results_path)
+     files = [d for d in os.listdir(data_dir)]  # TODO check if "Path(data_dir) / d" is a dir
+
+     df = pd.DataFrame()
+     for file in files:
+         if not file.endswith(".json"):
+             continue
+         # Join against data_dir (a Path) so this also works when results_path is a str
+         with open(data_dir / file) as rf:
+             result = json.load(rf)
+         result = pd.DataFrame(result)
+         df = pd.concat([result, df])
+     df.reset_index(drop=True, inplace=True)
+     return df
+
+
+ def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, perspective_counts=PERSPECTIVE_COUNTS):
+     # .copy() so the column assignment below does not write into a slice view
+     new_df = orig_df[meta_data + columns_name].copy()
+     new_perspective_counts = {col: perspective_counts[col] for col in columns_name}
+     # Weight each perspective by its sample count, renormalised over the
+     # selected columns only
+     total_count = sum(new_perspective_counts.values())
+     weights = {perspective: count / total_count for perspective, count in new_perspective_counts.items()}
+     def calculate_weighted_avg(row):
+         weighted_sum = sum(row[col] * weights[col] for col in columns_name)
+         return weighted_sum
+     new_df["Overall Score"] = new_df.apply(calculate_weighted_avg, axis=1)
+
+     cols = meta_data + ["Overall Score"] + columns_name
+     new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)
+     return new_df
+
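+ # Worked example (hypothetical call, assuming the frame has these columns):
+ # avg_all_perspective(df, ["Alignment", "Safety"]) weights the two scores by
+ # sample count, 724/1298 ≈ 0.558 for Alignment and 574/1298 ≈ 0.442 for Safety.
+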
+ data = {
+     "Model": [
+         "MiniGPT-5", "EMU-2", "GILL", "Anole",
+         "GPT-4o | Openjourney", "GPT-4o | SD-3", "GPT-4o | SD-XL", "GPT-4o | Flux",
+         "Gemini-1.5 | Openjourney", "Gemini-1.5 | SD-3", "Gemini-1.5 | SD-XL", "Gemini-1.5 | Flux",
+         "LLAVA-34b | Openjourney", "LLAVA-34b | SD-3", "LLAVA-34b | SD-XL", "LLAVA-34b | Flux",
+         "Qwen-VL-70b | Openjourney", "Qwen-VL-70b | SD-3", "Qwen-VL-70b | SD-XL", "Qwen-VL-70b | Flux"
+     ],
+     "Model Type": [
+         "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM",
+         "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+         "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+         "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+         "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+     ],
+     "Situational analysis": [
+         47.63, 39.65, 46.72, 48.95,
+         53.05, 53.00, 56.12, 54.97,
+         48.08, 47.48, 49.43, 47.07,
+         54.12, 54.72, 55.97, 54.23,
+         52.73, 54.98, 52.58, 54.23
+     ],
+     "Project-based learning": [
+         55.12, 46.12, 57.57, 59.05,
+         71.40, 71.20, 73.25, 68.80,
+         67.93, 68.70, 71.85, 68.33,
+         73.47, 72.55, 74.60, 71.32,
+         71.63, 71.87, 73.57, 69.47
+     ],
+     "Multi-step reasoning": [
+         42.17, 50.75, 39.33, 51.72,
+         53.67, 53.67, 53.67, 53.67,
+         60.05, 60.05, 60.05, 60.05,
+         47.28, 47.28, 47.28, 47.28,
+         55.63, 55.63, 55.63, 55.63
+     ],
+     "AVG": [
+         50.92, 45.33, 51.58, 55.22,
+         63.65, 63.52, 65.47, 62.63,
+         61.57, 61.87, 64.15, 61.55,
+         63.93, 63.57, 65.05, 62.73,
+         64.05, 64.75, 65.12, 63.18
+     ]
+ }
+ df = pd.DataFrame(data)
+ total_models = len(df)
+
+ with gr.Blocks(css=custom_css) as app:
      with gr.Row():
+         with gr.Column(scale=6):
+             gr.Markdown(INTRODUCTION_TEXT.format(str(total_models)))
+         with gr.Column(scale=4):
+             gr.Markdown("![](https://huggingface.co/spaces/MMIE/Leaderboard/blob/main/src/overview.jpeg)")
+             # gr.HTML(BGB_LOGO, elem_classes="logo")
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏆 MMIE Leaderboard"):
+             with gr.Row():
+                 search_overall = gr.Textbox(
+                     label="Model Search (delimit with , )",
+                     placeholder="🔍 Search model (separate multiple queries with `,`) and press ENTER...",
+                     show_label=False
+                 )
+                 model_type_overall = gr.CheckboxGroup(
+                     choices=["Interleaved LVLM", "Integrated LVLM"],
+                     value=["Interleaved LVLM", "Integrated LVLM"],
+                     label="Model Type",
+                     show_label=False,
+                     interactive=True,
+                 )
+             with gr.Row():
+                 mmie_table_overall_hidden = gr.Dataframe(
+                     df,
+                     headers=df.columns.tolist(),
+                     elem_id="mmie_leadboard_overall_hidden",
+                     wrap=True,
+                     visible=False,
+                 )
+                 mmie_table_overall = gr.Dataframe(
+                     regex_table(
+                         df.copy(),
+                         "",
+                         ["Interleaved LVLM", "Integrated LVLM"]
+                     ),
+                     headers=df.columns.tolist(),
+                     elem_id="mmie_leadboard_overall",
+                     wrap=True,
+                 )
+         with gr.TabItem("About"):
+             with gr.Row():
+                 gr.Markdown(ABOUT_TEXT)
+
+         with gr.Accordion("📚 Citation", open=False):
              citation_button = gr.Textbox(
                  value=CITATION_BUTTON_TEXT,
+                 lines=7,
+                 label="Copy the following to cite these results.",
                  elem_id="citation-button",
                  show_copy_button=True,
              )
+
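+     # On any change to the search text or the model-type checkboxes, re-run
+     # regex_table over the hidden full table and refresh the visible table.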
+     search_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall)
+     model_type_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall)
+
  scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=18000)  # restart every 5 hours (18000 s)
  scheduler.start()
+ # app.queue(default_concurrency_limit=40).launch()
+ app.launch(allowed_paths=['./', "./src", "./evals"])