Repoaner committed on
Commit 42f14e1 · verified · 1 Parent(s): bc6b7bc

Update app.py

Files changed (1)
  1. app.py +251 -169
app.py CHANGED
@@ -1,9 +1,14 @@
  import gradio as gr
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
  from apscheduler.schedulers.background import BackgroundScheduler
  from huggingface_hub import snapshot_download

  from src.about import (
      CITATION_BUTTON_LABEL,
      CITATION_BUTTON_TEXT,
@@ -11,194 +16,271 @@ from src.about import (
      INTRODUCTION_TEXT,
      LLM_BENCHMARKS_TEXT,
      TITLE,
  )
  from src.display.css_html_js import custom_css
- from src.display.utils import (
-     BENCHMARK_COLS,
-     COLS,
-     EVAL_COLS,
-     EVAL_TYPES,
-     AutoEvalColumn,
-     ModelType,
-     fields,
-     WeightType,
-     Precision
- )
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval

-
- def restart_space():
-     API.restart_space(repo_id=REPO_ID)
-
- ### Space initialisation
- try:
-     print(EVAL_REQUESTS_PATH)
-     snapshot_download(
-         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
  try:
      print(EVAL_RESULTS_PATH)
      snapshot_download(
          repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
      )
  except Exception:
-     restart_space()
-
-
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
- def init_leaderboard(dataframe):
-     if dataframe is None or dataframe.empty:
-         raise ValueError("Leaderboard DataFrame is empty or None.")
-     return Leaderboard(
-         value=dataframe,
-         datatype=[c.type for c in fields(AutoEvalColumn)],
-         select_columns=SelectColumns(
-             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-             label="Select Columns to Display:",
-         ),
-         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-         filter_columns=[
-             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-             ColumnFilter(
-                 AutoEvalColumn.params.name,
-                 type="slider",
-                 min=0.01,
-                 max=150,
-                 label="Select the number of parameters (B)",
-             ),
-             ColumnFilter(
-                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-             ),
-         ],
-         bool_checkboxgroup_label="Hide models",
-         interactive=False,
-     )

- demo = gr.Blocks(css=custom_css)
- with demo:
-     gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

-     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-             leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-             with gr.Column():
-                 with gr.Row():
-                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                 with gr.Column():
-                     with gr.Accordion(
-                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             finished_eval_table = gr.components.Dataframe(
-                                 value=finished_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-                     with gr.Accordion(
-                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             running_eval_table = gr.components.Dataframe(
-                                 value=running_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-
-                     with gr.Accordion(
-                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             pending_eval_table = gr.components.Dataframe(
-                                 value=pending_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-             with gr.Row():
-                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

-             with gr.Row():
-                 with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                     model_type = gr.Dropdown(
-                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                         label="Model type",
-                         multiselect=False,
-                         value=None,
-                         interactive=True,
-                     )
-
-                 with gr.Column():
-                     precision = gr.Dropdown(
-                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                         label="Precision",
-                         multiselect=False,
-                         value="float16",
-                         interactive=True,
-                     )
-                     weight_type = gr.Dropdown(
-                         choices=[i.value.name for i in WeightType],
-                         label="Weights type",
-                         multiselect=False,
-                         value="Original",
-                         interactive=True,
-                     )
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-             submit_button = gr.Button("Submit Eval")
-             submission_result = gr.Markdown()
-             submit_button.click(
-                 add_new_eval,
-                 [
-                     model_name_textbox,
-                     base_model_name_textbox,
-                     revision_name_textbox,
-                     precision,
-                     weight_type,
-                     model_type,
-                 ],
-                 submission_result,
-             )

      with gr.Row():
-         with gr.Accordion("📙 Citation", open=False):
              citation_button = gr.Textbox(
                  value=CITATION_BUTTON_TEXT,
-                 label=CITATION_BUTTON_LABEL,
-                 lines=20,
                  elem_id="citation-button",
                  show_copy_button=True,
              )
-
  scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
  scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()
 
 
+ import os
+ import json
  import gradio as gr
  import pandas as pd
+ import numpy as np
+
+ from pathlib import Path
  from apscheduler.schedulers.background import BackgroundScheduler
  from huggingface_hub import snapshot_download

+
  from src.about import (
      CITATION_BUTTON_LABEL,
      CITATION_BUTTON_TEXT,
 
      INTRODUCTION_TEXT,
      LLM_BENCHMARKS_TEXT,
      TITLE,
+     ABOUT_TEXT
  )
  from src.display.css_html_js import custom_css
+ # from src.display.utils import (
+ #     BENCHMARK_COLS,
+ #     COLS,
+ #     EVAL_COLS,
+ #     EVAL_TYPES,
+ #     NUMERIC_INTERVALS,
+ #     TYPES,
+ #     AutoEvalColumn,
+ #     ModelType,
+ #     fields,
+ #     WeightType,
+ #     Precision
+ # )
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN

  try:
      print(EVAL_RESULTS_PATH)
      snapshot_download(
          repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
      )
  except Exception:
+     pass
+     # restart_space()

+ SUBSET_COUNTS = {
+     "Alignment-Object": 250,
+     "Alignment-Attribute": 229,
+     "Alignment-Action": 115,
+     "Alignment-Count": 55,
+     "Alignment-Location": 75,
+     "Safety-Toxicity-Crime": 29,
+     "Safety-Toxicity-Shocking": 31,
+     "Safety-Toxicity-Disgust": 42,
+     "Safety-Nsfw-Evident": 197,
+     "Safety-Nsfw-Evasive": 177,
+     "Safety-Nsfw-Subtle": 98,
+     "Quality-Distortion-Human_face": 169,
+     "Quality-Distortion-Human_limb": 152,
+     "Quality-Distortion-Object": 100,
+     "Quality-Blurry-Defocused": 350,
+     "Quality-Blurry-Motion": 350,
+     "Bias-Age": 80,
+     "Bias-Gender": 140,
+     "Bias-Race": 140,
+     "Bias-Nationality": 120,
+     "Bias-Religion": 60,
+ }
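
+ # Per-perspective sample totals: each value is the sum of its SUBSET_COUNTS
+ # entries above (e.g. Alignment = 250 + 229 + 115 + 55 + 75 = 724).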
+ PERSPECTIVE_COUNTS = {
+     "Alignment": 724,
+     "Safety": 574,
+     "Quality": 1121,
+     "Bias": 540
+ }

+ META_DATA = ['Model']
+
+
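+ # Job target for the background scheduler configured at the end of the file.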
+ def restart_space():
+     API.restart_space(repo_id=REPO_ID)
+
+
+ # color_map = {
+ #     "Score Model": "#7497db",
+ #     "Opensource VLM": "#E8ECF2",
+ #     "Closesource VLM": "#ffcd75",
+ #     "Others": "#75809c",
+
+ #     # #7497db #E8ECF2 #ffcd75 #75809c
+ # }
+ # def color_model_type_column(df, color_map):
+ #     """
+ #     Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
+
+ #     Parameters:
+ #     df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
+ #     color_map (dict): A dictionary mapping model types to colors.
+
+ #     Returns:
+ #     pd.Styler: The styled DataFrame.
+ #     """
+ #     # Function to apply color based on the model type
+ #     def apply_color(val):
+ #         color = color_map.get(val, "default")  # Default color if not specified in color_map
+ #         return f'background-color: {color}'
+
+ #     # Format for different columns
+ #     format_dict = {col: "{:.1f}" for col in df.columns if col not in META_DATA}
+ #     format_dict['Overall Score'] = "{:.2f}"
+ #     format_dict[''] = "{:d}"
+
+ #     return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
+
+ def regex_table(dataframe, regex, filter_button, style=True):
+     """
+     Takes a comma-separated string of regexes and returns only the rows whose
+     model name matches at least one of them.
+     """
+     # Split regex statement by comma and trim whitespace around regexes
+     regex_list = [x.strip() for x in regex.split(",")]
+     # Join the list into a single regex pattern with '|' acting as OR
+     combined_regex = '|'.join(regex_list)
+
+     # Hide the model types that are unchecked in the filter
+     update_scores = False
+     if isinstance(filter_button, list) or isinstance(filter_button, str):
+         if "Integrated LVLM" not in filter_button:
+             dataframe = dataframe[~dataframe["Model Type"].str.contains("Integrated LVLM", case=False, na=False)]
+         if "Interleaved LVLM" not in filter_button:
+             dataframe = dataframe[~dataframe["Model Type"].str.contains("Interleaved LVLM", case=False, na=False)]
+     # Filter the dataframe such that 'Model' contains any of the regex patterns
+     data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
+
+     data.reset_index(drop=True, inplace=True)
+
+     # Use the unnamed first column as a 1-based rank
+     data.insert(0, '', range(1, 1 + len(data)))
+
+     # if style:
+     #     # apply color
+     #     data = color_model_type_column(data, color_map)
+
+     return data
+
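+ # e.g. regex_table(df.copy(), "GPT, Gemini", ["Integrated LVLM"]) keeps only the
+ # Integrated LVLM rows whose model name contains "GPT" or "Gemini".
+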
+ def get_leaderboard_results(results_path):
+     """Concatenate every JSON results file under results_path into one DataFrame."""
+     data_dir = Path(results_path)
+     files = [d for d in os.listdir(data_dir)]  # TODO check if "Path(data_dir) / d" is a dir
+
+     df = pd.DataFrame()
+     for file in files:
+         if not file.endswith(".json"):
+             continue
+         # Join against data_dir (a Path) so this also works when results_path is a str
+         with open(data_dir / file) as rf:
+             result = json.load(rf)
+         result = pd.DataFrame(result)
+         df = pd.concat([result, df])
+     df.reset_index(drop=True, inplace=True)
+     return df
+
+
+ def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, perspective_counts=PERSPECTIVE_COUNTS):
+     # .copy() so the column assignment below does not write into a slice view
+     new_df = orig_df[meta_data + columns_name].copy()
+     new_perspective_counts = {col: perspective_counts[col] for col in columns_name}
+     # Weight each perspective by its sample count, renormalised over the
+     # selected columns only
+     total_count = sum(new_perspective_counts.values())
+     weights = {perspective: count / total_count for perspective, count in new_perspective_counts.items()}
+     def calculate_weighted_avg(row):
+         weighted_sum = sum(row[col] * weights[col] for col in columns_name)
+         return weighted_sum
+     new_df["Overall Score"] = new_df.apply(calculate_weighted_avg, axis=1)
+
+     cols = meta_data + ["Overall Score"] + columns_name
+     new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)
+     return new_df
+
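+ # Worked example (hypothetical call, assuming the frame has these columns):
+ # avg_all_perspective(df, ["Alignment", "Safety"]) weights the two scores by
+ # sample count, 724/1298 ≈ 0.558 for Alignment and 574/1298 ≈ 0.442 for Safety.
+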
+ data = {
+     "Model": [
+         "MiniGPT-5", "EMU-2", "GILL", "Anole",
+         "GPT-4o | Openjourney", "GPT-4o | SD-3", "GPT-4o | SD-XL", "GPT-4o | Flux",
+         "Gemini-1.5 | Openjourney", "Gemini-1.5 | SD-3", "Gemini-1.5 | SD-XL", "Gemini-1.5 | Flux",
+         "LLAVA-34b | Openjourney", "LLAVA-34b | SD-3", "LLAVA-34b | SD-XL", "LLAVA-34b | Flux",
+         "Qwen-VL-70b | Openjourney", "Qwen-VL-70b | SD-3", "Qwen-VL-70b | SD-XL", "Qwen-VL-70b | Flux"
+     ],
+     "Model Type": [
+         "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM",
+         "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+         "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+         "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+         "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+     ],
+     "Situational analysis": [
+         47.63, 39.65, 46.72, 48.95,
+         53.05, 53.00, 56.12, 54.97,
+         48.08, 47.48, 49.43, 47.07,
+         54.12, 54.72, 55.97, 54.23,
+         52.73, 54.98, 52.58, 54.23
+     ],
+     "Project-based learning": [
+         55.12, 46.12, 57.57, 59.05,
+         71.40, 71.20, 73.25, 68.80,
+         67.93, 68.70, 71.85, 68.33,
+         73.47, 72.55, 74.60, 71.32,
+         71.63, 71.87, 73.57, 69.47
+     ],
+     "Multi-step reasoning": [
+         42.17, 50.75, 39.33, 51.72,
+         53.67, 53.67, 53.67, 53.67,
+         60.05, 60.05, 60.05, 60.05,
+         47.28, 47.28, 47.28, 47.28,
+         55.63, 55.63, 55.63, 55.63
+     ],
+     "AVG": [
+         50.92, 45.33, 51.58, 55.22,
+         63.65, 63.52, 65.47, 62.63,
+         61.57, 61.87, 64.15, 61.55,
+         63.93, 63.57, 65.05, 62.73,
+         64.05, 64.75, 65.12, 63.18
+     ]
+ }
+ df = pd.DataFrame(data)
+ total_models = len(df)
+
+ with gr.Blocks(css=custom_css) as app:
      with gr.Row():
+         with gr.Column(scale=6):
+             gr.Markdown(INTRODUCTION_TEXT.format(str(total_models)))
+         with gr.Column(scale=4):
+             gr.Markdown("![](https://huggingface.co/spaces/MMIE/Leaderboard/blob/main/src/overview.jpeg)")
+             # gr.HTML(BGB_LOGO, elem_classes="logo")
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏆 MMIE Leaderboard"):
+             with gr.Row():
+                 search_overall = gr.Textbox(
+                     label="Model Search (delimit with , )",
+                     placeholder="🔍 Search model (separate multiple queries with `,`) and press ENTER...",
+                     show_label=False
+                 )
+                 model_type_overall = gr.CheckboxGroup(
+                     choices=["Interleaved LVLM", "Integrated LVLM"],
+                     value=["Interleaved LVLM", "Integrated LVLM"],
+                     label="Model Type",
+                     show_label=False,
+                     interactive=True,
+                 )
+             with gr.Row():
+                 mmie_table_overall_hidden = gr.Dataframe(
+                     df,
+                     headers=df.columns.tolist(),
+                     elem_id="mmie_leadboard_overall_hidden",
+                     wrap=True,
+                     visible=False,
+                 )
+                 mmie_table_overall = gr.Dataframe(
+                     regex_table(
+                         df.copy(),
+                         "",
+                         ["Interleaved LVLM", "Integrated LVLM"]
+                     ),
+                     headers=df.columns.tolist(),
+                     elem_id="mmie_leadboard_overall",
+                     wrap=True,
+                 )
+         with gr.TabItem("About"):
+             with gr.Row():
+                 gr.Markdown(ABOUT_TEXT)
+
+         with gr.Accordion("📚 Citation", open=False):
              citation_button = gr.Textbox(
                  value=CITATION_BUTTON_TEXT,
+                 lines=7,
+                 label="Copy the following to cite these results.",
                  elem_id="citation-button",
                  show_copy_button=True,
              )
+
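+     # On any change to the search text or the model-type checkboxes, re-run
+     # regex_table over the hidden full table and refresh the visible table.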
+     search_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall)
+     model_type_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall)
+
  scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=18000)  # restart every 5 hours (18000 s)
  scheduler.start()
+ # app.queue(default_concurrency_limit=40).launch()
+ app.launch(allowed_paths=['./', "./src", "./evals"])