Bagratuni committed on
Commit
c633720
·
1 Parent(s): 6b4ef20
app.py CHANGED
@@ -1,8 +1,8 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
4
- from model_handler import ModelHandler
5
- from data_handler import unified_exam_result_table, mmlu_result_table, unified_exam_chart, mmlu_chart
6
 
7
  global_unified_exam_df = None
8
  global_mmlu_df = None
@@ -96,8 +96,17 @@ def main():
96
  ]
97
  }
98
  ```
99
- 3. **Submit your model**:
100
- - Add the `arm_bench` tag and the `result.json` file to your model card.
 
 
 
 
 
 
 
 
 
101
  - Click on the "Refresh Data" button in this app, and you will see your model's results.
102
  """
103
  )
 
1
  import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
4
+ from data.model_handler import ModelHandler
5
+ from data.data_handler import unified_exam_result_table, mmlu_result_table, unified_exam_chart, mmlu_chart
6
 
7
  global_unified_exam_df = None
8
  global_mmlu_df = None
 
96
  ]
97
  }
98
  ```
99
+ 3. **Important Notes**:
100
+ - For **`mmlu_results`**:
101
+ - The following categories must be included in the `mmlu_results` for the model to be considered valid:
102
+ - "Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering", "Health", "History", "Law", "Math", "Other", "Philosophy", "Physics", "Psychology", "Average"
103
+ - If any of these categories are missing, the model will not be added to the evaluation.
104
+ - For **`unified_exam_results`**:
105
+ - The following categories must be included in the `unified_exam_results` for the model to be considered valid:
106
+ - "Average", "Armenian language and literature", "Armenian history", "Mathematics"
107
+ - If any of these categories are missing, the model will not be added to the evaluation.
108
+ 4. **Submit your model**:
109
+ - Add the `Arm-LLM-Bench` tag and the `result.json` file to your model card.
110
  - Click on the "Refresh Data" button in this app, and you will see your model's results.
111
  """
112
  )
data_handler.py → data/data_handler.py RENAMED
@@ -1,12 +1,10 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
4
- from model_handler import ModelHandler
5
 
6
  def unified_exam_result_table(unified_exam_df):
7
  df = unified_exam_df.copy()
8
- numeric_columns = df.select_dtypes(include=["number"])
9
- df["Average"] = numeric_columns.mean(axis=1)
10
  df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
11
  df.insert(0, 'Rank', range(1, len(df) + 1))
12
  cols = df.columns.tolist()
@@ -18,8 +16,6 @@ def unified_exam_result_table(unified_exam_df):
18
 
19
  def mmlu_result_table(mmlu_df):
20
  df = mmlu_df.copy()
21
- numeric_columns = df.select_dtypes(include=["number"])
22
- df["Average"] = numeric_columns.mean(axis=1)
23
  df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
24
  df.insert(0, 'Rank', range(1, len(df) + 1))
25
  cols = df.columns.tolist()
 
1
  import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
4
+ from data.model_handler import ModelHandler
5
 
6
  def unified_exam_result_table(unified_exam_df):
7
  df = unified_exam_df.copy()
 
 
8
  df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
9
  df.insert(0, 'Rank', range(1, len(df) + 1))
10
  cols = df.columns.tolist()
 
16
 
17
  def mmlu_result_table(mmlu_df):
18
  df = mmlu_df.copy()
 
 
19
  df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
20
  df.insert(0, 'Rank', range(1, len(df) + 1))
21
  cols = df.columns.tolist()
model_handler.py → data/model_handler.py RENAMED
@@ -5,8 +5,10 @@ from typing import Any, Dict
5
  import pandas as pd
6
  from huggingface_hub import HfApi, hf_hub_download
7
 
 
 
8
  class ModelHandler:
9
- def __init__(self, model_infos_path="model_results.json"):
10
  self.api = HfApi()
11
  self.model_infos_path = model_infos_path
12
  self.model_infos = self._load_model_infos()
@@ -23,7 +25,7 @@ class ModelHandler:
23
  json.dump(self.model_infos, f, indent=4)
24
 
25
  def get_arm_bench_data(self):
26
- models = self.api.list_models(filter="arm_llm")
27
  model_names = {model["model_name"] for model in self.model_infos}
28
  repositories = [model.modelId for model in models]
29
 
@@ -63,16 +65,22 @@ class ModelHandler:
63
 
64
  if mmlu_results:
65
  mmlu_row = {"Model": model_name}
66
- for result in mmlu_results:
67
- mmlu_row[result["category"]] = result["score"]
68
- mmlu_data.append(mmlu_row)
 
 
 
69
 
70
  if unified_exam_results:
71
  unified_exam_row = {"Model": model_name}
72
- for result in unified_exam_results:
73
- unified_exam_row[result["category"]] = result["score"]
74
- unified_exam_data.append(unified_exam_row)
75
 
 
 
 
 
 
 
76
 
77
  mmlu_df = pd.DataFrame(mmlu_data)
78
  unified_exam_df = pd.DataFrame(unified_exam_data)
 
5
  import pandas as pd
6
  from huggingface_hub import HfApi, hf_hub_download
7
 
8
+ from data.required_categories import required_mmlu_categories, required_unified_exam_categories
9
+
10
  class ModelHandler:
11
+ def __init__(self, model_infos_path="data\model_results.json"):
12
  self.api = HfApi()
13
  self.model_infos_path = model_infos_path
14
  self.model_infos = self._load_model_infos()
 
25
  json.dump(self.model_infos, f, indent=4)
26
 
27
  def get_arm_bench_data(self):
28
+ models = self.api.list_models(filter="Arm-LLM-Benchmark")
29
  model_names = {model["model_name"] for model in self.model_infos}
30
  repositories = [model.modelId for model in models]
31
 
 
65
 
66
  if mmlu_results:
67
  mmlu_row = {"Model": model_name}
68
+
69
+ mmlu_categories = {result["category"] for result in mmlu_results}
70
+ if all(category in mmlu_categories for category in required_mmlu_categories):
71
+ for result in mmlu_results:
72
+ mmlu_row[result["category"]] = result["score"]
73
+ mmlu_data.append(mmlu_row)
74
 
75
  if unified_exam_results:
76
  unified_exam_row = {"Model": model_name}
 
 
 
77
 
78
+ unified_exam_categories = {result["category"] for result in unified_exam_results}
79
+
80
+ if all(category in unified_exam_categories for category in required_unified_exam_categories):
81
+ for result in unified_exam_results:
82
+ unified_exam_row[result["category"]] = result["score"]
83
+ unified_exam_data.append(unified_exam_row)
84
 
85
  mmlu_df = pd.DataFrame(mmlu_data)
86
  unified_exam_df = pd.DataFrame(unified_exam_data)
model_results.json → data/model_results.json RENAMED
@@ -4,6 +4,10 @@
4
  "results": {
5
  "mmlu_results": [],
6
  "unified_exam_results": [
 
 
 
 
7
  {
8
  "category": "Armenian language and literature",
9
  "score": 10.5
@@ -23,6 +27,10 @@
23
  "model_name": "claude-3-5-sonnet-20241022",
24
  "results": {
25
  "mmlu_results": [
 
 
 
 
26
  {
27
  "category": "Biology",
28
  "score": 0.8667
@@ -81,6 +89,10 @@
81
  }
82
  ],
83
  "unified_exam_results": [
 
 
 
 
84
  {
85
  "category": "Armenian language and literature",
86
  "score": 10.0
@@ -100,6 +112,10 @@
100
  "model_name": "gemini-2.0-flash",
101
  "results": {
102
  "mmlu_results": [
 
 
 
 
103
  {
104
  "category": "Biology",
105
  "score": 0.85
@@ -158,6 +174,10 @@
158
  }
159
  ],
160
  "unified_exam_results": [
 
 
 
 
161
  {
162
  "category": "Armenian language and literature",
163
  "score": 5.5
@@ -177,6 +197,10 @@
177
  "model_name": "gpt-4o",
178
  "results": {
179
  "mmlu_results": [
 
 
 
 
180
  {
181
  "category": "Biology",
182
  "score": 0.8667
@@ -235,6 +259,10 @@
235
  }
236
  ],
237
  "unified_exam_results": [
 
 
 
 
238
  {
239
  "category": "Armenian language and literature",
240
  "score": 6.75
@@ -255,6 +283,10 @@
255
  "results": {
256
  "mmlu_results": [],
257
  "unified_exam_results": [
 
 
 
 
258
  {
259
  "category": "Armenian language and literature",
260
  "score": 7.25
@@ -274,6 +306,10 @@
274
  "model_name": "gemini-1.5-flash",
275
  "results": {
276
  "mmlu_results": [
 
 
 
 
277
  {
278
  "category": "Biology",
279
  "score": 0.75
@@ -332,6 +368,10 @@
332
  }
333
  ],
334
  "unified_exam_results": [
 
 
 
 
335
  {
336
  "category": "Armenian language and literature",
337
  "score": 4.75
@@ -351,6 +391,10 @@
351
  "model_name": "DeepSeek-V3",
352
  "results": {
353
  "mmlu_results": [
 
 
 
 
354
  {
355
  "category": "Biology",
356
  "score": 0.8167
@@ -409,6 +453,10 @@
409
  }
410
  ],
411
  "unified_exam_results": [
 
 
 
 
412
  {
413
  "category": "Armenian language and literature",
414
  "score": 5.25
@@ -428,6 +476,10 @@
428
  "model_name": "Meta-Llama-3.3-70B-Instruct",
429
  "results": {
430
  "mmlu_results": [
 
 
 
 
431
  {
432
  "category": "Biology",
433
  "score": 0.7333
@@ -486,6 +538,10 @@
486
  }
487
  ],
488
  "unified_exam_results": [
 
 
 
 
489
  {
490
  "category": "Armenian language and literature",
491
  "score": 4.5
@@ -505,6 +561,10 @@
505
  "model_name": "claude-3-5-haiku-20241022",
506
  "results": {
507
  "mmlu_results": [
 
 
 
 
508
  {
509
  "category": "Biology",
510
  "score": 0.75
@@ -563,6 +623,10 @@
563
  }
564
  ],
565
  "unified_exam_results": [
 
 
 
 
566
  {
567
  "category": "Armenian language and literature",
568
  "score": 5.0
 
4
  "results": {
5
  "mmlu_results": [],
6
  "unified_exam_results": [
7
+ {
8
+ "category": "Average",
9
+ "score": 11.0833
10
+ },
11
  {
12
  "category": "Armenian language and literature",
13
  "score": 10.5
 
27
  "model_name": "claude-3-5-sonnet-20241022",
28
  "results": {
29
  "mmlu_results": [
30
+ {
31
+ "category": "Average",
32
+ "score": 0.6958
33
+ },
34
  {
35
  "category": "Biology",
36
  "score": 0.8667
 
89
  }
90
  ],
91
  "unified_exam_results": [
92
+ {
93
+ "category": "Average",
94
+ "score": 10.6667
95
+ },
96
  {
97
  "category": "Armenian language and literature",
98
  "score": 10.0
 
112
  "model_name": "gemini-2.0-flash",
113
  "results": {
114
  "mmlu_results": [
115
+ {
116
+ "category": "Average",
117
+ "score": 0.7247
118
+ },
119
  {
120
  "category": "Biology",
121
  "score": 0.85
 
174
  }
175
  ],
176
  "unified_exam_results": [
177
+ {
178
+ "category": "Average",
179
+ "score": 9.8333
180
+ },
181
  {
182
  "category": "Armenian language and literature",
183
  "score": 5.5
 
197
  "model_name": "gpt-4o",
198
  "results": {
199
  "mmlu_results": [
200
+ {
201
+ "category": "Average",
202
+ "score": 0.6758
203
+ },
204
  {
205
  "category": "Biology",
206
  "score": 0.8667
 
259
  }
260
  ],
261
  "unified_exam_results": [
262
+ {
263
+ "category": "Average",
264
+ "score": 8.9167
265
+ },
266
  {
267
  "category": "Armenian language and literature",
268
  "score": 6.75
 
283
  "results": {
284
  "mmlu_results": [],
285
  "unified_exam_results": [
286
+ {
287
+ "category": "Average",
288
+ "score": 8.6667
289
+ },
290
  {
291
  "category": "Armenian language and literature",
292
  "score": 7.25
 
306
  "model_name": "gemini-1.5-flash",
307
  "results": {
308
  "mmlu_results": [
309
+ {
310
+ "category": "Average",
311
+ "score": 0.5592
312
+ },
313
  {
314
  "category": "Biology",
315
  "score": 0.75
 
368
  }
369
  ],
370
  "unified_exam_results": [
371
+ {
372
+ "category": "Average",
373
+ "score": 7.8333
374
+ },
375
  {
376
  "category": "Armenian language and literature",
377
  "score": 4.75
 
391
  "model_name": "DeepSeek-V3",
392
  "results": {
393
  "mmlu_results": [
394
+ {
395
+ "category": "Average",
396
+ "score": 0.6633
397
+ },
398
  {
399
  "category": "Biology",
400
  "score": 0.8167
 
453
  }
454
  ],
455
  "unified_exam_results": [
456
+ {
457
+ "category": "Average",
458
+ "score": 7.5
459
+ },
460
  {
461
  "category": "Armenian language and literature",
462
  "score": 5.25
 
476
  "model_name": "Meta-Llama-3.3-70B-Instruct",
477
  "results": {
478
  "mmlu_results": [
479
+ {
480
+ "category": "Average",
481
+ "score": 0.5139
482
+ },
483
  {
484
  "category": "Biology",
485
  "score": 0.7333
 
538
  }
539
  ],
540
  "unified_exam_results": [
541
+ {
542
+ "category": "Average",
543
+ "score": 7.0833
544
+ },
545
  {
546
  "category": "Armenian language and literature",
547
  "score": 4.5
 
561
  "model_name": "claude-3-5-haiku-20241022",
562
  "results": {
563
  "mmlu_results": [
564
+ {
565
+ "category": "Average",
566
+ "score": 0.5198
567
+ },
568
  {
569
  "category": "Biology",
570
  "score": 0.75
 
623
  }
624
  ],
625
  "unified_exam_results": [
626
+ {
627
+ "category": "Average",
628
+ "score": 6.5
629
+ },
630
  {
631
  "category": "Armenian language and literature",
632
  "score": 5.0
data/required_categories.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ required_unified_exam_categories = [
2
+ "Average",
3
+ "Armenian language and literature",
4
+ "Armenian history",
5
+ "Mathematics"
6
+ ]
7
+
8
+ required_mmlu_categories = [
9
+ "Biology",
10
+ "Business",
11
+ "Chemistry",
12
+ "Computer Science",
13
+ "Economics",
14
+ "Engineering",
15
+ "Health",
16
+ "History",
17
+ "Law",
18
+ "Math",
19
+ "Other",
20
+ "Philosophy",
21
+ "Physics",
22
+ "Psychology",
23
+ "Average"
24
+ ]