Spaces:
Running
Running
commit
Browse files- app.py +26 -26
- data_handler.py +1 -0
app.py
CHANGED
@@ -21,36 +21,36 @@ def refresh_data():
|
|
21 |
return global_output_armenian, global_output_mmlu, unified_exam_chart(global_output_armenian, 'Average'), mmlu_chart(global_output_mmlu, 'Average')
|
22 |
|
23 |
def main():
|
24 |
-
global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
|
25 |
-
model_handler = ModelHandler()
|
26 |
-
global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
|
27 |
|
28 |
-
global_output_armenian = unified_exam_result_table(global_unified_exam_df)
|
29 |
-
global_output_mmlu = mmlu_result_table(global_mmlu_df)
|
30 |
|
31 |
with gr.Blocks() as app:
|
32 |
with gr.Tabs():
|
33 |
-
with gr.TabItem("Armenian Unified Exams"):
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
with gr.TabItem("MMLU-Pro-Hy"):
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
with gr.TabItem("About"):
|
55 |
gr.Markdown("# About the Benchmark")
|
56 |
gr.Markdown(
|
|
|
21 |
return global_output_armenian, global_output_mmlu, unified_exam_chart(global_output_armenian, 'Average'), mmlu_chart(global_output_mmlu, 'Average')
|
22 |
|
23 |
def main():
|
24 |
+
# global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
|
25 |
+
# model_handler = ModelHandler()
|
26 |
+
# global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
|
27 |
|
28 |
+
# global_output_armenian = unified_exam_result_table(global_unified_exam_df)
|
29 |
+
# global_output_mmlu = mmlu_result_table(global_mmlu_df)
|
30 |
|
31 |
with gr.Blocks() as app:
|
32 |
with gr.Tabs():
|
33 |
+
# with gr.TabItem("Armenian Unified Exams"):
|
34 |
+
# gr.Markdown("# Armenian Unified Test Exams")
|
35 |
+
# gr.Markdown(
|
36 |
+
# """
|
37 |
+
# This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
|
38 |
+
# """
|
39 |
+
# )
|
40 |
+
# table_output_armenian = gr.DataFrame(value=global_output_armenian)
|
41 |
+
# plot_column_dropdown_unified_exam = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
|
42 |
+
# plot_output_armenian = gr.Plot(lambda column: unified_exam_chart(global_output_armenian, column), inputs=plot_column_dropdown_unified_exam)
|
43 |
+
# with gr.TabItem("MMLU-Pro-Hy"):
|
44 |
+
# gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
|
45 |
+
# gr.Markdown(
|
46 |
+
# """
|
47 |
+
# This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
|
48 |
+
# """
|
49 |
+
# )
|
50 |
+
# table_output_mmlu = gr.DataFrame(value=global_output_mmlu)
|
51 |
+
# subject_cols = ['Average','Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology','Other']
|
52 |
+
# plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
|
53 |
+
# plot_output_mmlu = gr.Plot(lambda column: mmlu_chart(global_output_mmlu, column), inputs=plot_column_dropdown_mmlu)
|
54 |
with gr.TabItem("About"):
|
55 |
gr.Markdown("# About the Benchmark")
|
56 |
gr.Markdown(
|
data_handler.py
CHANGED
@@ -23,6 +23,7 @@ def mmlu_result_table(mmlu_df):
|
|
23 |
df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
|
24 |
df.insert(0, 'Rank', range(1, len(df) + 1))
|
25 |
cols = df.columns.tolist()
|
|
|
26 |
cols.insert(2, cols.pop(cols.index('Average')))
|
27 |
cols.append(cols.pop(cols.index('Other')))
|
28 |
df = df[cols]
|
|
|
23 |
df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
|
24 |
df.insert(0, 'Rank', range(1, len(df) + 1))
|
25 |
cols = df.columns.tolist()
|
26 |
+
print(cols)
|
27 |
cols.insert(2, cols.pop(cols.index('Average')))
|
28 |
cols.append(cols.pop(cols.index('Other')))
|
29 |
df = df[cols]
|