Bagratuni committed on
Commit e82f5d4 · 1 parent: 0d6ab83
Files changed (2)
  1. app.py +26 -26
  2. data_handler.py +1 -0
app.py CHANGED
@@ -21,36 +21,36 @@ def refresh_data():
     return global_output_armenian, global_output_mmlu, unified_exam_chart(global_output_armenian, 'Average'), mmlu_chart(global_output_mmlu, 'Average')
 
 def main():
-    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
-    model_handler = ModelHandler()
-    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
+    # global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
+    # model_handler = ModelHandler()
+    # global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
 
-    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
-    global_output_mmlu = mmlu_result_table(global_mmlu_df)
+    # global_output_armenian = unified_exam_result_table(global_unified_exam_df)
+    # global_output_mmlu = mmlu_result_table(global_mmlu_df)
 
     with gr.Blocks() as app:
         with gr.Tabs():
-            with gr.TabItem("Armenian Unified Exams"):
-                gr.Markdown("# Armenian Unified Test Exams")
-                gr.Markdown(
-                    """
-                    This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
-                    """
-                )
-                table_output_armenian = gr.DataFrame(value=global_output_armenian)
-                plot_column_dropdown_unified_exam = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
-                plot_output_armenian = gr.Plot(lambda column: unified_exam_chart(global_output_armenian, column), inputs=plot_column_dropdown_unified_exam)
-            with gr.TabItem("MMLU-Pro-Hy"):
-                gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
-                gr.Markdown(
-                    """
-                    This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
-                    """
-                )
-                table_output_mmlu = gr.DataFrame(value=global_output_mmlu)
-                subject_cols = ['Average','Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology','Other']
-                plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
-                plot_output_mmlu = gr.Plot(lambda column: mmlu_chart(global_output_mmlu, column), inputs=plot_column_dropdown_mmlu)
+            # with gr.TabItem("Armenian Unified Exams"):
+            #     gr.Markdown("# Armenian Unified Test Exams")
+            #     gr.Markdown(
+            #         """
+            #         This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
+            #         """
+            #     )
+            #     table_output_armenian = gr.DataFrame(value=global_output_armenian)
+            #     plot_column_dropdown_unified_exam = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
+            #     plot_output_armenian = gr.Plot(lambda column: unified_exam_chart(global_output_armenian, column), inputs=plot_column_dropdown_unified_exam)
+            # with gr.TabItem("MMLU-Pro-Hy"):
+            #     gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
+            #     gr.Markdown(
+            #         """
+            #         This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
+            #         """
+            #     )
+            #     table_output_mmlu = gr.DataFrame(value=global_output_mmlu)
+            #     subject_cols = ['Average','Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology','Other']
+            #     plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
+            #     plot_output_mmlu = gr.Plot(lambda column: mmlu_chart(global_output_mmlu, column), inputs=plot_column_dropdown_mmlu)
             with gr.TabItem("About"):
                 gr.Markdown("# About the Benchmark")
                 gr.Markdown(
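The commented-out tabs all follow the same wiring: a static gr.DataFrame, a gr.Dropdown of score columns, and a gr.Plot re-rendered from the selected dropdown value. A minimal self-contained sketch of that pattern, written with an explicit .change event rather than the callable-value form used in the diff; the two-model dataset and make_chart helper here are illustrative, not the Space's real data or functions:

# Minimal sketch of the dropdown-driven plot pattern in app.py.
# The dataset and helper below are hypothetical, for illustration only.
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({'Model': ['A', 'B'], 'Average': [17.5, 12.0], 'Mathematics': [16.0, 9.5]})

def make_chart(column):
    # Bar chart of the selected score column per model.
    fig, ax = plt.subplots()
    ax.bar(df['Model'], df[column])
    ax.set_ylabel(column)
    return fig

with gr.Blocks() as app:
    gr.DataFrame(value=df)
    dropdown = gr.Dropdown(choices=['Average', 'Mathematics'], value='Average', label='Select Column to Plot')
    plot = gr.Plot(make_chart('Average'))
    # Re-render the plot whenever the dropdown selection changes.
    dropdown.change(make_chart, inputs=dropdown, outputs=plot)

app.launch()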
data_handler.py CHANGED
@@ -23,6 +23,7 @@ def mmlu_result_table(mmlu_df):
     df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
     df.insert(0, 'Rank', range(1, len(df) + 1))
     cols = df.columns.tolist()
+    print(cols)
     cols.insert(2, cols.pop(cols.index('Average')))
     cols.append(cols.pop(cols.index('Other')))
     df = df[cols]
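The reordering idiom around the new print(cols) moves 'Average' to index 2 (right after 'Rank' and the model-name column) and pushes 'Other' to the end. A standalone trace of the same list manipulation, with a hypothetical column list:

# Hypothetical column list; the real one comes from df.columns.tolist().
cols = ['Rank', 'Model', 'Biology', 'Average', 'Other', 'Physics']

# Pop 'Average' from wherever it sits and re-insert it at index 2.
cols.insert(2, cols.pop(cols.index('Average')))
# Pop 'Other' and append it so it lands last.
cols.append(cols.pop(cols.index('Other')))

print(cols)  # ['Rank', 'Model', 'Average', 'Biology', 'Physics', 'Other']

Note that cols.pop(cols.index(...)) raises ValueError if the named column is missing, which the added print(cols) helps diagnose.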