Bagratuni committed on
Commit e82f5d4 · 1 parent: 0d6ab83
Files changed (2)
  1. app.py +26 -26
  2. data_handler.py +1 -0
app.py CHANGED
@@ -21,36 +21,36 @@ def refresh_data():
     return global_output_armenian, global_output_mmlu, unified_exam_chart(global_output_armenian, 'Average'), mmlu_chart(global_output_mmlu, 'Average')
 
 def main():
-    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
-    model_handler = ModelHandler()
-    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
+    # global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
+    # model_handler = ModelHandler()
+    # global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
 
-    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
-    global_output_mmlu = mmlu_result_table(global_mmlu_df)
+    # global_output_armenian = unified_exam_result_table(global_unified_exam_df)
+    # global_output_mmlu = mmlu_result_table(global_mmlu_df)
 
     with gr.Blocks() as app:
         with gr.Tabs():
-            with gr.TabItem("Armenian Unified Exams"):
-                gr.Markdown("# Armenian Unified Test Exams")
-                gr.Markdown(
-                    """
-                    This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
-                    """
-                )
-                table_output_armenian = gr.DataFrame(value=global_output_armenian)
-                plot_column_dropdown_unified_exam = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
-                plot_output_armenian = gr.Plot(lambda column: unified_exam_chart(global_output_armenian, column), inputs=plot_column_dropdown_unified_exam)
-            with gr.TabItem("MMLU-Pro-Hy"):
-                gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
-                gr.Markdown(
-                    """
-                    This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
-                    """
-                )
-                table_output_mmlu = gr.DataFrame(value=global_output_mmlu)
-                subject_cols = ['Average','Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology','Other']
-                plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
-                plot_output_mmlu = gr.Plot(lambda column: mmlu_chart(global_output_mmlu, column), inputs=plot_column_dropdown_mmlu)
+            # with gr.TabItem("Armenian Unified Exams"):
+            #     gr.Markdown("# Armenian Unified Test Exams")
+            #     gr.Markdown(
+            #         """
+            #         This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
+            #         """
+            #     )
+            #     table_output_armenian = gr.DataFrame(value=global_output_armenian)
+            #     plot_column_dropdown_unified_exam = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot')
+            #     plot_output_armenian = gr.Plot(lambda column: unified_exam_chart(global_output_armenian, column), inputs=plot_column_dropdown_unified_exam)
+            # with gr.TabItem("MMLU-Pro-Hy"):
+            #     gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
+            #     gr.Markdown(
+            #         """
+            #         This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
+            #         """
+            #     )
+            #     table_output_mmlu = gr.DataFrame(value=global_output_mmlu)
+            #     subject_cols = ['Average','Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology','Other']
+            #     plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
+            #     plot_output_mmlu = gr.Plot(lambda column: mmlu_chart(global_output_mmlu, column), inputs=plot_column_dropdown_mmlu)
             with gr.TabItem("About"):
                 gr.Markdown("# About the Benchmark")
                 gr.Markdown(
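The commented-out tabs all follow the same wiring: a static gr.DataFrame, a gr.Dropdown of score columns, and a gr.Plot re-rendered from the selected dropdown value. A minimal self-contained sketch of that pattern, written with an explicit .change event rather than the callable-value form used in the diff; the two-model dataset and make_chart helper here are illustrative, not the Space's real data or functions:

# Minimal sketch of the dropdown-driven plot pattern in app.py.
# The dataset and helper below are hypothetical, for illustration only.
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({'Model': ['A', 'B'], 'Average': [17.5, 12.0], 'Mathematics': [16.0, 9.5]})

def make_chart(column):
    # Bar chart of the selected score column per model.
    fig, ax = plt.subplots()
    ax.bar(df['Model'], df[column])
    ax.set_ylabel(column)
    return fig

with gr.Blocks() as app:
    gr.DataFrame(value=df)
    dropdown = gr.Dropdown(choices=['Average', 'Mathematics'], value='Average', label='Select Column to Plot')
    plot = gr.Plot(make_chart('Average'))
    # Re-render the plot whenever the dropdown selection changes.
    dropdown.change(make_chart, inputs=dropdown, outputs=plot)

app.launch()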
data_handler.py CHANGED
@@ -23,6 +23,7 @@ def mmlu_result_table(mmlu_df):
     df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
     df.insert(0, 'Rank', range(1, len(df) + 1))
     cols = df.columns.tolist()
+    print(cols)
     cols.insert(2, cols.pop(cols.index('Average')))
     cols.append(cols.pop(cols.index('Other')))
     df = df[cols]
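The reordering idiom around the new print(cols) moves 'Average' to index 2 (right after 'Rank' and the model-name column) and pushes 'Other' to the end. A standalone trace of the same list manipulation, with a hypothetical column list:

# Hypothetical column list; the real one comes from df.columns.tolist().
cols = ['Rank', 'Model', 'Biology', 'Average', 'Other', 'Physics']

# Pop 'Average' from wherever it sits and re-insert it at index 2.
cols.insert(2, cols.pop(cols.index('Average')))
# Pop 'Other' and append it so it lands last.
cols.append(cols.pop(cols.index('Other')))

print(cols)  # ['Rank', 'Model', 'Average', 'Biology', 'Physics', 'Other']

Note that cols.pop(cols.index(...)) raises ValueError if the named column is missing, which the added print(cols) helps diagnose.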