daniel7an committed
Commit 4781b83 · 1 Parent(s): d21b14f
app.py ADDED
@@ -0,0 +1,122 @@
+ import gradio as gr
+ import pandas as pd
+ import plotly.express as px
+
+ def display_table(exam_type):
+     if exam_type == "Armenian Exams":
+         df = pd.read_csv('unified_exam_results.csv')
+         df = df.sort_values(by='Average score', ascending=False)
+         cols = df.columns.tolist()
+         cols.insert(1, cols.pop(cols.index('Average score')))
+         df = df[cols]
+     elif exam_type == "MMLU-Pro-Hy":
+         df = pd.read_csv('mmlu_pro_hy_results.csv')
+         df = df.sort_values(by='Accuracy', ascending=False)
+     return df
+
+ def create_bar_chart(exam_type, plot_column):
+     if exam_type == "Armenian Exams":
+         df = pd.read_csv('unified_exam_results.csv')
+         df = df.sort_values(by='Average score', ascending=False)
+         df = df.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
+
+         x_col = plot_column
+         title = f'{plot_column} per Model'
+         if plot_column == 'Average score':
+             range_max = 20
+             x_range_max = 20
+         else:
+             range_max = 20
+             x_range_max = 20
+         def get_label(score):
+             if score < 8:
+                 return "Fail"
+             elif 8 <= score <= 18:
+                 return "Pass"
+             else:
+                 return "Distinction"
+         df['Test Result'] = df[plot_column].apply(get_label)
+
+         if plot_column in ['Average score', 'Accuracy']:
+             fig = px.bar(df,
+                          x=x_col,
+                          y='Model',
+                          color=x_col,
+                          color_continuous_scale='tealrose_r',
+                          labels={x_col: plot_column, 'Model': 'Model'},
+                          title=title,
+                          orientation='h',
+                          range_color=[0, range_max])
+         else:
+             color_discrete_map = {
+                 "Fail": "#d15d80",
+                 "Pass": "#edd8be",
+                 "Distinction": "#059492"
+             }
+             fig = px.bar(df,
+                          x=x_col,
+                          y='Model',
+                          color=df['Test Result'],
+                          color_discrete_map=color_discrete_map,
+                          labels={x_col: plot_column, 'Model': 'Model'},
+                          title=title,
+                          orientation='h')
+
+         fig.update_layout(
+             xaxis=dict(range=[0, x_range_max]),
+             title=dict(text=title, font=dict(size=16)),
+             xaxis_title=dict(font=dict(size=12)),
+             yaxis_title=dict(font=dict(size=12)),
+             yaxis=dict(autorange="reversed")
+         )
+
+         return fig
+
+     elif exam_type == "MMLU-Pro-Hy":
+         df = pd.read_csv('mmlu_pro_hy_results.csv')
+         df = df.sort_values(by='Accuracy', ascending=False)
+         x_col = 'Accuracy'
+         title = 'Accuracy per Model (MMLU-Pro-Hy)'
+         range_max = 1.0
+         x_range_max = 1.0
+         if plot_column != 'Accuracy':
+             def get_label(accuracy):
+                 if accuracy < 0.5:
+                     return "Low"
+                 elif 0.5 <= accuracy <= 0.8:
+                     return "Medium"
+                 else:
+                     return "High"
+             df['Test Result'] = df['Accuracy'].apply(get_label)
+
+         fig = px.bar(df,
+                      x=x_col,
+                      y='Model',
+                      color=x_col,
+                      color_continuous_scale='tealrose_r',
+                      labels={x_col: plot_column, 'Model': 'Model'},
+                      title=title,
+                      orientation='h',
+                      range_color=[0, range_max])
+
+         fig.update_layout(
+             xaxis=dict(range=[0, x_range_max]),
+             title=dict(text=title, font=dict(size=16)),
+             xaxis_title=dict(font=dict(size=12)),
+             yaxis_title=dict(font=dict(size=12)),
+             yaxis=dict(autorange="reversed")
+         )
+
+         return fig
+
+ with gr.Blocks() as app:
+     with gr.Tabs():
+         with gr.TabItem("Armenian Unified Exams"):
+             table_output_armenian = gr.DataFrame(value=lambda: display_table("Armenian Exams"))
+             plot_column_dropdown = gr.Dropdown(choices=['Average score', 'Armenian language exam score', 'Armenian history exam score', 'Mathematics exam score'], value='Average score', label='Select Column to Plot')
+             plot_output_armenian = gr.Plot(lambda column: create_bar_chart("Armenian Exams", column), inputs=plot_column_dropdown)
+         with gr.TabItem("MMLU-Pro-Hy"):
+             table_output_mmlu = gr.DataFrame(value=lambda: display_table("MMLU-Pro-Hy"))
+             plot_output_mmlu = gr.Plot(lambda: create_bar_chart("MMLU-Pro-Hy", 'Accuracy'))
+
+ app.launch(share=True)
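
Note on the wiring in app.py: the Armenian-exams plot is built by handing gr.Plot a callable together with inputs=plot_column_dropdown, so the figure is recomputed whenever the dropdown value changes. Below is a minimal sketch of the equivalent behaviour using an explicit .change() event listener instead; it is illustrative only, not part of the commit, and assumes create_bar_chart from app.py above is in scope.

import gradio as gr

# Illustrative sketch (not part of the commit): dropdown-driven plot wired
# with an explicit .change() listener rather than a dynamic component value.
with gr.Blocks() as demo:
    dropdown = gr.Dropdown(
        choices=['Average score', 'Armenian language exam score',
                 'Armenian history exam score', 'Mathematics exam score'],
        value='Average score',
        label='Select Column to Plot',
    )
    plot = gr.Plot(value=create_bar_chart("Armenian Exams", "Average score"))
    # Re-render the bar chart whenever a different column is selected.
    dropdown.change(
        fn=lambda col: create_bar_chart("Armenian Exams", col),
        inputs=dropdown,
        outputs=plot,
    )

demo.launch()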
mmlu_pro_hy_results.csv ADDED
@@ -0,0 +1,5 @@
+ Model,Accuracy
+ claude-3-5-haiku-20241022,0.526
+ claude-3-5-sonnet-20241022,0.701
+ gemini-2.0-flash,0.741
+ gemini-1.5-flash,0.586
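
For reference, the leaderboard ordering for this file in display_table is a single descending sort on Accuracy, which places gemini-2.0-flash (0.741) first and claude-3-5-haiku-20241022 (0.526) last. A small illustrative sketch, not part of the commit:

import pandas as pd

# Sketch: reproduce the MMLU-Pro-Hy table ordering used by display_table().
df = pd.read_csv('mmlu_pro_hy_results.csv')
print(df.sort_values(by='Accuracy', ascending=False).to_string(index=False))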
benchmark_results.csv → unified_exam_results.csv RENAMED
@@ -1,10 +1,10 @@
- model,armenian_language_score,armenian_history_score,mathematics_score,average_score
+ Model,Armenian language exam score,Armenian history exam score,Mathematics exam score,Average score
  claude-3-7-sonnet-20250219,10.5,7.75,15.0,11.08
  claude-3-5-sonnet-20241022,10.0,9.25,12.75,10.67
  gemini-2.0-flash,5.5,6.75,17.25,9.83
  gpt-4o,6.75,6.75,13.25,8.92
  qwen-max-2025-01-25,7.25,4.5,14.25,8.67
  gemini-1.5-flash,4.75,3.75,15.0,7.83
- deepseek-ai/DeepSeek-V3,5.25,5.0,12.25,7.5
+ DeepSeek-V3,5.25,5.0,12.25,7.5
  Meta-Llama-3.3-70B-Instruct,4.5,5.25,11.5,7.08
  claude-3-5-haiku-20241022,5.0,3.75,10.75,6.5
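
The renamed headers (Model, the three "... exam score" columns, and Average score) are the names app.py relies on: they appear verbatim in the dropdown choices and in the sort keys of display_table and create_bar_chart. A small illustrative sketch, not part of the commit, that cross-checks the stored Average score against the mean of the three exam columns:

import pandas as pd

# Sketch: recompute the average of the three exam scores and compare it
# with the "Average score" column of unified_exam_results.csv.
df = pd.read_csv('unified_exam_results.csv')
exam_cols = ['Armenian language exam score',
             'Armenian history exam score',
             'Mathematics exam score']
recomputed = df[exam_cols].mean(axis=1).round(2)
print((recomputed - df['Average score']).abs().max())  # expected to be ~0.0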