import gradio as gr
import pandas as pd
import plotly.express as px

from model_handler import ModelHandler
from data_handler import unified_exam_result_table, mmlu_result_table, unified_exam_chart, mmlu_chart

# Module-level caches for the latest benchmark data and the tables built from it.
global_unified_exam_df = None
global_mmlu_df = None
global_output_armenian = None
global_output_mmlu = None


def refresh_data():
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
    model_handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)
    # One value per output wired to the "Refresh Data" button: both tables,
    # then both charts reset to the default 'Average' column.
    return (global_output_armenian, global_output_mmlu,
            unified_exam_chart(global_output_armenian, 'Average'),
            mmlu_chart(global_output_mmlu, 'Average'))


def main():
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu
    model_handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)

    with gr.Blocks() as app:
        with gr.Tabs():
            with gr.TabItem("Armenian Unified Exams"):
                gr.Markdown("# Armenian Unified Test Exams")
                gr.Markdown(
                    """
                    This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history, and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
""" | |
) | |
table_output_armenian = gr.DataFrame(value=global_output_armenian) | |
plot_column_dropdown_unified_exam = gr.Dropdown(choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'], value='Average', label='Select Column to Plot') | |
plot_output_armenian = gr.Plot(lambda column: unified_exam_chart(global_output_armenian, column), inputs=plot_column_dropdown_unified_exam) | |
with gr.TabItem("MMLU-Pro-Hy"): | |
gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)") | |
gr.Markdown( | |
""" | |
                    This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
""" | |
) | |
table_output_mmlu = gr.DataFrame(value=global_output_mmlu) | |
subject_cols = ['Average','Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology','Other'] | |
plot_column_dropdown_mmlu = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot') | |
plot_output_mmlu = gr.Plot(lambda column: mmlu_chart(global_output_mmlu, column), inputs=plot_column_dropdown_mmlu) | |
with gr.TabItem("About"): | |
gr.Markdown("# About the Benchmark") | |
gr.Markdown( | |
""" | |
This benchmark evaluates Language Models on Armenian-specific tasks, including Armenian Unified Test Exams and a translated version of the MMLU-Pro benchmark (MMLU-Pro-Hy). It is designed to measure the models' understanding and generation capabilities in the Armenian language. | |
**Creator Company:** Metric AI Research Lab, Yerevan, Armenia.""" | |
) | |
gr.Image("logo.png", width=200, show_label=False, show_download_button=False, show_fullscreen_button=False, show_share_button=False) | |
gr.Markdown(""" | |
- [Website](https://metric.am/) | |
- [Hugging Face](https://huggingface.co./Metric-AI) | |
MMLU-Pro-Hy is a massive multi-task test in MCQA format, inspired by the original MMLU benchmark, adapted for the Armenian language. The Armenian Unified Exams benchmark allows for comparison with human-level knowledge. | |
""" | |
) | |
gr.Markdown("## Submission Guide") | |
gr.Markdown( | |
""" | |
To submit a model for evaluation, please follow these steps: | |
1. **Evaluate your model**: | |
- Follow the evaluation script provided here: [https://github.com/Anania-AI/Arm-LLM-Benchmark](https://github.com/Anania-AI/Arm-LLM-Benchmark) | |
2. **Format your submission file**: | |
- After evaluation, you will get a `result.json` file. Ensure the file follows this format: | |
```json | |
{ | |
"mmlu_results": [ | |
{ | |
"category": "category_name", | |
"score": score_value | |
}, | |
... | |
], | |
"unified_exam_results": [ | |
{ | |
"category": "category_name", | |
"score": score_value | |
}, | |
... | |
] | |
} | |
``` | |
3. **Submit your model**: | |
                       - Add the `arm_bench` tag and the `result.json` file to your model card; an example of the model card metadata is shown below.
                       - Click on the "Refresh Data" button in this app, and you will see your model's results.
""" | |
) | |
gr.Markdown("## Contributing") | |
gr.Markdown( | |
""" | |
You can contribute to this benchmark in several ways: | |
- Providing API credits for evaluating API-based models. | |
- Citing our work in your research and publications. | |
- Contributing to the development of the benchmark itself. | |
""" | |
) | |
refresh_button = gr.Button("Refresh Data") | |
refresh_button.click( | |
fn=refresh_data, | |
outputs=[table_output_armenian, | |
table_output_mmlu, | |
plot_output_armenian, | |
plot_output_mmlu | |
], | |
) | |

    app.launch(share=True, debug=True)
if __name__ == "__main__": | |
main() |
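

# ---------------------------------------------------------------------------
# Illustrative sketch only. model_handler.py is not shown in this file, so the
# helper below is an assumption about how results could be collected, based on
# the submission guide above (models tagged `arm_bench` that publish a
# `result.json`). The real ModelHandler.get_arm_bench_data() and the exact
# table layout expected by data_handler.py may differ.
def _example_fetch_results_from_hub():
    """Collect `result.json` files from Hub models tagged `arm_bench` (sketch)."""
    import json

    from huggingface_hub import HfApi, hf_hub_download

    mmlu_rows, unified_exam_rows = [], []
    for model in HfApi().list_models(filter="arm_bench"):
        try:
            path = hf_hub_download(repo_id=model.id, filename="result.json")
        except Exception:
            continue  # skip models that carry the tag but no result.json
        with open(path) as f:
            results = json.load(f)
        for entry in results.get("mmlu_results", []):
            mmlu_rows.append({"Model": model.id, "Category": entry["category"], "Score": entry["score"]})
        for entry in results.get("unified_exam_results", []):
            unified_exam_rows.append({"Model": model.id, "Category": entry["category"], "Score": entry["score"]})

    # Long-format DataFrames: one row per (model, category) pair.
    return pd.DataFrame(mmlu_rows), pd.DataFrame(unified_exam_rows)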