Spaces:
Running
Running
File size: 8,342 Bytes
4781b83 a4d362f 4781b83 779cbde 4781b83 779cbde 2f6fff2 6b4ef20 779cbde cabb2f4 4781b83 cabb2f4 2f6fff2 779cbde cabb2f4 2f6fff2 779cbde f4b3542 779cbde f4b3542 779cbde f4b3542 779cbde 6b4ef20 f4b3542 1007960 779cbde deb84cd 1e273fd 779cbde 1e273fd f4b3542 1e273fd 63762be 779cbde a2c56f4 9c3791b dbd743c a2c56f4 f4b3542 779cbde f4b3542 779cbde cabb2f4 6b4ef20 cabb2f4 6b4ef20 779cbde |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import gradio as gr
import pandas as pd
import plotly.express as px
from model_handler import ModelHandler
from data_handler import unified_exam_result_table, mmlu_result_table, unified_exam_chart, mmlu_chart
# Module-level cache of the benchmark data. Everything starts out as None;
# main() fills these in at startup and refresh_data() replaces them later.
global_unified_exam_df = global_mmlu_df = None  # values returned by get_arm_bench_data()
global_output_armenian = global_output_mmlu = None  # result tables built from the values above
def refresh_data():
    """Reload the benchmark data and rebuild both tables and both charts.

    A new ``ModelHandler`` is created on every call, the module-level cache
    variables are overwritten with the freshly loaded data, and both charts
    are rebuilt for the ``'Average'`` column.

    Returns:
        A 6-tuple: the unified-exam table, the MMLU table, the unified-exam
        chart, the MMLU chart, and the string ``'Average'`` twice. The order
        matches the ``outputs`` list wired to the "Refresh Data" button in
        ``main()``, so the last two items reset the two column dropdowns.
    """
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu

    default_column = 'Average'

    handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = handler.get_arm_bench_data()

    # Rebuild the cached tables first; the charts below are drawn from them.
    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)

    return (
        global_output_armenian,
        global_output_mmlu,
        unified_exam_chart(global_output_armenian, default_column),
        mmlu_chart(global_output_mmlu, default_column),
        default_column,
        default_column,
    )
def _build_unified_exam_tab():
    """Build the "Armenian Unified Exams" tab.

    Called by ``main()`` inside its ``gr.Tabs()`` context, after the
    module-level result tables have been populated.

    Returns:
        A ``(table, column_dropdown, plot)`` tuple of the Gradio components
        that the "Refresh Data" button needs to update.
    """
    with gr.TabItem("Armenian Unified Exams"):
        gr.Markdown("# Armenian Unified Test Exams")
        gr.Markdown(
            """
            This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
            """
        )
        table = gr.DataFrame(value=global_output_armenian)
        column_dropdown = gr.Dropdown(
            choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'],
            value='Average',
            label='Select Column to Plot',
        )
        # The lambda reads the module-level table when it is called, not when
        # it is defined, so it uses whatever refresh_data() stored last.
        plot = gr.Plot(
            lambda column: unified_exam_chart(global_output_armenian, column),
            inputs=column_dropdown,
        )
    return table, column_dropdown, plot


def _build_mmlu_tab():
    """Build the "MMLU-Pro-Hy" tab.

    Called by ``main()`` inside its ``gr.Tabs()`` context, after the
    module-level result tables have been populated.

    Returns:
        A ``(table, column_dropdown, plot)`` tuple of the Gradio components
        that the "Refresh Data" button needs to update.
    """
    with gr.TabItem("MMLU-Pro-Hy"):
        gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
        gr.Markdown(
            """
            This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
            """
        )
        table = gr.DataFrame(value=global_output_mmlu)
        subject_cols = ['Average','Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics', 'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics', 'Psychology','Other']
        column_dropdown = gr.Dropdown(choices=subject_cols, value='Average', label='Select Column to Plot')
        # As in the unified-exam tab, the table is looked up at call time so
        # the chart uses the data most recently stored by refresh_data().
        plot = gr.Plot(
            lambda column: mmlu_chart(global_output_mmlu, column),
            inputs=column_dropdown,
        )
    return table, column_dropdown, plot


def _build_about_tab():
    """Build the static "About" tab (overview, datasets, submission guide, credits).

    Called by ``main()`` inside its ``gr.Tabs()`` context. The tab holds no
    component that the refresh button updates, so nothing is returned.
    """
    with gr.TabItem("About"):
        gr.Markdown("# Overview")
        gr.Markdown(
            """
            This benchmark is developed and maintained by [Metric](https://metric.am/). It evaluates the capabilities of Large Language Models on Armenian-specific tasks, including Armenian Unified Test Exams and a translated version of the MMLU-Pro benchmark (MMLU-Pro-Hy). It is designed to measure the models' understanding and generation capabilities in the Armenian language.
            """
        )
        gr.Markdown("# Dataset")
        gr.Markdown("""
            - [Armenian Unified Exams](https://dimord.am/public/tests): collection of High School graduation test exams used in 2025 in Armenia. The highest achievable score per test is 20. The data is extracted from PDFs and manually prepared for LLM evaluation.
            - MMLU-Pro-Hy: a massive multi-task test in MCQA format, inspired by the original [MMLU benchmark](https://arxiv.org/abs/2406.01574), adapted for the Armenian language. Currently, a stratified sample is used for evaluation summing up to 1000 questions in total. The Armenian version is generated through machine-translation. Resulting dataset underwent extensive post-processing to ensure high quality subsample is selected for evaluation.
            """
        )
        gr.Markdown("## Submission Guide")
        # Sub-bullets and the JSON sample are indented relative to their
        # numbered step so Markdown renders them nested under that step.
        gr.Markdown(
            """
            To submit a model for evaluation, please follow these steps:
            1. **Evaluate your model**:
                - Follow the evaluation script provided here: [https://github.com/Metricam/ArmBench-LLM](https://github.com/Metricam/ArmBench-LLM)
                - For more details about the evaluation and submission process, read the README in the ArmBench-LLM GitHub repository.
            2. **Format your submission file**:
                - After evaluation, you will get a `results.json` file. Ensure the file follows this format:
                ```json
                {
                    "mmlu_results": [
                        {
                            "category": "category_name",
                            "score": score_value
                        },
                        ...
                    ],
                    "unified_exam_results": [
                        {
                            "category": "category_name",
                            "score": score_value
                        },
                        ...
                    ]
                }
                ```
            3. **Submit your model**:
                - Add the `ArmBench-LLM` tag and the `results.json` file to your model card.
                - Click on the "Refresh Data" button in this app, and you will see your model's results.
            """
        )
        with gr.Row():
            with gr.Column():
                gr.Markdown("## Contributing")
                gr.Markdown(
                    """
                    You can contribute to this benchmark in several ways:
                    - Provide API credits for evaluating additional API-based models.
                    - Citing our work in your research and publications.
                    - Contributing to the development of the benchmark itself with data or with evaluation results.
                    """
                )
            with gr.Column():
                # NOTE(review): the empty Markdown blocks presumably act as
                # vertical spacers that push the logo down — confirm.
                gr.Markdown("")
                gr.Markdown("")
                gr.Markdown("")
                gr.Markdown("")
                gr.Image("logo.png", width=200, show_label=False, show_download_button=False, show_fullscreen_button=False, show_share_button=False)
        gr.Markdown("# About Metric")
        # NOTE(review): "[email protected]" looks like an obfuscated
        # placeholder rather than a real address — confirm the contact e-mail.
        gr.Markdown(
            """
            [Metric](https://metric.am/) is an AI Research Lab based in Yerevan, Armenia. It is specialized in training custom embedding and generation models for use cases such as Document AI or low-represented languages. If you are interested in our research or advisory services, drop an email to [email protected].
            """
        )


def main():
    """Load the benchmark results, build the Gradio UI and launch the app.

    Populates the module-level cache variables, assembles a ``gr.Blocks`` app
    with three tabs (unified exams, MMLU-Pro-Hy, About) plus a "Refresh Data"
    button wired to ``refresh_data``, and then calls ``app.launch``.
    """
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu

    model_handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)

    with gr.Blocks() as app:
        with gr.Tabs():
            table_output_armenian, plot_column_dropdown_unified_exam, plot_output_armenian = _build_unified_exam_tab()
            table_output_mmlu, plot_column_dropdown_mmlu, plot_output_mmlu = _build_mmlu_tab()
            _build_about_tab()

        # NOTE(review): placed outside gr.Tabs() so the button would show on
        # every tab; the original nesting is assumed — confirm.
        refresh_button = gr.Button("Refresh Data")
        # The outputs order must match the tuple returned by refresh_data():
        # two tables, two charts, then the two dropdown values.
        refresh_button.click(
            fn=refresh_data,
            outputs=[
                table_output_armenian,
                table_output_mmlu,
                plot_output_armenian,
                plot_output_mmlu,
                plot_column_dropdown_unified_exam,
                plot_column_dropdown_mmlu,
            ],
        )

    app.launch(share=True, debug=True)
# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()