Spaces:
Running
Running
File size: 14,044 Bytes
01d59fd 0f65663 8e67ebe 4ade002 8e67ebe a67391c 55fc7f4 8e67ebe 69b4d58 8e67ebe d6ca95d 01d59fd 431078d 01d59fd 0f65663 01d59fd cd0858a 01d59fd d4a2bc9 01d59fd 2d8fd5c 8303012 01d59fd 431078d 8303012 01d59fd 8303012 01d59fd 8303012 01d59fd d4a2bc9 8303012 cd0858a 2a47787 d4a2bc9 01d59fd 0f65663 d4a2bc9 431078d e2fe86b d4a2bc9 2a47787 d4a2bc9 8e67ebe 0f65663 8e67ebe d6ca95d 8e67ebe 69b4d58 8e67ebe 0f65663 ce477d4 0d47bf5 e2fe86b 6863798 ce477d4 d0e8be9 ce477d4 d0e8be9 ce477d4 d0e8be9 ce477d4 01d59fd ce477d4 b2fa6ba 3f4d979 ce477d4 d0e8be9 01d59fd 266420a 01d59fd d4a2bc9 01d59fd f317a71 01d59fd f317a71 01d59fd f317a71 01d59fd f317a71 69b4d58 431078d 69b4d58 431078d e7ae6b8 e2fe86b e7ae6b8 bdcd265 e7ae6b8 bdcd265 dc516e7 e7ae6b8 dc516e7 01d59fd d6ca95d 01d59fd e7ae6b8 8fd12ea 01d59fd e7ae6b8 0a4d80c e7ae6b8 0a4d80c e7ae6b8 0a4d80c 34ecb22 e7ae6b8 8eaf913 e7ae6b8 8eaf913 e7ae6b8 431078d e348563 e7ae6b8 8eaf913 0f65663 8e67ebe d0e8be9 01d59fd 8e67ebe 8eaf913 1cf374d 8e67ebe d0e8be9 ce477d4 01d59fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 |
# Monkey-patch gradio_client's schema parser BEFORE gradio is imported:
# JSON Schema allows a bare boolean (`true` / `false`) as a schema, which the
# stock `_json_schema_to_python_type` cannot handle and crashes on.
import gradio_client.utils as gc_utils

# Keep a reference to the original so non-boolean schemas take the normal path.
_original_json_schema_to_python_type = gc_utils._json_schema_to_python_type

def patched_json_schema_to_python_type(schema, defs=None):
    """Delegate to the original parser, short-circuiting boolean schemas.

    Returns an empty dict for a bool schema (a neutral placeholder type);
    everything else is handled by the original implementation.
    """
    if isinstance(schema, bool):
        return {}
    return _original_json_schema_to_python_type(schema, defs)

# Install the patched version in place of the original.
gc_utils._json_schema_to_python_type = patched_json_schema_to_python_type
import logging
import os

# Create a local temp dir and point TMP_DIR at it before any library that
# honors the variable is imported/used.
os.makedirs("tmp", exist_ok=True)
os.environ['TMP_DIR'] = "tmp"

import subprocess
import shutil
import glob
import gradio as gr
import numpy as np
from apscheduler.schedulers.background import BackgroundScheduler
import json
from io import BytesIO

from src.radial.radial import create_plot
from gradio_leaderboard import Leaderboard, SelectColumns
from gradio_space_ci import enable_space_ci
from src.display.about import INTRODUCTION_TEXT, TITLE, LLM_BENCHMARKS_TEXT
from src.display.css_html_js import custom_css
from src.display.utils import AutoEvalColumn, fields
from src.envs import API, H4_TOKEN, HF_HOME, REPO_ID, RESET_JUDGEMENT_ENV
from src.leaderboard.build_leaderboard import build_leadearboard_df, download_openbench, download_dataset
import huggingface_hub

# Opt out of Gradio telemetry and configure root logging for the Space.
os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Activate the gradio_space_ci integration for this Space.
enable_space_ci()
def handle_file_upload(file_bytes):
    """Decode uploaded JSON bytes into a Python object.

    Parsing straight from the in-memory payload (``gr.File(type="binary")``)
    sidesteps ephemeral upload paths and on-disk read issues entirely.
    """
    logging.info("File uploaded (bytes). Size: %d bytes", len(file_bytes))
    return json.loads(file_bytes.decode("utf-8"))
def submit_file(v, mn):
    """Validate an uploaded result JSON and push it to the evaluation dataset.

    Parameters
    ----------
    v : dict | None
        Parsed JSON produced by ``handle_file_upload``. ``None`` when the
        submit button is clicked before any file was uploaded (``gr.State()``
        default), which previously raised ``TypeError`` on the ``in`` check.
    mn : str
        User-supplied model name; used to build the path inside the repo.

    Returns
    -------
    str
        Human-readable status message shown in the UI.
    """
    logging.info('START SUBMITTING!!!')
    # Guard against "submit before upload": gr.State() starts as None.
    if not isinstance(v, dict):
        return "No file uploaded yet: please upload a JSON file first"
    if not mn:
        # An empty name would produce the repo path "model_data/external/.json".
        return "Model name must not be empty"
    if 'results' not in v:
        return "Invalid JSON: missing 'results' key"
    results = v['results']

    columns = [
        'mmlu_translated_kk', 'kk_constitution_mc', 'kk_dastur_mc',
        'kazakh_and_literature_unt_mc', 'kk_geography_unt_mc',
        'kk_world_history_unt_mc', 'kk_history_of_kazakhstan_unt_mc',
        'kk_english_unt_mc', 'kk_biology_unt_mc', 'kk_human_society_rights_unt_mc'
    ]
    # Build a fresh payload instead of mutating the caller's dict in place,
    # so a validation failure leaves the uploaded object untouched.
    new_file = dict(results)
    new_file['model'] = mn
    for column in columns:
        cell = results.get(column)
        if not isinstance(cell, dict):
            return f"Missing or invalid column: {column}"
        if 'acc,none' not in cell:
            return f"Missing 'acc,none' key in column: {column}"
        # Flatten the per-task metrics dict down to the accuracy value.
        new_file[column] = cell['acc,none']

    if 'config' not in v or 'model_dtype' not in v['config']:
        return "Missing 'config' or 'model_dtype' in JSON"
    new_file['model_dtype'] = v['config']["model_dtype"]
    new_file['ppl'] = 0
    logging.info('WE READ FILE: %s', new_file)

    # Upload from memory; '/' in model names is not allowed in file paths.
    buf = BytesIO(json.dumps(new_file).encode('utf-8'))
    API.upload_file(
        path_or_fileobj=buf,
        path_in_repo="model_data/external/" + mn.replace('/', '__') + ".json",
        repo_id="kz-transformers/s-openbench-eval",
        repo_type="dataset",
    )
    # Flag the scheduled job to rebuild the leaderboard on its next tick.
    os.environ[RESET_JUDGEMENT_ENV] = "1"
    return "Success!"
def restart_space():
    """Request a restart of the Space, then refresh the local benchmark data.

    NOTE(review): ``download_openbench`` runs after the restart request —
    presumably ``API.restart_space`` only *schedules* the restart, so the
    download still completes in this process; confirm against HfApi semantics.
    """
    API.restart_space(repo_id=REPO_ID)
    download_openbench()
def update_plot(selected_models):
    """Render the comparison plot for the currently selected model names."""
    return create_plot(selected_models)
def build_demo():
    """Construct the Gradio app: leaderboard, submission, and analytics tabs.

    Returns the assembled ``gr.Blocks`` instance (not yet launched).
    """
    download_openbench()
    demo = gr.Blocks(title="Kaz LLM LB", css=custom_css)
    leaderboard_df = build_leadearboard_df()
    with demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        with gr.Tabs(elem_classes="tab-buttons"):
            # NOTE(review): several tab labels below contain mojibake
            # ("๐", "โจ") from a bad encoding round-trip — presumably emoji
            # originally; restore once the intended glyphs are confirmed.
            # Bug fix: the original label string was split across two source
            # lines (a syntax error); it is rejoined on one line here.
            with gr.TabItem("๐ LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
                Leaderboard(
                    value=leaderboard_df,
                    datatype=[c.type for c in fields(AutoEvalColumn)],
                    select_columns=SelectColumns(
                        default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
                        cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
                        label="Select Columns to Display:",
                    ),
                    search_columns=[AutoEvalColumn.model.name],
                )
            with gr.TabItem("๐ Submit ", elem_id="llm-benchmark-tab-table", id=3):
                with gr.Row():
                    gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
                with gr.Row():
                    gr.Markdown("# โจ Submit your model here!", elem_classes="markdown-text")
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    # type="binary" delivers raw bytes to handle_file_upload,
                    # avoiding ephemeral temp-file paths.
                    file_output = gr.File(
                        label="Drag and drop JSON file judgment here",
                        type="binary"
                    )
                    # Holds the parsed JSON between upload and submit clicks.
                    uploaded_file = gr.State()
                with gr.Row():
                    with gr.Column():
                        # NOTE(review): positional arg sets the textbox *value*,
                        # not its label — confirm this is intentional.
                        out = gr.Textbox("Submission Status")
                        submit_button = gr.Button("Submit File", variant='primary')
                file_output.upload(
                    fn=handle_file_upload,
                    inputs=file_output,
                    outputs=uploaded_file
                )
                submit_button.click(
                    fn=submit_file,
                    inputs=[uploaded_file, model_name_textbox],
                    outputs=[out]
                )
            with gr.TabItem("๐ Analytics", elem_id="llm-benchmark-tab-table", id=4):
                with gr.Column():
                    model_dropdown = gr.Dropdown(
                        choices=leaderboard_df["model"].tolist(),
                        label="Models",
                        value=leaderboard_df["model"].tolist(),
                        multiselect=True,
                        info="Select models"
                    )
                with gr.Column():
                    # Initial render uses the dropdown's default (all models).
                    plot = gr.Plot(update_plot(model_dropdown.value))
                model_dropdown.change(
                    fn=update_plot,
                    inputs=[model_dropdown],
                    outputs=[plot]
                )
    return demo
def aggregate_leaderboard_data():
    """Rebuild the leaderboard JSON and upload it to the metainfo dataset.

    Combines a hard-coded score table (random baseline plus closed/API models
    that cannot go through the normal submission flow) with every valid
    external submission file found under ``./m_data/model_data/external``,
    writes the merged list to ``genned.json``, and pushes it as
    ``leaderboard.json`` to ``kz-transformers/kaz-llm-lb-metainfo``.
    """
    # Pull the latest external submissions into ./m_data.
    download_dataset("kz-transformers/s-openbench-eval", "m_data")
    # Built-in entries; keys mirror the flattened schema produced by submit_file.
    data_list = [
        {
            "model_dtype": "torch.float16",
            "model": "dummy-random-baseline",
            "ppl": 0,
            "mmlu_translated_kk": 0.22991508817766165,
            "kk_constitution_mc": 0.25120772946859904,
            "kk_dastur_mc": 0.24477611940298508,
            "kazakh_and_literature_unt_mc": 0.2090443686006826,
            "kk_geography_unt_mc": 0.2019790454016298,
            "kk_world_history_unt_mc": 0.1986970684039088,
            "kk_history_of_kazakhstan_unt_mc": 0.19417177914110428,
            "kk_english_unt_mc": 0.189804278561675,
            "kk_biology_unt_mc": 0.22330729166666666,
            "kk_human_society_rights_unt_mc": 0.242152466367713,
        },
        {
            "model_dtype": "torch.float16",
            "model": "gpt-4o-mini",
            "ppl": 0,
            "mmlu_translated_kk": 0.5623775310254735,
            "kk_constitution_mc": 0.79,
            "kk_dastur_mc": 0.755,
            "kazakh_and_literature_unt_mc": 0.4953071672354949,
            "kk_geography_unt_mc": 0.5675203725261933,
            "kk_world_history_unt_mc": 0.6091205211726385,
            "kk_history_of_kazakhstan_unt_mc": 0.47883435582822087,
            "kk_english_unt_mc": 0.6763768775603095,
            "kk_biology_unt_mc": 0.607421875,
            "kk_human_society_rights_unt_mc": 0.7309417040358744,
        },
        {
            "model_dtype": "api",
            "model": "gpt-4o",
            "ppl": 0,
            "mmlu_translated_kk": 0.7419986936642717,
            "kk_constitution_mc": 0.841,
            "kk_dastur_mc": 0.798,
            "kazakh_and_literature_unt_mc": 0.6785409556313993,
            "kk_geography_unt_mc": 0.629802095459837,
            "kk_world_history_unt_mc": 0.6783387622149837,
            "kk_history_of_kazakhstan_unt_mc": 0.6785276073619632,
            "kk_english_unt_mc": 0.7410104688211198,
            "kk_biology_unt_mc": 0.6979166666666666,
            "kk_human_society_rights_unt_mc": 0.7937219730941704,
        },
        {
            "model_dtype": "torch.float16",
            "model": "nova-pro-v1",
            "ppl": 0,
            "mmlu_translated_kk": 0.6792945787067276,
            "kk_constitution_mc": 0.7753623188405797,
            "kk_dastur_mc": 0.718407960199005,
            "kazakh_and_literature_unt_mc": 0.4656569965870307,
            "kk_geography_unt_mc": 0.5541327124563445,
            "kk_world_history_unt_mc": 0.6425081433224755,
            "kk_history_of_kazakhstan_unt_mc": 0.5,
            "kk_english_unt_mc": 0.6845698680018206,
            "kk_biology_unt_mc": 0.6197916666666666,
            "kk_human_society_rights_unt_mc": 0.7713004484304933,
        },
        {
            "model_dtype": "torch.float16",
            "model": "gemini-1.5-pro",
            "ppl": 0,
            "mmlu_translated_kk": 0.7380796864794252,
            "kk_constitution_mc": 0.8164251207729468,
            "kk_dastur_mc": 0.7383084577114428,
            "kazakh_and_literature_unt_mc": 0.5565273037542662,
            "kk_geography_unt_mc": 0.6065192083818394,
            "kk_world_history_unt_mc": 0.6669381107491856,
            "kk_history_of_kazakhstan_unt_mc": 0.5791411042944785,
            "kk_english_unt_mc": 0.7114246700045517,
            "kk_biology_unt_mc": 0.6673177083333334,
            "kk_human_society_rights_unt_mc": 0.7623318385650224,
        },
        {
            "model_dtype": "torch.float16",
            "model": "gemini-1.5-flash",
            "ppl": 0,
            "mmlu_translated_kk": 0.6335728282168517,
            "kk_constitution_mc": 0.748792270531401,
            "kk_dastur_mc": 0.7054726368159204,
            "kazakh_and_literature_unt_mc": 0.4761092150170648,
            "kk_geography_unt_mc": 0.5640279394644936,
            "kk_world_history_unt_mc": 0.5838762214983714,
            "kk_history_of_kazakhstan_unt_mc": 0.43374233128834355,
            "kk_english_unt_mc": 0.6681838871187984,
            "kk_biology_unt_mc": 0.6217447916666666,
            "kk_human_society_rights_unt_mc": 0.7040358744394619,
        },
        {
            "model_dtype": "torch.float16",
            "model": "claude-3-5-sonnet",
            "ppl": 0,
            "mmlu_translated_kk": 0.7335075114304376,
            "kk_constitution_mc": 0.8623188405797102,
            "kk_dastur_mc": 0.7950248756218905,
            "kazakh_and_literature_unt_mc": 0.6548634812286689,
            "kk_geography_unt_mc": 0.6431897555296857,
            "kk_world_history_unt_mc": 0.6669381107491856,
            "kk_history_of_kazakhstan_unt_mc": 0.6251533742331289,
            "kk_english_unt_mc": 0.7291761492944925,
            "kk_biology_unt_mc": 0.6686197916666666,
            "kk_human_society_rights_unt_mc": 0.8026905829596412,
        },
        {
            "model_dtype": "torch.float16",
            "model": "yandex-gpt",
            "ppl": 0,
            "mmlu_translated_kk": 0.39777922926192033,
            "kk_constitution_mc": 0.7028985507246377,
            "kk_dastur_mc": 0.6159203980099502,
            "kazakh_and_literature_unt_mc": 0.3914249146757679,
            "kk_geography_unt_mc": 0.4912689173457509,
            "kk_world_history_unt_mc": 0.5244299674267101,
            "kk_history_of_kazakhstan_unt_mc": 0.4030674846625767,
            "kk_english_unt_mc": 0.5844333181611289,
            "kk_biology_unt_mc": 0.4368489583333333,
            "kk_human_society_rights_unt_mc": 0.6995515695067265,
        },
    ]
    # Merge every readable, minimally-valid external submission file.
    files_list = glob.glob("./m_data/model_data/external/*.json")
    logging.info(f'FILES LIST: {files_list}')
    for file in files_list:
        logging.info(f'Trying to read external submit file: {file}')
        try:
            with open(file) as f:
                data = json.load(f)
            if not isinstance(data, dict):
                logging.warning(f"File {file} is not a dict, skipping")
                continue
            # Cheap schema sanity check; incomplete files are skipped, not fatal.
            required_keys = {'model_dtype', 'model', 'ppl', 'mmlu_translated_kk'}
            if not required_keys.issubset(data.keys()):
                logging.warning(f"File {file} missing required keys, skipping")
                continue
            logging.info(f'Successfully read: {file}, got {len(data)} keys')
            data_list.append(data)
        except Exception as e:
            # Best effort: one malformed file must not break the whole rebuild.
            logging.error(f"Error reading file {file}: {e}")
            continue
    logging.info("Combined data_list length: %d", len(data_list))
    with open("genned.json", "w") as f:
        json.dump(data_list, f)
    # Publish the aggregated table; the leaderboard reads it on next build.
    API.upload_file(
        path_or_fileobj="genned.json",
        path_in_repo="leaderboard.json",
        repo_id="kz-transformers/kaz-llm-lb-metainfo",
        repo_type="dataset",
    )
def update_board():
    """Scheduled job: rebuild the leaderboard only when a reset was requested.

    ``RESET_JUDGEMENT_ENV`` is set to "1" by a successful submission and at
    startup; any other value means there is nothing new to aggregate.
    """
    need_reset = os.environ.get(RESET_JUDGEMENT_ENV)
    logging.info("Updating the judgement (scheduled update): %s", need_reset)
    if need_reset != "1":
        # Bug fix: the original had `pass` here, making the flag check dead —
        # every 10-minute tick re-aggregated and restarted the Space.
        return
    os.environ[RESET_JUDGEMENT_ENV] = "0"
    aggregate_leaderboard_data()
    restart_space()
def update_board_():
    """Startup variant of ``update_board``: aggregate once, unconditionally."""
    logging.info("Updating the judgement at startup")
    aggregate_leaderboard_data()
if __name__ == "__main__":
    # Request an initial aggregation, run it once synchronously, then keep the
    # leaderboard fresh with a background job every 10 minutes.
    os.environ[RESET_JUDGEMENT_ENV] = "1"
    # BackgroundScheduler is already imported at the top of the file; the
    # redundant local re-import was removed.
    scheduler = BackgroundScheduler()
    update_board_()
    scheduler.add_job(update_board, "interval", minutes=10)
    scheduler.start()

    demo_app = build_demo()
    demo_app.launch(debug=True, share=False, show_api=False)
|