Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update Evaluation tab
Browse files- yourbench_space/app.py +66 -20
- yourbench_space/evaluation.py +3 -1
yourbench_space/app.py
CHANGED
@@ -9,7 +9,7 @@ from loguru import logger
|
|
9 |
|
10 |
import gradio as gr
|
11 |
from datasets import load_dataset
|
12 |
-
from huggingface_hub import whoami
|
13 |
from yourbench_space import PATH
|
14 |
from yourbench_space.utils import (
|
15 |
STAGES,
|
@@ -136,23 +136,26 @@ def enable_button(files):
|
|
136 |
return gr.update(interactive=bool(files))
|
137 |
|
138 |
|
139 |
-
def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name):
|
140 |
-
# Test dataset existence
|
141 |
eval_ds_name = f"{org_name}/{eval_name}"
|
142 |
-
|
|
|
|
|
143 |
try:
|
144 |
-
load_dataset(eval_ds_name, streaming=True, token=oauth_token.token)
|
145 |
except Exception as e:
|
146 |
-
|
147 |
-
return
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
|
|
|
|
153 |
|
154 |
-
repo_id = f"{org_name}/leaderboard_yourbench_{eval_ds_name.replace('/', '_')}"
|
155 |
api = HfApi()
|
|
|
156 |
|
157 |
try:
|
158 |
api.create_repo(
|
@@ -161,10 +164,30 @@ def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_na
|
|
161 |
space_sdk="gradio",
|
162 |
token=oauth_token.token,
|
163 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
api.upload_folder(
|
165 |
repo_id=repo_id,
|
166 |
repo_type="space",
|
167 |
-
folder_path=
|
168 |
token=oauth_token.token,
|
169 |
)
|
170 |
api.add_space_secret(
|
@@ -176,8 +199,12 @@ def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_na
|
|
176 |
api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name, token=oauth_token.token)
|
177 |
api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name, token=oauth_token.token)
|
178 |
except Exception as e:
|
179 |
-
|
180 |
-
|
|
|
|
|
|
|
|
|
181 |
|
182 |
|
183 |
def init_session(profile: gr.OAuthProfile | None):
|
@@ -338,11 +365,30 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
|
|
338 |
outputs=[log_output, stages_table],
|
339 |
)
|
340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
with gr.Tab("Evaluate", id=2):
|
342 |
-
with gr.
|
343 |
-
|
344 |
-
|
345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
346 |
|
347 |
app.load(init_session, outputs=session_state)
|
348 |
|
|
|
9 |
|
10 |
import gradio as gr
|
11 |
from datasets import load_dataset
|
12 |
+
from huggingface_hub import whoami, HfApi
|
13 |
from yourbench_space import PATH
|
14 |
from yourbench_space.utils import (
|
15 |
STAGES,
|
|
|
136 |
return gr.update(interactive=bool(files))
|
137 |
|
138 |
|
139 |
+
def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name, config_name="lighteval"):
|
|
|
140 |
eval_ds_name = f"{org_name}/{eval_name}"
|
141 |
+
repo_id = f"{org_name}/leaderboard_yourbench_{eval_ds_name.replace('/', '_')}"
|
142 |
+
folder_path = str(Path(PATH) / "yourbench_space" / "leaderboard_space")
|
143 |
+
|
144 |
try:
|
145 |
+
load_dataset(eval_ds_name, name=config_name, streaming=True, token=oauth_token.token)
|
146 |
except Exception as e:
|
147 |
+
logger.error(f"Failed to load dataset '{eval_ds_name}': {e}")
|
148 |
+
return "❌ Failed: Dataset loading error"
|
149 |
+
|
150 |
+
try:
|
151 |
+
create_eval_file(eval_ds_name)
|
152 |
+
status = asyncio.run(run_evaluations(eval_ds_name=eval_ds_name, org=org_name))
|
153 |
+
except Exception as e:
|
154 |
+
logger.error(f"Evaluation error: {e}")
|
155 |
+
return f"❌ Failed: Evaluation error\n{e}"
|
156 |
|
|
|
157 |
api = HfApi()
|
158 |
+
space_was_regenerated = False
|
159 |
|
160 |
try:
|
161 |
api.create_repo(
|
|
|
164 |
space_sdk="gradio",
|
165 |
token=oauth_token.token,
|
166 |
)
|
167 |
+
except Exception as e:
|
168 |
+
if "409" in str(e) and "already created this space repo" in str(e):
|
169 |
+
logger.info(f"Space '{repo_id}' already exists. Deleting and regenerating it.")
|
170 |
+
try:
|
171 |
+
api.delete_repo(repo_id=repo_id, repo_type="space", token=oauth_token.token)
|
172 |
+
api.create_repo(
|
173 |
+
repo_id=repo_id,
|
174 |
+
repo_type="space",
|
175 |
+
space_sdk="gradio",
|
176 |
+
token=oauth_token.token,
|
177 |
+
)
|
178 |
+
space_was_regenerated = True
|
179 |
+
except Exception as delete_err:
|
180 |
+
logger.error(f"Failed to delete and recreate space '{repo_id}': {delete_err}")
|
181 |
+
return f"✅ Evaluation succeeded\n❌ Failed: Could not recreate space\n{delete_err}"
|
182 |
+
else:
|
183 |
+
logger.error(f"Space creation error: {e}")
|
184 |
+
return f"✅ Evaluation succeeded\n❌ Failed: Space creation error\n{e}"
|
185 |
+
|
186 |
+
try:
|
187 |
api.upload_folder(
|
188 |
repo_id=repo_id,
|
189 |
repo_type="space",
|
190 |
+
folder_path=folder_path,
|
191 |
token=oauth_token.token,
|
192 |
)
|
193 |
api.add_space_secret(
|
|
|
199 |
api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name, token=oauth_token.token)
|
200 |
api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name, token=oauth_token.token)
|
201 |
except Exception as e:
|
202 |
+
logger.error(f"Failed during space setup: {e}")
|
203 |
+
return f"✅ Evaluation succeeded\n❌ Failed: Space setup error\n{e}"
|
204 |
+
|
205 |
+
if space_was_regenerated:
|
206 |
+
return f"✅ Evaluation succeeded\n🔄 Space '{repo_id}' was regenerated successfully"
|
207 |
+
return f"✅ Evaluation and Space creation completed successfully for: {repo_id}"
|
208 |
|
209 |
|
210 |
def init_session(profile: gr.OAuthProfile | None):
|
|
|
365 |
outputs=[log_output, stages_table],
|
366 |
)
|
367 |
|
368 |
+
# with gr.Tab("Evaluate", id=2):
|
369 |
+
# with gr.Row():
|
370 |
+
# btn_launch_evals = gr.Button("Launch evaluations")
|
371 |
+
# status = gr.Textbox(label="Status")
|
372 |
+
# btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name, gr.State("lighteval")], status)
|
373 |
+
|
374 |
with gr.Tab("Evaluate", id=2):
|
375 |
+
with gr.Column():
|
376 |
+
gr.Markdown("### 🧪 Run YourBench Evaluation")
|
377 |
+
gr.Markdown("Run the full evaluation pipeline on the uploaded dataset. This includes computing metrics, creating the leaderboard, and pushing results.")
|
378 |
+
|
379 |
+
with gr.Row():
|
380 |
+
btn_launch_evals = gr.Button("🚀 Launch Evaluation", variant="primary")
|
381 |
+
clear_status_btn = gr.Button("Clear", variant="secondary")
|
382 |
+
|
383 |
+
with gr.Accordion("Evaluation Log", open=True):
|
384 |
+
eval_status = gr.Textbox(label="", lines=6, interactive=False, show_label=False)
|
385 |
+
|
386 |
+
btn_launch_evals.click(
|
387 |
+
run_evaluation_pipeline,
|
388 |
+
[hf_org_dropdown, hf_dataset_name, gr.State("lighteval")],
|
389 |
+
eval_status,
|
390 |
+
)
|
391 |
+
clear_status_btn.click(lambda: "", outputs=eval_status)
|
392 |
|
393 |
app.load(init_session, outputs=session_state)
|
394 |
|
yourbench_space/evaluation.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import os
|
2 |
import subprocess
|
3 |
import asyncio
|
|
|
4 |
|
5 |
from yourbench_space.leaderboard_space.env import INIT_MODELS
|
6 |
|
@@ -11,7 +12,8 @@ OUTPUT_DIR = "/data" if ON_SPACES else "."
|
|
11 |
|
12 |
def create_eval_file(eval_ds_name: str):
|
13 |
task_name = eval_ds_name.replace("/", "_")
|
14 |
-
|
|
|
15 |
|
16 |
async def run_process(args: list) -> dict:
|
17 |
process = await asyncio.create_subprocess_exec(
|
|
|
1 |
import os
|
2 |
import subprocess
|
3 |
import asyncio
|
4 |
+
from pathlib import Path
|
5 |
|
6 |
from yourbench_space.leaderboard_space.env import INIT_MODELS
|
7 |
|
|
|
12 |
|
13 |
def create_eval_file(eval_ds_name: str):
|
14 |
task_name = eval_ds_name.replace("/", "_")
|
15 |
+
template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py")
|
16 |
+
subprocess.run(["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name])
|
17 |
|
18 |
async def run_process(args: list) -> dict:
|
19 |
process = await asyncio.create_subprocess_exec(
|