Spaces:
Sleeping
Sleeping
Improve Evaluate tab
Browse files- yourbench_space/app.py +19 -10
- yourbench_space/utils.py +11 -0
yourbench_space/app.py
CHANGED
@@ -12,9 +12,11 @@ from datasets import load_dataset
|
|
12 |
from huggingface_hub import whoami, HfApi
|
13 |
from yourbench_space import PATH
|
14 |
from yourbench_space.utils import (
|
|
|
15 |
STAGES,
|
16 |
SubprocessManagerGroup,
|
17 |
save_files,
|
|
|
18 |
update_dataset,
|
19 |
map_stage_names,
|
20 |
is_running_locally,
|
@@ -234,6 +236,12 @@ def init_session(profile: gr.OAuthProfile | None):
|
|
234 |
logger.info(f"Started session for {local_uuid}")
|
235 |
return gr.State(local_uuid, delete_callback=lambda uid: MANAGERS.remove(uid))
|
236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
237 |
|
238 |
with gr.Blocks(theme=gr.themes.Default()) as app:
|
239 |
session_state = gr.State()
|
@@ -349,14 +357,18 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
|
|
349 |
|
350 |
with gr.Accordion("Lighteval Preview"):
|
351 |
lighteval_df = gr.DataFrame()
|
352 |
-
|
353 |
stages_table.change(
|
354 |
update_dataset,
|
355 |
inputs=[stages_table, hf_org_dropdown, hf_dataset_name],
|
356 |
outputs=[ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df],
|
357 |
)
|
358 |
|
359 |
-
|
|
|
|
|
|
|
|
|
|
|
360 |
# TODO: this timer should only become active the first time the second tab is activated
|
361 |
log_timer = gr.Timer(1.0, active=True)
|
362 |
log_timer.tick(
|
@@ -365,20 +377,16 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
|
|
365 |
outputs=[log_output, stages_table],
|
366 |
)
|
367 |
|
368 |
-
# with gr.Tab("Evaluate", id=2):
|
369 |
-
# with gr.Row():
|
370 |
-
# btn_launch_evals = gr.Button("Launch evaluations")
|
371 |
-
# status = gr.Textbox(label="Status")
|
372 |
-
# btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name, gr.State("lighteval")], status)
|
373 |
-
|
374 |
with gr.Tab("Evaluate", id=2):
|
375 |
with gr.Column():
|
376 |
gr.Markdown("### 🧪 Run YourBench Evaluation")
|
377 |
gr.Markdown("Run the full evaluation pipeline on the uploaded dataset. This includes computing metrics, creating the leaderboard, and pushing results.")
|
378 |
|
379 |
with gr.Row():
|
380 |
-
|
381 |
-
|
|
|
|
|
382 |
|
383 |
with gr.Accordion("Evaluation Log", open=True):
|
384 |
eval_status = gr.Textbox(label="", lines=6, interactive=False, show_label=False)
|
@@ -390,6 +398,7 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
|
|
390 |
)
|
391 |
clear_status_btn.click(lambda: "", outputs=eval_status)
|
392 |
|
|
|
393 |
app.load(init_session, outputs=session_state)
|
394 |
|
395 |
app.launch(allowed_paths=[PATH])
|
|
|
12 |
from huggingface_hub import whoami, HfApi
|
13 |
from yourbench_space import PATH
|
14 |
from yourbench_space.utils import (
|
15 |
+
STAGE_DISPLAY_MAP,
|
16 |
STAGES,
|
17 |
SubprocessManagerGroup,
|
18 |
save_files,
|
19 |
+
on_generation_succsess,
|
20 |
update_dataset,
|
21 |
map_stage_names,
|
22 |
is_running_locally,
|
|
|
236 |
logger.info(f"Started session for {local_uuid}")
|
237 |
return gr.State(local_uuid, delete_callback=lambda uid: MANAGERS.remove(uid))
|
238 |
|
239 |
+
# Built at module level, ahead of the `gr.Blocks` layout, so that event
# listeners declared earlier in the page can name it as an output; it is
# placed into the "Evaluate" tab later via `btn_launch_evals.render()`.
btn_launch_evals = gr.Button(
    "🚀 Launch Evaluation",
    variant="primary",
    interactive=False,  # disabled at start; `on_generation_succsess` re-enables it
    visible=True,
)
|
245 |
|
246 |
with gr.Blocks(theme=gr.themes.Default()) as app:
|
247 |
session_state = gr.State()
|
|
|
357 |
|
358 |
with gr.Accordion("Lighteval Preview"):
|
359 |
lighteval_df = gr.DataFrame()
|
|
|
360 |
stages_table.change(
|
361 |
update_dataset,
|
362 |
inputs=[stages_table, hf_org_dropdown, hf_dataset_name],
|
363 |
outputs=[ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df],
|
364 |
)
|
365 |
|
366 |
+
stages_table.change(
|
367 |
+
on_generation_succsess,
|
368 |
+
inputs=stages_table,
|
369 |
+
outputs=[tabs,btn_launch_evals],
|
370 |
+
)
|
371 |
+
|
372 |
# TODO: this timer should only become active the first time the second tab is activated
|
373 |
log_timer = gr.Timer(1.0, active=True)
|
374 |
log_timer.tick(
|
|
|
377 |
outputs=[log_output, stages_table],
|
378 |
)
|
379 |
|
|
|
|
|
|
|
|
|
|
|
|
|
380 |
with gr.Tab("Evaluate", id=2):
|
381 |
with gr.Column():
|
382 |
gr.Markdown("### 🧪 Run YourBench Evaluation")
|
383 |
gr.Markdown("Run the full evaluation pipeline on the uploaded dataset. This includes computing metrics, creating the leaderboard, and pushing results.")
|
384 |
|
385 |
with gr.Row():
|
386 |
+
with gr.Column():
|
387 |
+
btn_launch_evals.render()
|
388 |
+
with gr.Column():
|
389 |
+
clear_status_btn = gr.Button("Clear", variant="secondary")
|
390 |
|
391 |
with gr.Accordion("Evaluation Log", open=True):
|
392 |
eval_status = gr.Textbox(label="", lines=6, interactive=False, show_label=False)
|
|
|
398 |
)
|
399 |
clear_status_btn.click(lambda: "", outputs=eval_status)
|
400 |
|
401 |
+
|
402 |
app.load(init_session, outputs=session_state)
|
403 |
|
404 |
app.launch(allowed_paths=[PATH])
|
yourbench_space/utils.py
CHANGED
@@ -129,6 +129,17 @@ def update_dataset(stages: list, hf_org: str, hf_prefix: str, oauth_token: gr.OA
|
|
129 |
|
130 |
return (ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df)
|
131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
|
133 |
class SubprocessManagerGroup:
|
134 |
"""Instantiates one manager per user (should be used as a singleton class)"""
|
|
|
129 |
|
130 |
return (ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df)
|
131 |
|
132 |
+
def should_enable_eval_tab(stages):
    """Return True when the lighteval stage appears in ``stages``.

    Args:
        stages: Collection of stage display names, or ``None``/empty when
            nothing has been reported yet.

    Returns:
        bool: ``True`` if ``STAGE_DISPLAY_MAP["lighteval"]`` is a member of
        ``stages``; ``False`` otherwise, including when ``stages`` is
        ``None`` or empty.
    """
    # Look the display name up once and reuse it for the log and the check.
    lighteval_stage = STAGE_DISPLAY_MAP["lighteval"]
    logger.info(f"Stages received: {stages}")
    logger.info(f"Lighteval stage name: {lighteval_stage}")
    # `x in None` raises TypeError; normalise a missing value to an empty
    # list, the same way `on_generation_succsess` does.
    return lighteval_stage in (stages or [])
|
136 |
+
|
137 |
+
def on_generation_succsess(stages):
    """Build the UI updates to apply when the stages list changes.

    Args:
        stages: Collection of stage display names; ``None`` or any other
            falsy value is treated as an empty list.

    Returns:
        tuple: Two ``gr.update`` objects. When the lighteval stage is
        present, the first selects the item with id ``2`` and the second
        makes its target interactive and visible (a success toast is also
        raised). Otherwise the first is a no-op update and the second keeps
        its target visible but non-interactive.
    """
    completed = stages if stages else []

    # Guard clause: lighteval stage not reported yet, keep the target disabled.
    if STAGE_DISPLAY_MAP["lighteval"] not in completed:
        return gr.update(), gr.update(interactive=False, visible=True)

    gr.Success("🌟 Your Dataset is ready for evaluation!")
    return gr.update(selected=2), gr.update(interactive=True, visible=True)
|
143 |
|
144 |
class SubprocessManagerGroup:
|
145 |
"""Instantiates one manager per user (should be used as a singleton class)"""
|