alozowski HF Staff committed on
Commit
78afa9e
·
1 Parent(s): 6883f10

Improve Evaluate tab

Browse files
Files changed (2) hide show
  1. yourbench_space/app.py +19 -10
  2. yourbench_space/utils.py +11 -0
yourbench_space/app.py CHANGED
@@ -12,9 +12,11 @@ from datasets import load_dataset
12
  from huggingface_hub import whoami, HfApi
13
  from yourbench_space import PATH
14
  from yourbench_space.utils import (
 
15
  STAGES,
16
  SubprocessManagerGroup,
17
  save_files,
 
18
  update_dataset,
19
  map_stage_names,
20
  is_running_locally,
@@ -234,6 +236,12 @@ def init_session(profile: gr.OAuthProfile | None):
234
  logger.info(f"Started session for {local_uuid}")
235
  return gr.State(local_uuid, delete_callback=lambda uid: MANAGERS.remove(uid))
236
 
 
 
 
 
 
 
237
 
238
  with gr.Blocks(theme=gr.themes.Default()) as app:
239
  session_state = gr.State()
@@ -349,14 +357,18 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
349
 
350
  with gr.Accordion("Lighteval Preview"):
351
  lighteval_df = gr.DataFrame()
352
-
353
  stages_table.change(
354
  update_dataset,
355
  inputs=[stages_table, hf_org_dropdown, hf_dataset_name],
356
  outputs=[ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df],
357
  )
358
 
359
-
 
 
 
 
 
360
  # TODO: this timer should only be active when the second tab is passed to active for the first time
361
  log_timer = gr.Timer(1.0, active=True)
362
  log_timer.tick(
@@ -365,20 +377,16 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
365
  outputs=[log_output, stages_table],
366
  )
367
 
368
- # with gr.Tab("Evaluate", id=2):
369
- # with gr.Row():
370
- # btn_launch_evals = gr.Button("Launch evaluations")
371
- # status = gr.Textbox(label="Status")
372
- # btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name, gr.State("lighteval")], status)
373
-
374
  with gr.Tab("Evaluate", id=2):
375
  with gr.Column():
376
  gr.Markdown("### 🧪 Run YourBench Evaluation")
377
  gr.Markdown("Run the full evaluation pipeline on the uploaded dataset. This includes computing metrics, creating the leaderboard, and pushing results.")
378
 
379
  with gr.Row():
380
- btn_launch_evals = gr.Button("🚀 Launch Evaluation", variant="primary")
381
- clear_status_btn = gr.Button("Clear", variant="secondary")
 
 
382
 
383
  with gr.Accordion("Evaluation Log", open=True):
384
  eval_status = gr.Textbox(label="", lines=6, interactive=False, show_label=False)
@@ -390,6 +398,7 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
390
  )
391
  clear_status_btn.click(lambda: "", outputs=eval_status)
392
 
 
393
  app.load(init_session, outputs=session_state)
394
 
395
  app.launch(allowed_paths=[PATH])
 
12
  from huggingface_hub import whoami, HfApi
13
  from yourbench_space import PATH
14
  from yourbench_space.utils import (
15
+ STAGE_DISPLAY_MAP,
16
  STAGES,
17
  SubprocessManagerGroup,
18
  save_files,
19
+ on_generation_succsess,
20
  update_dataset,
21
  map_stage_names,
22
  is_running_locally,
 
236
  logger.info(f"Started session for {local_uuid}")
237
  return gr.State(local_uuid, delete_callback=lambda uid: MANAGERS.remove(uid))
238
 
239
+ btn_launch_evals = gr.Button(
240
+ "🚀 Launch Evaluation",
241
+ visible=True,
242
+ interactive=False, # Start non-interactive
243
+ variant="primary"
244
+ )
245
 
246
  with gr.Blocks(theme=gr.themes.Default()) as app:
247
  session_state = gr.State()
 
357
 
358
  with gr.Accordion("Lighteval Preview"):
359
  lighteval_df = gr.DataFrame()
 
360
  stages_table.change(
361
  update_dataset,
362
  inputs=[stages_table, hf_org_dropdown, hf_dataset_name],
363
  outputs=[ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df],
364
  )
365
 
366
+ stages_table.change(
367
+ on_generation_succsess,
368
+ inputs=stages_table,
369
+ outputs=[tabs,btn_launch_evals],
370
+ )
371
+
372
  # TODO: this timer should only be active when the second tab is passed to active for the first time
373
  log_timer = gr.Timer(1.0, active=True)
374
  log_timer.tick(
 
377
  outputs=[log_output, stages_table],
378
  )
379
 
 
 
 
 
 
 
380
  with gr.Tab("Evaluate", id=2):
381
  with gr.Column():
382
  gr.Markdown("### 🧪 Run YourBench Evaluation")
383
  gr.Markdown("Run the full evaluation pipeline on the uploaded dataset. This includes computing metrics, creating the leaderboard, and pushing results.")
384
 
385
  with gr.Row():
386
+ with gr.Column():
387
+ btn_launch_evals.render()
388
+ with gr.Column():
389
+ clear_status_btn = gr.Button("Clear", variant="secondary")
390
 
391
  with gr.Accordion("Evaluation Log", open=True):
392
  eval_status = gr.Textbox(label="", lines=6, interactive=False, show_label=False)
 
398
  )
399
  clear_status_btn.click(lambda: "", outputs=eval_status)
400
 
401
+
402
  app.load(init_session, outputs=session_state)
403
 
404
  app.launch(allowed_paths=[PATH])
yourbench_space/utils.py CHANGED
@@ -129,6 +129,17 @@ def update_dataset(stages: list, hf_org: str, hf_prefix: str, oauth_token: gr.OA
129
 
130
  return (ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df)
131
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  class SubprocessManagerGroup:
134
  """Instantiates one manager per user (should be used as a singleton class)"""
 
129
 
130
  return (ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df)
131
 
132
def should_enable_eval_tab(stages):
    """Return whether the Lighteval stage is present in ``stages``.

    Args:
        stages: Collection of stage display names to search; ``None`` (or any
            other falsy value) is treated as "no stages".
            NOTE(review): presumably the value of the stages component in the
            UI -- confirm against the caller.

    Returns:
        bool: ``True`` when ``STAGE_DISPLAY_MAP["lighteval"]`` is a member of
        ``stages``, ``False`` otherwise.

    Raises:
        KeyError: If ``STAGE_DISPLAY_MAP`` has no ``"lighteval"`` entry.
    """
    # Lazy %-style arguments: the message is only formatted if the record is
    # actually emitted.
    logger.info("Stages received: %s", stages)
    lighteval_name = STAGE_DISPLAY_MAP["lighteval"]
    logger.info("Lighteval stage name: %s", lighteval_name)
    # Normalise a missing value the same way on_generation_succsess does, so
    # the membership test cannot raise TypeError on None.
    return lighteval_name in (stages or [])
136
+
137
def on_generation_succsess(stages):
    """Build the UI updates to apply once dataset generation has finished.

    Args:
        stages: Collection of stage display names; a falsy value is treated
            as an empty list.

    Returns:
        tuple: Two ``gr.update`` payloads. When the Lighteval stage name
        (``STAGE_DISPLAY_MAP["lighteval"]``) is in ``stages``, the first
        selects the item with id 2 and the second makes the target component
        interactive and visible, after a success notification is raised.
        Otherwise the first is a no-op update and the second keeps the
        target visible but non-interactive.
    """
    completed = stages if stages else []

    # Guard clause: generation is not done yet, so leave the first output
    # untouched and keep the second one disabled.
    if STAGE_DISPLAY_MAP["lighteval"] not in completed:
        return gr.update(), gr.update(interactive=False, visible=True)

    gr.Success("🌟 Your Dataset is ready for evaluation!")
    tab_change = gr.update(selected=2)
    button_change = gr.update(interactive=True, visible=True)
    return tab_change, button_change
143
 
144
  class SubprocessManagerGroup:
145
  """Instantiates one manager per user (should be used as a singleton class)"""