Alina Lozovskaya commited on
Commit
ea047ad
·
1 Parent(s): 2617bee

Update Setup and Run Generatation tabs

Browse files
yourbench_space/app.py CHANGED
@@ -1,32 +1,33 @@
1
- import asyncio
2
  import os
3
  import sys
4
  import time
5
- import gradio as gr
6
  import uuid
 
 
7
 
8
- from datasets import load_dataset
9
- from huggingface_hub import whoami
10
  from loguru import logger
11
- from pathlib import Path
12
 
13
- from yourbench_space.config import generate_and_save_config
 
 
14
  from yourbench_space.utils import (
 
15
  SubprocessManagerGroup,
16
  save_files,
17
  update_dataset,
18
- STAGES,
19
- is_running_locally
20
  )
21
- from yourbench_space.evaluation import create_eval_file, run_evaluations
22
- from yourbench_space.leaderboard_space.env import HF_TOKEN
 
23
 
24
  project_description = """
25
- # YourBench 🚀
26
  **Dynamic Benchmark Generation for Language Models**
27
 
28
  Quickly create zero-shot benchmarks from your documents – keeping models accurate and adaptable
29
- - 📖 [FAQ](#)
30
  - 💻 [GitHub](https://github.com/huggingface/yourbench/tree/v0.2-alpha-space)
31
  """
32
 
@@ -35,7 +36,7 @@ logger.add(sys.stderr, level="INFO")
35
 
36
  # Global to store all managers per session
37
  MANAGERS = SubprocessManagerGroup()
38
- USER_ID_SESSION_MAP: dict[str, str] = dict()
39
 
40
 
41
  docs_path = Path(__file__).parent / "docs.md"
@@ -45,30 +46,36 @@ citation_content = (
45
  else "# Citation\n\nDocumentation file not found."
46
  )
47
 
 
48
  def generate_and_return(hf_org, hf_dataset_name, session_state: gr.State):
49
  manager = MANAGERS.get(session_state)
50
- if manager is None: # should not be possible
51
  return (
52
- "❌ Config generation failed.",
53
- gr.update(visible=False, interactive=False),
54
- )
55
-
56
  session_uid = session_state.value
57
  config_path = generate_and_save_config(hf_org, hf_dataset_name, session_uid, manager.config_path)
58
  for _ in range(5):
59
  time.sleep(0.5)
60
  if config_path.exists():
 
61
  return (
62
  "✅ Config saved!",
63
  gr.update(value=str(config_path), visible=True, interactive=True),
64
  )
 
 
65
  return (
66
  "❌ Config generation failed.",
67
  gr.update(visible=False, interactive=False),
68
  )
69
 
 
70
  final_dataset = None
71
 
 
72
  def update_process_status(session_state: gr.State):
73
  """Update process status and include exit details if process has terminated"""
74
  if session_state is None:
@@ -79,17 +86,22 @@ def update_process_status(session_state: gr.State):
79
  return gr.update(value=False, label="Not running")
80
 
81
  is_running = manager.is_running()
82
-
83
  if not is_running:
84
  exit_code, exit_reason = manager.get_exit_details()
85
- status_text = f"Process Status: Stopped - {exit_reason}, exit code - {exit_code}" if exit_reason else "Process Status: Stopped"
 
 
 
 
86
  return gr.update(value=False, label=status_text)
87
-
88
  return gr.update(value=True, label="Process Status: Running")
89
 
 
90
  def prepare_task(session_uid: str, oauth_token: gr.OAuthToken | None, hf_dataset_name: str, _=None):
91
  if oauth_token is None and not is_running_locally():
92
- gr.Warning('You need to log in to use this Space')
93
  return
94
  new_env = os.environ.copy()
95
 
@@ -122,6 +134,7 @@ def switch_to_run_generation_tab():
122
  def enable_button(files):
123
  return gr.update(interactive=bool(files))
124
 
 
125
  def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name):
126
  # Test dataset existence
127
  eval_ds_name = f"{org_name}/{eval_name}"
@@ -136,13 +149,29 @@ def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_na
136
  status = asyncio.run(run_evaluations(eval_ds_name=eval_ds_name, org=org_name))
137
  # Create space
138
  from huggingface_hub import HfApi
 
139
  repo_id = f"{org_name}/leaderboard_yourbench_{eval_ds_name.replace('/', '_')}"
140
  api = HfApi()
141
 
142
  try:
143
- api.create_repo(repo_id=repo_id, repo_type="space", space_sdk="gradio", token=oauth_token.token)
144
- api.upload_folder(repo_id=repo_id, repo_type="space", folder_path="src/", token=oauth_token.token)
145
- api.add_space_secret(repo_id=repo_id, key="HF_TOKEN", value=oauth_token.token, token=oauth_token.token)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name, token=oauth_token.token)
147
  api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name, token=oauth_token.token)
148
  except Exception as e:
@@ -179,8 +208,6 @@ def init_session(profile: gr.OAuthProfile | None):
179
 
180
 
181
  with gr.Blocks(theme=gr.themes.Default()) as app:
182
- # We initialize the session state with the user randomly generated uuid
183
- # Using uuid4 makes collision cases extremely unlikely even for concurrent users
184
  session_state = gr.State()
185
 
186
  gr.Markdown(project_description)
@@ -190,12 +217,8 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
190
  with gr.Row():
191
  with gr.Accordion("Hugging Face Settings"):
192
  login_btn = gr.LoginButton()
193
- hf_org_dropdown = gr.Dropdown(
194
- choices=[], label="Organization", allow_custom_value=True
195
- )
196
- app.load(
197
- update_hf_org_dropdown, inputs=None, outputs=hf_org_dropdown
198
- )
199
 
200
  hf_dataset_name = gr.Textbox(
201
  label="Dataset name",
@@ -213,17 +236,36 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
213
  file_input.upload(
214
  save_files,
215
  inputs=[session_state, file_input],
216
- outputs = output,
217
  )
 
218
 
219
  preview_button = gr.Button("Generate New Config", interactive=False)
220
  log_message = gr.Textbox(label="Log Message", visible=True)
221
- download_button = gr.File(
222
- label="Download Config", visible=False, interactive=False
 
 
 
 
223
  )
224
 
225
  file_input.change(enable_button, inputs=file_input, outputs=preview_button)
226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  preview_button.click(
228
  generate_and_return,
229
  inputs=[hf_org_dropdown, hf_dataset_name, session_state],
@@ -234,66 +276,72 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
234
  inputs=None,
235
  outputs=tabs,
236
  )
237
-
238
  with gr.Tab("Run Generation", id=1):
239
- with gr.Row():
240
- start_button = gr.Button("Start Task")
241
- start_button.click(prepare_task, inputs=[session_state, login_btn, hf_dataset_name])
 
 
242
 
243
- stop_button = gr.Button("Stop Task")
244
  stop_button.click(MANAGERS.stop_process, inputs=session_state)
245
-
246
- kill_button = gr.Button("Kill Task")
247
  kill_button.click(MANAGERS.kill_process, inputs=session_state)
248
 
 
 
 
249
 
250
- with gr.Row():
251
- with gr.Column():
252
- with gr.Accordion("Log Output", open=True):
253
- log_output = gr.Code(language=None, lines=20, interactive=False)
254
-
255
- process_status = gr.Checkbox(label="Process Status", interactive=False)
256
- status_timer = gr.Timer(2.0, active=True)
257
- status_timer.tick(update_process_status, inputs=session_state, outputs=process_status)
258
-
259
- with gr.Column():
260
  with gr.Accordion("Stages", open=True):
261
  stages_table = gr.CheckboxGroup(
262
- choices=STAGES,
263
  value=[],
264
  label="Pipeline Stages Completed",
 
265
  interactive=False,
266
  )
267
 
268
- with gr.Accordion("Ingestion"):
269
- ingestion_df = gr.DataFrame()
270
-
271
- with gr.Accordion("Summarization"):
272
- summarization_df = gr.DataFrame()
273
-
274
- with gr.Accordion("Single-Hop"):
275
- single_hop = gr.DataFrame()
276
-
277
- with gr.Accordion("Answer Generation"):
278
- answers_df = gr.DataFrame()
279
-
280
- stages_table.change(
281
- update_dataset, inputs=[stages_table, hf_org_dropdown, hf_dataset_name], outputs=[ingestion_df, summarization_df, single_hop, answers_df]
282
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
 
284
- # TODO: this timer should only be active when the second tab is passed to active for the first time
285
- log_timer = gr.Timer(1.0, active=True)
286
- log_timer.tick(
287
- MANAGERS.read_and_get_output, inputs=session_state, outputs=[log_output, stages_table]
288
- )
289
  with gr.Tab("Evaluate", id=2, visible=False):
290
  with gr.Row():
291
  btn_launch_evals = gr.Button("Launch evaluations")
292
  status = gr.Textbox(label="Status")
293
-
294
  btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name], status)
295
 
296
  app.load(init_session, outputs=session_state)
297
 
298
-
299
- app.launch(allowed_paths=["/app"])
 
 
1
  import os
2
  import sys
3
  import time
 
4
  import uuid
5
+ import asyncio
6
+ from pathlib import Path
7
 
 
 
8
  from loguru import logger
 
9
 
10
+ import gradio as gr
11
+ from datasets import load_dataset
12
+ from huggingface_hub import whoami
13
  from yourbench_space.utils import (
14
+ STAGES,
15
  SubprocessManagerGroup,
16
  save_files,
17
  update_dataset,
18
+ map_stage_names,
19
+ is_running_locally,
20
  )
21
+ from yourbench_space.config import generate_and_save_config
22
+ from yourbench_space.evaluation import run_evaluations, create_eval_file
23
+
24
 
25
  project_description = """
26
+ # YourBench 🚀
27
  **Dynamic Benchmark Generation for Language Models**
28
 
29
  Quickly create zero-shot benchmarks from your documents – keeping models accurate and adaptable
30
+ - 📖 [FAQ](#)
31
  - 💻 [GitHub](https://github.com/huggingface/yourbench/tree/v0.2-alpha-space)
32
  """
33
 
 
36
 
37
  # Global to store all managers per session
38
  MANAGERS = SubprocessManagerGroup()
39
+ USER_ID_SESSION_MAP: dict[str, str] = {}
40
 
41
 
42
  docs_path = Path(__file__).parent / "docs.md"
 
46
  else "# Citation\n\nDocumentation file not found."
47
  )
48
 
49
+
50
  def generate_and_return(hf_org, hf_dataset_name, session_state: gr.State):
51
  manager = MANAGERS.get(session_state)
52
+ if manager is None: # should not be possible
53
  return (
54
+ "❌ Config generation failed.",
55
+ gr.update(visible=False, interactive=False),
56
+ )
57
+
58
  session_uid = session_state.value
59
  config_path = generate_and_save_config(hf_org, hf_dataset_name, session_uid, manager.config_path)
60
  for _ in range(5):
61
  time.sleep(0.5)
62
  if config_path.exists():
63
+ gr.Success("Config generated")
64
  return (
65
  "✅ Config saved!",
66
  gr.update(value=str(config_path), visible=True, interactive=True),
67
  )
68
+
69
+ gr.Error("Failed to generate config")
70
  return (
71
  "❌ Config generation failed.",
72
  gr.update(visible=False, interactive=False),
73
  )
74
 
75
+
76
  final_dataset = None
77
 
78
+
79
  def update_process_status(session_state: gr.State):
80
  """Update process status and include exit details if process has terminated"""
81
  if session_state is None:
 
86
  return gr.update(value=False, label="Not running")
87
 
88
  is_running = manager.is_running()
89
+
90
  if not is_running:
91
  exit_code, exit_reason = manager.get_exit_details()
92
+ status_text = (
93
+ f"Process Status: Stopped - {exit_reason}, exit code - {exit_code}"
94
+ if exit_reason
95
+ else "Process Status: Stopped"
96
+ )
97
  return gr.update(value=False, label=status_text)
98
+
99
  return gr.update(value=True, label="Process Status: Running")
100
 
101
+
102
  def prepare_task(session_uid: str, oauth_token: gr.OAuthToken | None, hf_dataset_name: str, _=None):
103
  if oauth_token is None and not is_running_locally():
104
+ gr.Warning("You need to log in to use this Space")
105
  return
106
  new_env = os.environ.copy()
107
 
 
134
  def enable_button(files):
135
  return gr.update(interactive=bool(files))
136
 
137
+
138
  def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name):
139
  # Test dataset existence
140
  eval_ds_name = f"{org_name}/{eval_name}"
 
149
  status = asyncio.run(run_evaluations(eval_ds_name=eval_ds_name, org=org_name))
150
  # Create space
151
  from huggingface_hub import HfApi
152
+
153
  repo_id = f"{org_name}/leaderboard_yourbench_{eval_ds_name.replace('/', '_')}"
154
  api = HfApi()
155
 
156
  try:
157
+ api.create_repo(
158
+ repo_id=repo_id,
159
+ repo_type="space",
160
+ space_sdk="gradio",
161
+ token=oauth_token.token,
162
+ )
163
+ api.upload_folder(
164
+ repo_id=repo_id,
165
+ repo_type="space",
166
+ folder_path="src/",
167
+ token=oauth_token.token,
168
+ )
169
+ api.add_space_secret(
170
+ repo_id=repo_id,
171
+ key="HF_TOKEN",
172
+ value=oauth_token.token,
173
+ token=oauth_token.token,
174
+ )
175
  api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name, token=oauth_token.token)
176
  api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name, token=oauth_token.token)
177
  except Exception as e:
 
208
 
209
 
210
  with gr.Blocks(theme=gr.themes.Default()) as app:
 
 
211
  session_state = gr.State()
212
 
213
  gr.Markdown(project_description)
 
217
  with gr.Row():
218
  with gr.Accordion("Hugging Face Settings"):
219
  login_btn = gr.LoginButton()
220
+ hf_org_dropdown = gr.Dropdown(choices=[], label="Organization", allow_custom_value=True)
221
+ app.load(update_hf_org_dropdown, inputs=None, outputs=hf_org_dropdown)
 
 
 
 
222
 
223
  hf_dataset_name = gr.Textbox(
224
  label="Dataset name",
 
236
  file_input.upload(
237
  save_files,
238
  inputs=[session_state, file_input],
239
+ outputs=output,
240
  )
241
+ delete_button = gr.Button("Delete Uploaded Files", visible=False)
242
 
243
  preview_button = gr.Button("Generate New Config", interactive=False)
244
  log_message = gr.Textbox(label="Log Message", visible=True)
245
+ download_button = gr.File(label="Download Config", visible=False, interactive=False)
246
+
247
+ file_input.change(
248
+ lambda files: gr.update(visible=bool(files)),
249
+ inputs=file_input,
250
+ outputs=delete_button,
251
  )
252
 
253
  file_input.change(enable_button, inputs=file_input, outputs=preview_button)
254
 
255
+ def clean_and_confirm(uid):
256
+ MANAGERS.clean_workdir(uid)
257
+ return (
258
+ "Deleted all uploaded files.",
259
+ gr.update(value=None),
260
+ gr.update(interactive=False),
261
+ )
262
+
263
+ delete_button.click(
264
+ clean_and_confirm,
265
+ inputs=session_state,
266
+ outputs=[output, file_input, preview_button],
267
+ )
268
+
269
  preview_button.click(
270
  generate_and_return,
271
  inputs=[hf_org_dropdown, hf_dataset_name, session_state],
 
276
  inputs=None,
277
  outputs=tabs,
278
  )
279
+
280
  with gr.Tab("Run Generation", id=1):
281
+ with gr.Column():
282
+ with gr.Row():
283
+ start_button = gr.Button("Start Task")
284
+ stop_button = gr.Button("Stop Task")
285
+ kill_button = gr.Button("Kill Task")
286
 
287
+ start_button.click(prepare_task, inputs=[session_state, login_btn, hf_dataset_name])
288
  stop_button.click(MANAGERS.stop_process, inputs=session_state)
 
 
289
  kill_button.click(MANAGERS.kill_process, inputs=session_state)
290
 
291
+ process_status = gr.Checkbox(label="Process Status", interactive=False)
292
+ status_timer = gr.Timer(2.0, active=True)
293
+ status_timer.tick(update_process_status, inputs=session_state, outputs=process_status)
294
 
295
+ with gr.Row():
 
 
 
 
 
 
 
 
 
296
  with gr.Accordion("Stages", open=True):
297
  stages_table = gr.CheckboxGroup(
298
+ choices=map_stage_names(STAGES),
299
  value=[],
300
  label="Pipeline Stages Completed",
301
+ container=False,
302
  interactive=False,
303
  )
304
 
305
+ with gr.Row():
306
+ with gr.Column(scale=2):
307
+ with gr.Accordion("Ingestion Preview"):
308
+ ingestion_df = gr.DataFrame()
309
+
310
+ with gr.Accordion("Summarization Preview"):
311
+ summarization_df = gr.DataFrame()
312
+
313
+ with gr.Accordion("Single Shot Preview"):
314
+ single_shot_df = gr.DataFrame()
315
+
316
+ with gr.Accordion("Multi Hop Preview"):
317
+ multi_hop_df = gr.DataFrame()
318
+
319
+ with gr.Accordion("Lighteval Preview"):
320
+ lighteval_df = gr.DataFrame()
321
+
322
+ stages_table.change(
323
+ update_dataset,
324
+ inputs=[stages_table, hf_org_dropdown, hf_dataset_name],
325
+ outputs=[ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df],
326
+ )
327
+
328
+ with gr.Accordion("Log Output", open=False):
329
+ log_output = gr.Code(language=None, lines=20, interactive=False)
330
+
331
+ # TODO: this timer should only be active when the second tab is passed to active for the first time
332
+ log_timer = gr.Timer(1.0, active=True)
333
+ log_timer.tick(
334
+ MANAGERS.read_and_get_output,
335
+ inputs=session_state,
336
+ outputs=[log_output, stages_table],
337
+ )
338
 
 
 
 
 
 
339
  with gr.Tab("Evaluate", id=2, visible=False):
340
  with gr.Row():
341
  btn_launch_evals = gr.Button("Launch evaluations")
342
  status = gr.Textbox(label="Status")
 
343
  btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name], status)
344
 
345
  app.load(init_session, outputs=session_state)
346
 
347
+ app.launch(allowed_paths=["/home/user/app"])
 
yourbench_space/config.py CHANGED
@@ -7,13 +7,14 @@ def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
7
  return {
8
  "hf_configuration": {
9
  "token": "$HF_TOKEN",
10
- "private": True,
11
  "hf_organization": hf_org,
 
12
  "hf_dataset_name": hf_dataset_name,
 
13
  },
14
  "model_list": [
15
  {
16
- "model_name": "meta-llama/Llama-3.3-70B-Instruct",
17
  "provider": "novita",
18
  "max_concurrent_requests": 32,
19
  },
@@ -21,63 +22,59 @@ def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
21
  "model_name": "Qwen/Qwen2.5-72B-Instruct",
22
  "provider": "novita",
23
  "max_concurrent_requests": 32,
24
- }
25
  ],
26
  "model_roles": {
27
- "ingestion": ["meta-llama/Llama-3.3-70B-Instruct"],
28
  "summarization": ["Qwen/Qwen2.5-72B-Instruct"],
29
- "single_shot_question_generation": ["meta-llama/Llama-3.3-70B-Instruct"],
30
- "multi_hop_question_generation": ["meta-llama/Llama-3.3-70B-Instruct"],
31
- "answer_generation": ["Qwen/Qwen2.5-72B-Instruct"],
32
- "judge_answers": ["meta-llama/Llama-3.3-70B-Instruct"],
33
  },
34
  "pipeline": {
35
  "ingestion": {
36
- "source_documents_dir": f"/app/{session_uid}/uploaded_files/",
37
- "output_dir": f"/app/{session_uid}/ingested",
38
  "run": True,
39
  },
40
  "upload_ingest_to_hub": {
41
- "source_documents_dir": f"/app/{session_uid}/ingested",
 
 
 
42
  "run": True,
43
  },
44
- "summarization": {"run": True},
45
  "chunking": {
 
46
  "chunking_configuration": {
47
  "l_min_tokens": 64,
48
  "l_max_tokens": 128,
49
- "tau_threshold": 0.3,
50
  "h_min": 2,
51
- "h_max": 4,
 
52
  },
53
- "run": True,
54
  },
55
  "single_shot_question_generation": {
56
- "diversification_seed": "24 year old adult",
57
  "run": True,
 
 
 
 
 
 
58
  },
59
- "multi_hop_question_generation": {"run": False},
60
- "answer_generation": {
61
- "question_type": "single_shot",
62
  "run": True,
63
- "strategies": [
64
- {
65
- "name": "zeroshot",
66
- "prompt": "ZEROSHOT_QA_USER_PROMPT",
67
- "model_name": "meta-llama/Llama-3.3-70B-Instruct",
68
- },
69
- {
70
- "name": "gold",
71
- "prompt": "GOLD_QA_USER_PROMPT",
72
- "model_name": "meta-llama/Llama-3.3-70B-Instruct",
73
- },
74
- ],
75
  },
76
- "judge_answers": {
77
- "run": False, # to change when fixed
78
- "comparing_strategies": [["zeroshot", "gold"]],
79
- "chunk_column_index": 0,
80
- "random_seed": 42,
81
  },
82
  },
83
  }
@@ -97,4 +94,3 @@ def generate_and_save_config(hf_org: str, hf_name: str, session_uid: str, config
97
  file_path = save_yaml_file(config, config_path)
98
  logger.success(f"Config saved at: {file_path}")
99
  return file_path
100
-
 
7
  return {
8
  "hf_configuration": {
9
  "token": "$HF_TOKEN",
 
10
  "hf_organization": hf_org,
11
+ "private": True,
12
  "hf_dataset_name": hf_dataset_name,
13
+ "concat_if_exist": False,
14
  },
15
  "model_list": [
16
  {
17
+ "model_name": "Qwen/Qwen2.5-VL-72B-Instruct",
18
  "provider": "novita",
19
  "max_concurrent_requests": 32,
20
  },
 
22
  "model_name": "Qwen/Qwen2.5-72B-Instruct",
23
  "provider": "novita",
24
  "max_concurrent_requests": 32,
25
+ },
26
  ],
27
  "model_roles": {
28
+ "ingestion": ["Qwen/Qwen2.5-VL-72B-Instruct"],
29
  "summarization": ["Qwen/Qwen2.5-72B-Instruct"],
30
+ "chunking": ["intfloat/multilingual-e5-large-instruct"],
31
+ "single_shot_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
32
+ "multi_hop_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
 
33
  },
34
  "pipeline": {
35
  "ingestion": {
36
+ "source_documents_dir": f"/home/user/app/{session_uid}/uploaded_files/",
37
+ "output_dir": f"/home/user/app/{session_uid}/ingested",
38
  "run": True,
39
  },
40
  "upload_ingest_to_hub": {
41
+ "source_documents_dir": f"/home/user/app/{session_uid}/ingested",
42
+ "run": True,
43
+ },
44
+ "summarization": {
45
  "run": True,
46
  },
 
47
  "chunking": {
48
+ "run": True,
49
  "chunking_configuration": {
50
  "l_min_tokens": 64,
51
  "l_max_tokens": 128,
52
+ "tau_threshold": 0.8,
53
  "h_min": 2,
54
+ "h_max": 5,
55
+ "num_multihops_factor": 2,
56
  },
 
57
  },
58
  "single_shot_question_generation": {
 
59
  "run": True,
60
+ "additional_instructions": "Generate questions to test a curious adult",
61
+ "chunk_sampling": {
62
+ "mode": "count",
63
+ "value": 5,
64
+ "random_seed": 123,
65
+ },
66
  },
67
+ "multi_hop_question_generation": {
 
 
68
  "run": True,
69
+ "additional_instructions": "Generate questions to test a curious adult",
70
+ "chunk_sampling": {
71
+ "mode": "percentage",
72
+ "value": 0.3,
73
+ "random_seed": 42,
74
+ },
 
 
 
 
 
 
75
  },
76
+ "lighteval": {
77
+ "run": True,
 
 
 
78
  },
79
  },
80
  }
 
94
  file_path = save_yaml_file(config, config_path)
95
  logger.success(f"Config saved at: {file_path}")
96
  return file_path
 
yourbench_space/evaluation.py CHANGED
@@ -1,12 +1,17 @@
1
- import asyncio, os
 
 
2
  from yourbench_space.leaderboard_space.env import INIT_MODELS
3
 
4
- ON_SPACES=os.environ.get("system") == "spaces"
 
5
  OUTPUT_DIR = "/data" if ON_SPACES else "."
6
 
 
7
  def create_eval_file(eval_ds_name):
8
  # TODO: replace by Nathan's call
9
- content = """
 
10
  from aenum import extend_enum
11
 
12
  from lighteval.metrics.metrics import Metrics
@@ -31,10 +36,11 @@ def prompt_function(line, task_name: str = None):
31
  gold_index=0,
32
  specific={"question": line["question"]},
33
  )
34
- """ + f"""
 
35
 
36
  hle = LightevalTaskConfig(
37
- name="{eval_ds_name.replace('/', '_')}",
38
  suite=["custom"],
39
  prompt_function=prompt_function,
40
  hf_repo="{eval_ds_name}",
@@ -52,38 +58,46 @@ hle = LightevalTaskConfig(
52
 
53
 
54
  TASKS_TABLE = [hle]
55
- """
56
-
 
57
  with open(f"{OUTPUT_DIR}/custom_task.py", "w") as f:
58
  f.write(content)
59
 
 
60
  async def run_process(args: list) -> dict:
61
  process = await asyncio.create_subprocess_exec(
62
- *args,
63
- stdout=asyncio.subprocess.PIPE,
64
- stderr=asyncio.subprocess.PIPE
65
  )
66
  await asyncio.wait_for(process.wait(), timeout=180)
67
  stdout = await process.stdout.read()
68
  stderr = await process.stderr.read()
69
- return {
70
- 'pid': process.pid,
71
- 'stdout': stdout.decode(),
72
- 'stderr': stderr.decode()
73
- }
74
 
75
  async def run_evaluations(eval_ds_name: str, org: str) -> list:
76
  tasks = []
77
  for model_name, provider in INIT_MODELS:
78
  args = [
79
- "lighteval",
80
- "endpoint", "inference-providers", f"model={model_name},provider={provider}",
81
- f"custom|{eval_ds_name.replace('/', '_')}|0|0", "--custom-tasks", f"{OUTPUT_DIR}/custom_task.py", "--max-samples", "10",
82
- "--output-dir", f"{OUTPUT_DIR}", "--save-details", "--results-org", org, "--push-to-hub"
 
 
 
 
 
 
 
 
 
 
 
83
  ]
84
  tasks.append(run_process(args))
85
  # Will capture the task if failed
86
  processes = await asyncio.gather(*tasks, return_exceptions=True)
87
  if all(not isinstance(result, Exception) for result in processes):
88
  return "✅"
89
- return "At least one model failed"
 
1
+ import os
2
+ import asyncio
3
+
4
  from yourbench_space.leaderboard_space.env import INIT_MODELS
5
 
6
+
7
+ ON_SPACES = os.environ.get("system") == "spaces"
8
  OUTPUT_DIR = "/data" if ON_SPACES else "."
9
 
10
+
11
  def create_eval_file(eval_ds_name):
12
  # TODO: replace by Nathan's call
13
+ content = (
14
+ """
15
  from aenum import extend_enum
16
 
17
  from lighteval.metrics.metrics import Metrics
 
36
  gold_index=0,
37
  specific={"question": line["question"]},
38
  )
39
+ """
40
+ + f"""
41
 
42
  hle = LightevalTaskConfig(
43
+ name="{eval_ds_name.replace("/", "_")}",
44
  suite=["custom"],
45
  prompt_function=prompt_function,
46
  hf_repo="{eval_ds_name}",
 
58
 
59
 
60
  TASKS_TABLE = [hle]
61
+ """
62
+ )
63
+
64
  with open(f"{OUTPUT_DIR}/custom_task.py", "w") as f:
65
  f.write(content)
66
 
67
+
68
  async def run_process(args: list) -> dict:
69
  process = await asyncio.create_subprocess_exec(
70
+ *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
 
 
71
  )
72
  await asyncio.wait_for(process.wait(), timeout=180)
73
  stdout = await process.stdout.read()
74
  stderr = await process.stderr.read()
75
+ return {"pid": process.pid, "stdout": stdout.decode(), "stderr": stderr.decode()}
76
+
 
 
 
77
 
78
  async def run_evaluations(eval_ds_name: str, org: str) -> list:
79
  tasks = []
80
  for model_name, provider in INIT_MODELS:
81
  args = [
82
+ "lighteval",
83
+ "endpoint",
84
+ "inference-providers",
85
+ f"model={model_name},provider={provider}",
86
+ f"custom|{eval_ds_name.replace('/', '_')}|0|0",
87
+ "--custom-tasks",
88
+ f"{OUTPUT_DIR}/custom_task.py",
89
+ "--max-samples",
90
+ "10",
91
+ "--output-dir",
92
+ f"{OUTPUT_DIR}",
93
+ "--save-details",
94
+ "--results-org",
95
+ org,
96
+ "--push-to-hub",
97
  ]
98
  tasks.append(run_process(args))
99
  # Will capture the task if failed
100
  processes = await asyncio.gather(*tasks, return_exceptions=True)
101
  if all(not isinstance(result, Exception) for result in processes):
102
  return "✅"
103
+ return "At least one model failed"
yourbench_space/utils.py CHANGED
@@ -1,15 +1,17 @@
1
  import io
2
  import os
3
  import re
4
- import pathlib
5
  import shutil
 
6
  import subprocess
7
- import gradio as gr
 
8
  import pandas as pd
9
- from collections import defaultdict
10
- from datasets import load_dataset
11
  from loguru import logger
12
- from typing import List, Union, Optional
 
 
 
13
 
14
  STAGES = [
15
  "ingestion",
@@ -17,12 +19,25 @@ STAGES = [
17
  "summarization",
18
  "chunking",
19
  "single_shot_question_generation",
20
- "answer_generation",
21
- #"evaluate_models",
22
- #"create_leaderboard"
23
- # "judge_answers", # to uncomment when fixed
24
  ]
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def is_running_locally() -> bool:
27
  """
28
  Returns True if Gradio is running locally, False if it's running in a Hugging Face Space.
@@ -33,7 +48,7 @@ def is_running_locally() -> bool:
33
  def save_files(oauth_token: gr.OAuthToken | None, session_state: gr.State, files: List[pathlib.Path]) -> str:
34
  """Save uploaded files to the UPLOAD_DIRECTORY/uuid safely"""
35
  if oauth_token is None and not is_running_locally():
36
- gr.Warning('You need to log in to use this Space')
37
  return
38
 
39
  saved_paths = []
@@ -41,7 +56,7 @@ def save_files(oauth_token: gr.OAuthToken | None, session_state: gr.State, files
41
  for file in [file.name for file in files]:
42
  try:
43
  source_path = pathlib.Path(file)
44
- upload_directory_uuid = pathlib.Path(f"/app/{session_state.value}/uploaded_files")
45
  # Ensure the upload directory exists
46
  upload_directory_uuid.mkdir(parents=True, exist_ok=True)
47
  destination_path = upload_directory_uuid / source_path.name
@@ -56,11 +71,8 @@ def save_files(oauth_token: gr.OAuthToken | None, session_state: gr.State, files
56
  except Exception as e:
57
  print(f"Error moving file {file}: {e}")
58
 
59
- return (
60
- f"Files saved to: {', '.join(saved_paths)}"
61
- if saved_paths
62
- else "No files were saved"
63
- )
64
 
65
  def update_dataset(stages: list, hf_org: str, hf_prefix: str, oauth_token: gr.OAuthToken):
66
  """
@@ -68,31 +80,57 @@ def update_dataset(stages: list, hf_org: str, hf_prefix: str, oauth_token: gr.OA
68
  """
69
  ingestion_df = pd.DataFrame()
70
  summarization_df = pd.DataFrame()
71
- single_hop_df = pd.DataFrame()
72
- answers_df = pd.DataFrame()
 
73
 
74
  # Construct dataset name from config
75
  dataset_name = f"{hf_org}/{hf_prefix}"
76
 
77
- if "ingestion" in stages:
78
- # TODO: why is the key "ingested" and not "ingestion"? (does not match the other splits)
79
- ingestion_ds = load_dataset(dataset_name, name="ingested", split="train", streaming=True, token=oauth_token.token).select_columns("document_text")
80
- ingestion_df = pd.DataFrame([next(iter(ingestion_ds)) for _ in range(1)]) # only one row
81
- if "summarization" in stages:
82
- summarization_ds = load_dataset(dataset_name, name="summarization", split="train", streaming=True, token=oauth_token.token).select_columns(['raw_document_summary', 'document_summary', 'summarization_model'])
83
- summarization_df = pd.DataFrame([next(iter(summarization_ds)) for _ in range(1)])
84
- if "single_shot_question_generation" in stages:
85
- single_hop_ds = load_dataset(dataset_name, name="single_shot_question_generation", split="train", streaming=True, token=oauth_token.token)
86
- single_hop_df = pd.DataFrame([next(iter(single_hop_ds)) for _ in range(5)])
87
- if "answer_generation" in stages:
88
- answers_ds = load_dataset(dataset_name, name="answer_generation", split="train", streaming=True, token=oauth_token.token)
89
- answers_df = pd.DataFrame([next(iter(answers_ds)) for _ in range(5)])
90
-
91
- return (ingestion_df, summarization_df, single_hop_df, answers_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
 
94
  class SubprocessManagerGroup:
95
  """Instanciates one manager per user (should be used as a singleton class)"""
 
96
  def __init__(self):
97
  self.managers: dict[str, SubprocessManager] = {}
98
 
@@ -115,8 +153,15 @@ class SubprocessManagerGroup:
115
  uid = SubprocessManagerGroup.grab_uuid(uid)
116
  if manager := self.managers.get(uid):
117
  manager.stop_process()
 
 
118
  del self.managers[uid]
119
 
 
 
 
 
 
120
  def start_process(self, uid: Union[str, gr.State], custom_env: dict | None):
121
  uid = SubprocessManagerGroup.grab_uuid(uid)
122
  self.managers[uid].start_process(custom_env=custom_env)
@@ -141,13 +186,14 @@ class SubprocessManagerGroup:
141
  return manager.is_running()
142
  return False
143
 
 
144
  class SubprocessManager:
145
  def __init__(self, session_uid: str):
146
  self.session_uid = session_uid
147
- self.path = pathlib.Path(f"/app/{session_uid}")
148
  self.path.mkdir(parents=True, exist_ok=True)
149
  self.config_path = pathlib.Path(f"{self.path}/config.yml")
150
- self.command = ["uv", "run", "yourbench", f"--config", str(self.config_path)]
151
  self.process = None
152
  self.output_stream = io.StringIO()
153
  self.exit_code = None
@@ -160,7 +206,7 @@ class SubprocessManager:
160
 
161
  self.output_stream = io.StringIO()
162
  self.exit_code = None
163
-
164
  try:
165
  logger.info(f"Starting process with command: {' '.join(self.command)}")
166
  self.process = subprocess.Popen(
@@ -195,9 +241,12 @@ class SubprocessManager:
195
  pass
196
 
197
  current_output = self.output_stream.getvalue()
198
- completed_stages = list(set(re.findall(r"Successfully completed stage: (\w+)", current_output)))
 
 
199
 
200
- return current_output, completed_stages
 
201
 
202
  def stop_process(self):
203
  """Terminate the subprocess."""
@@ -207,7 +256,7 @@ class SubprocessManager:
207
  logger.info("Sending SIGTERM to the Process")
208
  try:
209
  self.process.terminate()
210
- self.exit_code = self.process.wait(timeout=5) # Wait up to 5 seconds for process to terminate
211
  logger.info(f"Process terminated by user with exit code {self.exit_code}")
212
  except subprocess.TimeoutExpired:
213
  logger.warning("Process did not terminate within timeout, sending SIGKILL")
@@ -221,7 +270,7 @@ class SubprocessManager:
221
  logger.info("Sending SIGKILL to the Process")
222
  try:
223
  self.process.kill()
224
- self.exit_code = self.process.wait(timeout=5) # Wait up to 5 seconds for process to be killed
225
  logger.info(f"Process killed by user with exit code {self.exit_code}")
226
  except subprocess.TimeoutExpired:
227
  logger.error("Process could not be killed within timeout")
@@ -237,11 +286,11 @@ class SubprocessManager:
237
  """Return exit code and reason if process has terminated"""
238
  if self.process is None:
239
  return None, "Process was never started"
240
-
241
  if self.is_running():
242
  return None, "Process is still running"
243
-
244
- if not self.exit_code is None and self.exit_code != 0 :
245
  return self.exit_code, "Process exited abnormaly"
246
 
247
  return self.exit_code, "Process exited normaly"
@@ -250,3 +299,5 @@ class SubprocessManager:
250
  """Stop the process when object is deleted"""
251
  if self.process:
252
  self.process.kill()
 
 
 
1
  import io
2
  import os
3
  import re
 
4
  import shutil
5
+ import pathlib
6
  import subprocess
7
+ from typing import List, Union, Optional
8
+
9
  import pandas as pd
 
 
10
  from loguru import logger
11
+
12
+ import gradio as gr
13
+ from datasets import load_dataset
14
+
15
 
16
  STAGES = [
17
  "ingestion",
 
19
  "summarization",
20
  "chunking",
21
  "single_shot_question_generation",
22
+ "multi_hop_question_generation",
23
+ "lighteval",
 
 
24
  ]
25
 
26
# Maps internal pipeline stage ids to the labels shown in the Gradio UI.
STAGE_DISPLAY_MAP = {
    "ingestion": "Process Input Docs",
    "upload_ingest_to_hub": "Upload Dataset to Hub",
    "summarization": "Summarize Documents",
    "chunking": "Chunk Documents",
    "single_shot_question_generation": "Generate Single Shot Questions",
    "multi_hop_question_generation": "Generate Multi Hop Questions",
    "lighteval": "Generate Lighteval Subset",
}


def map_stage_names(stages: list[str]) -> list[str]:
    """Translate internal stage ids into their UI display labels.

    Stage ids without an entry in STAGE_DISPLAY_MAP pass through unchanged.
    """
    display_names = []
    for stage_id in stages:
        display_names.append(STAGE_DISPLAY_MAP.get(stage_id, stage_id))
    return display_names
39
+
40
+
41
  def is_running_locally() -> bool:
42
  """
43
  Returns True if Gradio is running locally, False if it's running in a Hugging Face Space.
 
48
  def save_files(oauth_token: gr.OAuthToken | None, session_state: gr.State, files: List[pathlib.Path]) -> str:
49
  """Save uploaded files to the UPLOAD_DIRECTORY/uuid safely"""
50
  if oauth_token is None and not is_running_locally():
51
+ gr.Warning("You need to log in to use this Space")
52
  return
53
 
54
  saved_paths = []
 
56
  for file in [file.name for file in files]:
57
  try:
58
  source_path = pathlib.Path(file)
59
+ upload_directory_uuid = pathlib.Path(f"/home/user/app/{session_state.value}/uploaded_files")
60
  # Ensure the upload directory exists
61
  upload_directory_uuid.mkdir(parents=True, exist_ok=True)
62
  destination_path = upload_directory_uuid / source_path.name
 
71
  except Exception as e:
72
  print(f"Error moving file {file}: {e}")
73
 
74
+ return f"Files saved to: {', '.join(saved_paths)}" if saved_paths else "No files were saved"
75
+
 
 
 
76
 
77
def update_dataset(stages: list, hf_org: str, hf_prefix: str, oauth_token: "gr.OAuthToken"):
    """Build preview DataFrames for every pipeline stage that has completed.

    Args:
        stages: Display names of completed stages (values of STAGE_DISPLAY_MAP).
        hf_org: Hugging Face organization that owns the generated dataset.
        hf_prefix: Dataset name, combined with hf_org as "org/prefix".
        oauth_token: Gradio OAuth token used to read the dataset; may be None
            when running locally without authentication (see save_files).

    Returns:
        A 5-tuple of DataFrames:
        (ingestion, summarization, single_shot, multi_hop, lighteval).
        Stages that have not completed yield an empty DataFrame.
    """
    ingestion_df = pd.DataFrame()
    summarization_df = pd.DataFrame()
    single_shot_df = pd.DataFrame()
    multi_hop_df = pd.DataFrame()
    lighteval_df = pd.DataFrame()

    # Nothing completed yet: skip all remote reads.
    if not stages:
        return (ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df)

    # Construct dataset name from config
    dataset_name = f"{hf_org}/{hf_prefix}"
    # Fix: the token may legitimately be absent on local runs; the original
    # `oauth_token.token` raised AttributeError in that case.
    token = oauth_token.token if oauth_token is not None else None

    def _take_rows(config_name: str, columns, num_rows: int) -> pd.DataFrame:
        # Stream the subset so only `num_rows` rows are downloaded.
        ds = load_dataset(
            dataset_name, name=config_name, split="train", streaming=True, token=token
        ).select_columns(columns)
        return pd.DataFrame(ds.take(num_rows))

    if STAGE_DISPLAY_MAP["upload_ingest_to_hub"] in stages:
        ingestion_df = _take_rows("ingested", "document_text", 1)

    if STAGE_DISPLAY_MAP["summarization"] in stages:
        summarization_df = _take_rows(
            "summarized",
            ["raw_document_summary", "document_summary", "summarization_model"],
            5,
        )

    if STAGE_DISPLAY_MAP["single_shot_question_generation"] in stages:
        single_shot_df = _take_rows(
            "single_shot_questions",
            ["question", "self_answer", "estimated_difficulty"],
            5,
        )

    if STAGE_DISPLAY_MAP["multi_hop_question_generation"] in stages:
        multi_hop_df = _take_rows(
            "multi_hop_questions",
            ["question", "self_answer", "estimated_difficulty"],
            5,
        )

    if STAGE_DISPLAY_MAP["lighteval"] in stages:
        lighteval_df = _take_rows(
            "lighteval",
            ["question", "ground_truth_answer", "question_category", "kind"],
            5,
        )

    return (ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df)
129
 
130
 
131
  class SubprocessManagerGroup:
132
  """Instanciates one manager per user (should be used as a singleton class)"""
133
+
134
    def __init__(self):
        """Initialize the group with an empty per-session manager registry."""
        # Maps session uuid -> the SubprocessManager owned by that session.
        self.managers: dict[str, SubprocessManager] = {}
136
 
 
153
  uid = SubprocessManagerGroup.grab_uuid(uid)
154
  if manager := self.managers.get(uid):
155
  manager.stop_process()
156
+ manager.clean_workdir()
157
+
158
  del self.managers[uid]
159
 
160
+ def clean_workdir(self, uid: Union[str, gr.State]):
161
+ uid = SubprocessManagerGroup.grab_uuid(uid)
162
+ if manager := self.managers.get(uid):
163
+ manager.clean_workdir()
164
+
165
  def start_process(self, uid: Union[str, gr.State], custom_env: dict | None):
166
  uid = SubprocessManagerGroup.grab_uuid(uid)
167
  self.managers[uid].start_process(custom_env=custom_env)
 
186
  return manager.is_running()
187
  return False
188
 
189
+
190
  class SubprocessManager:
191
  def __init__(self, session_uid: str):
192
  self.session_uid = session_uid
193
+ self.path = pathlib.Path(f"/home/user/app/{session_uid}")
194
  self.path.mkdir(parents=True, exist_ok=True)
195
  self.config_path = pathlib.Path(f"{self.path}/config.yml")
196
+ self.command = ["uv", "run", "yourbench", "run", "--config", str(self.config_path)]
197
  self.process = None
198
  self.output_stream = io.StringIO()
199
  self.exit_code = None
 
206
 
207
  self.output_stream = io.StringIO()
208
  self.exit_code = None
209
+
210
  try:
211
  logger.info(f"Starting process with command: {' '.join(self.command)}")
212
  self.process = subprocess.Popen(
 
241
  pass
242
 
243
  current_output = self.output_stream.getvalue()
244
+ completed_stages = list(set(re.findall(r"Completed stage: '([^']*)'", current_output)))
245
+
246
+ return current_output, map_stage_names(completed_stages)
247
 
248
    def clean_workdir(self):
        """Delete this session's working directory tree (best-effort).

        ignore_errors covers the case where the directory is already gone
        or only partially removable.
        """
        shutil.rmtree(self.path, ignore_errors=True)
250
 
251
  def stop_process(self):
252
  """Terminate the subprocess."""
 
256
  logger.info("Sending SIGTERM to the Process")
257
  try:
258
  self.process.terminate()
259
+ self.exit_code = self.process.wait(timeout=5) # Wait up to 5 seconds for process to terminate
260
  logger.info(f"Process terminated by user with exit code {self.exit_code}")
261
  except subprocess.TimeoutExpired:
262
  logger.warning("Process did not terminate within timeout, sending SIGKILL")
 
270
  logger.info("Sending SIGKILL to the Process")
271
  try:
272
  self.process.kill()
273
+ self.exit_code = self.process.wait(timeout=5) # Wait up to 5 seconds for process to be killed
274
  logger.info(f"Process killed by user with exit code {self.exit_code}")
275
  except subprocess.TimeoutExpired:
276
  logger.error("Process could not be killed within timeout")
 
286
  """Return exit code and reason if process has terminated"""
287
  if self.process is None:
288
  return None, "Process was never started"
289
+
290
  if self.is_running():
291
  return None, "Process is still running"
292
+
293
+ if self.exit_code is not None and self.exit_code != 0:
294
  return self.exit_code, "Process exited abnormaly"
295
 
296
  return self.exit_code, "Process exited normaly"
 
299
  """Stop the process when object is deleted"""
300
  if self.process:
301
  self.process.kill()
302
+
303
+ self.clean_workdir()