Clémentine committed on
Commit 3adea5e · 1 Parent(s): 1d9fcdf
pyproject.toml CHANGED
@@ -18,6 +18,7 @@ dependencies = [
     "python-dotenv>=1.0.1",
     "tqdm>=4.67.1",
     "ruff>=0.11.2",
+    "lighteval @ git+https://github.com/huggingface/[email protected]",
 ]
 
 [build-system]
yourbench_space/__init__.py CHANGED
@@ -0,0 +1,2 @@
+import os
+PATH = "/home/user/app" if os.environ.get("system") == "spaces" else "app"
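
For reference, a minimal sketch of how this new PATH constant is meant to be consumed by the rest of the Space; the session id below is a placeholder, not part of the commit.

from yourbench_space import PATH

# On a Space the runtime exports system=spaces, so PATH resolves to the absolute
# /home/user/app directory; locally it falls back to the relative "app" directory.
session_uid = "1234-abcd"  # placeholder session id
upload_dir = f"{PATH}/{session_uid}/uploaded_files"
print(upload_dir)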
yourbench_space/app.py CHANGED
@@ -10,6 +10,7 @@ from loguru import logger
 import gradio as gr
 from datasets import load_dataset
 from huggingface_hub import whoami
+from yourbench_space import PATH
 from yourbench_space.utils import (
     STAGES,
     SubprocessManagerGroup,
@@ -278,46 +279,49 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
         )
 
     with gr.Tab("Run Generation", id=1):
-        with gr.Column():
-            with gr.Row():
-                start_button = gr.Button("Start Task")
-                stop_button = gr.Button("Stop Task")
-                kill_button = gr.Button("Kill Task")
-
-            start_button.click(prepare_task, inputs=[session_state, login_btn, hf_dataset_name])
-            stop_button.click(MANAGERS.stop_process, inputs=session_state)
-            kill_button.click(MANAGERS.kill_process, inputs=session_state)
-
-            process_status = gr.Checkbox(label="Process Status", interactive=False)
-            status_timer = gr.Timer(2.0, active=True)
-            status_timer.tick(update_process_status, inputs=session_state, outputs=process_status)
-
-            with gr.Row():
-                with gr.Accordion("Stages", open=True):
-                    stages_table = gr.CheckboxGroup(
-                        choices=map_stage_names(STAGES),
-                        value=[],
-                        label="Pipeline Stages Completed",
-                        container=False,
-                        interactive=False,
-                    )
-
-            with gr.Row():
-                with gr.Column(scale=2):
-                    with gr.Accordion("Ingestion Preview"):
-                        ingestion_df = gr.DataFrame()
-
-                    with gr.Accordion("Summarization Preview"):
-                        summarization_df = gr.DataFrame()
-
-                    with gr.Accordion("Single Shot Preview"):
-                        single_shot_df = gr.DataFrame()
-
-                    with gr.Accordion("Multi Hop Preview"):
-                        multi_hop_df = gr.DataFrame()
-
-                    with gr.Accordion("Lighteval Preview"):
-                        lighteval_df = gr.DataFrame()
+        with gr.Row():
+            start_button = gr.Button("Start Task")
+            stop_button = gr.Button("Stop Task")
+            kill_button = gr.Button("Kill Task")
+
+        start_button.click(prepare_task, inputs=[session_state, login_btn, hf_dataset_name])
+        stop_button.click(MANAGERS.stop_process, inputs=session_state)
+        kill_button.click(MANAGERS.kill_process, inputs=session_state)
+
+        process_status = gr.Checkbox(label="Process Status", interactive=False)
+        status_timer = gr.Timer(2.0, active=True)
+        status_timer.tick(update_process_status, inputs=session_state, outputs=process_status)
+
+        with gr.Row():
+            with gr.Accordion("Stages", open=True):
+                stages_table = gr.CheckboxGroup(
+                    choices=map_stage_names(STAGES),
+                    value=[],
+                    label="Pipeline Stages Completed",
+                    container=False,
+                    interactive=False,
+                )
+
+        with gr.Row():
+            with gr.Column():
+                with gr.Accordion("Log Output", open=True):
+                    log_output = gr.Code(language=None, lines=20, interactive=False)
+
+            with gr.Column():
+                with gr.Accordion("Ingestion Preview"):
+                    ingestion_df = gr.DataFrame()
+
+                with gr.Accordion("Summarization Preview"):
+                    summarization_df = gr.DataFrame()
+
+                with gr.Accordion("Single Shot Preview"):
+                    single_shot_df = gr.DataFrame()
+
+                with gr.Accordion("Multi Hop Preview"):
+                    multi_hop_df = gr.DataFrame()
+
+                with gr.Accordion("Lighteval Preview"):
+                    lighteval_df = gr.DataFrame()
 
         stages_table.change(
             update_dataset,
@@ -325,8 +329,6 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
             outputs=[ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df],
         )
 
-        with gr.Accordion("Log Output", open=False):
-            log_output = gr.Code(language=None, lines=20, interactive=False)
 
         # TODO: this timer should only be active when the second tab is passed to active for the first time
         log_timer = gr.Timer(1.0, active=True)
@@ -336,7 +338,7 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
             outputs=[log_output, stages_table],
         )
 
-    with gr.Tab("Evaluate", id=2, visible=False):
+    with gr.Tab("Evaluate", id=2):
         with gr.Row():
             btn_launch_evals = gr.Button("Launch evaluations")
             status = gr.Textbox(label="Status")
@@ -344,4 +346,4 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
 
     app.load(init_session, outputs=session_state)
 
-app.launch(allowed_paths=["/home/user/app"])
+app.launch(allowed_paths=[PATH])
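
A small self-contained sketch of the Timer polling pattern this tab relies on; the callback here is a stand-in for update_process_status, and the layout is reduced to the bare minimum.

import gradio as gr

def poll_status() -> bool:
    # Stand-in for update_process_status(session_state): report whether the task is still running.
    return True

with gr.Blocks() as demo:
    process_status = gr.Checkbox(label="Process Status", interactive=False)
    status_timer = gr.Timer(2.0, active=True)  # fires every 2 seconds
    status_timer.tick(poll_status, outputs=process_status)

demo.launch()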
yourbench_space/config.py CHANGED
@@ -1,6 +1,8 @@
 import yaml
 from loguru import logger
 
+from yourbench_space import PATH
+
 
 def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
     """Creates the base config dictionary"""
@@ -33,12 +35,12 @@ def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
         },
         "pipeline": {
             "ingestion": {
-                "source_documents_dir": f"/home/user/app/{session_uid}/uploaded_files/",
-                "output_dir": f"/home/user/app/{session_uid}/ingested",
+                "source_documents_dir": f"{PATH}/{session_uid}/uploaded_files/",
+                "output_dir": f"{PATH}/{session_uid}/ingested",
                 "run": True,
             },
             "upload_ingest_to_hub": {
-                "source_documents_dir": f"/home/user/app/{session_uid}/ingested",
+                "source_documents_dir": f"{PATH}/{session_uid}/ingested",
                 "run": True,
             },
             "summarization": {
yourbench_space/evaluation.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import subprocess
 import asyncio
 
 from yourbench_space.leaderboard_space.env import INIT_MODELS
@@ -8,62 +9,9 @@ ON_SPACES = os.environ.get("system") == "spaces"
 OUTPUT_DIR = "/data" if ON_SPACES else "."
 
 
-def create_eval_file(eval_ds_name):
-    # TODO: replace by Nathan's call
-    content = (
-        """
-from aenum import extend_enum
-
-from lighteval.metrics.metrics import Metrics
-from lighteval.metrics.utils.metric_utils import (
-    CorpusLevelMetricGrouping,
-    MetricCategory,
-    MetricUseCase,
-)
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.extended.hle.main import JudgeLLMHLE
-from lighteval.tasks.requests import Doc
-
-
-def prompt_function(line, task_name: str = None):
-    if line["image"] not in [None, ""]:
-        return
-
-    return Doc(
-        task_name=task_name,
-        query="Question: " + line["question"] + "\\nAnswer:",
-        choices=[line["answer"]],
-        gold_index=0,
-        specific={"question": line["question"]},
-    )
-"""
-        + f"""
-
-hle = LightevalTaskConfig(
-    name="{eval_ds_name.replace("/", "_")}",
-    suite=["custom"],
-    prompt_function=prompt_function,
-    hf_repo="{eval_ds_name}",
-    hf_subset="default",
-    hf_avail_splits=["test"],
-    evaluation_splits=["test"],
-    few_shots_split=None,
-    few_shots_select=None,
-    generation_size=8192,
-    metric=[Metrics.exact_match],
-    stop_sequence=[],
-    trust_dataset=True,
-    version=0,
-)
-
-
-TASKS_TABLE = [hle]
-"""
-    )
-
-    with open(f"{OUTPUT_DIR}/custom_task.py", "w") as f:
-        f.write(content)
-
+def create_eval_file(eval_ds_name: str):
+    task_name = eval_ds_name.replace("/", "_")
+    subprocess.run(["lighteval", "tasks", "create", "examples/custom_tasks_templates/custom_yourbench_task.py", task_name, eval_ds_name])
 
 async def run_process(args: list) -> dict:
     process = await asyncio.create_subprocess_exec(
@@ -76,6 +24,7 @@ async def run_process(args: list) -> dict:
 
 
 async def run_evaluations(eval_ds_name: str, org: str) -> list:
+    task_name = eval_ds_name.replace("/", "_")
     tasks = []
     for model_name, provider in INIT_MODELS:
         args = [
@@ -83,11 +32,11 @@ async def run_evaluations(eval_ds_name: str, org: str) -> list:
             "endpoint",
             "inference-providers",
             f"model={model_name},provider={provider}",
-            f"custom|{eval_ds_name.replace('/', '_')}|0|0",
+            f"custom|{task_name}|0|0",
             "--custom-tasks",
-            f"{OUTPUT_DIR}/custom_task.py",
+            f"custom_{task_name}_task.py",
            "--max-samples",
-            "10",
+            "30",
             "--output-dir",
             f"{OUTPUT_DIR}",
             "--save-details",
yourbench_space/utils.py CHANGED
@@ -12,6 +12,8 @@ from loguru import logger
 import gradio as gr
 from datasets import load_dataset
 
+from yourbench_space import PATH
+
 
 STAGES = [
     "ingestion",
@@ -56,7 +58,7 @@ def save_files(oauth_token: gr.OAuthToken | None, session_state: gr.State, files
     for file in [file.name for file in files]:
         try:
             source_path = pathlib.Path(file)
-            upload_directory_uuid = pathlib.Path(f"/home/user/app/{session_state.value}/uploaded_files")
+            upload_directory_uuid = pathlib.Path(f"{PATH}/{session_state.value}/uploaded_files")
             # Ensure the upload directory exists
             upload_directory_uuid.mkdir(parents=True, exist_ok=True)
             destination_path = upload_directory_uuid / source_path.name
@@ -190,7 +192,7 @@ class SubprocessManagerGroup:
 class SubprocessManager:
     def __init__(self, session_uid: str):
         self.session_uid = session_uid
-        self.path = pathlib.Path(f"/home/user/app/{session_uid}")
+        self.path = pathlib.Path(f"{PATH}/{session_uid}")
         self.path.mkdir(parents=True, exist_ok=True)
         self.config_path = pathlib.Path(f"{self.path}/config.yml")
         self.command = ["uv", "run", "yourbench", "run", "--config", str(self.config_path)]
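
A minimal sketch of the per-session layout these paths produce when running locally (PATH == "app"); the session id is a placeholder.

import pathlib

PATH = "app"               # local value of yourbench_space.PATH
session_uid = "1234-abcd"  # placeholder session id

# Tree rooted at PATH/<session_uid>, as built by save_files and SubprocessManager:
#   app/1234-abcd/config.yml        <- generated pipeline config
#   app/1234-abcd/uploaded_files/   <- user uploads copied here
#   app/1234-abcd/ingested/         <- ingestion output
config_path = pathlib.Path(PATH) / session_uid / "config.yml"

# The generation run itself is launched as:
command = ["uv", "run", "yourbench", "run", "--config", str(config_path)]
print(command)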