Alina Lozovskaya commited on
Commit
ea047ad
·
1 Parent(s): 2617bee

Update Setup and Run Generatation tabs

Browse files
yourbench_space/app.py CHANGED
@@ -1,32 +1,33 @@
1
- import asyncio
2
  import os
3
  import sys
4
  import time
5
- import gradio as gr
6
  import uuid
 
 
7
 
8
- from datasets import load_dataset
9
- from huggingface_hub import whoami
10
  from loguru import logger
11
- from pathlib import Path
12
 
13
- from yourbench_space.config import generate_and_save_config
 
 
14
  from yourbench_space.utils import (
 
15
  SubprocessManagerGroup,
16
  save_files,
17
  update_dataset,
18
- STAGES,
19
- is_running_locally
20
  )
21
- from yourbench_space.evaluation import create_eval_file, run_evaluations
22
- from yourbench_space.leaderboard_space.env import HF_TOKEN
 
23
 
24
  project_description = """
25
- # YourBench 🚀
26
  **Dynamic Benchmark Generation for Language Models**
27
 
28
  Quickly create zero-shot benchmarks from your documents – keeping models accurate and adaptable
29
- - 📖 [FAQ](#)
30
  - 💻 [GitHub](https://github.com/huggingface/yourbench/tree/v0.2-alpha-space)
31
  """
32
 
@@ -35,7 +36,7 @@ logger.add(sys.stderr, level="INFO")
35
 
36
  # Global to store all managers per session
37
  MANAGERS = SubprocessManagerGroup()
38
- USER_ID_SESSION_MAP: dict[str, str] = dict()
39
 
40
 
41
  docs_path = Path(__file__).parent / "docs.md"
@@ -45,30 +46,36 @@ citation_content = (
45
  else "# Citation\n\nDocumentation file not found."
46
  )
47
 
 
48
  def generate_and_return(hf_org, hf_dataset_name, session_state: gr.State):
49
  manager = MANAGERS.get(session_state)
50
- if manager is None: # should not be possible
51
  return (
52
- "❌ Config generation failed.",
53
- gr.update(visible=False, interactive=False),
54
- )
55
-
56
  session_uid = session_state.value
57
  config_path = generate_and_save_config(hf_org, hf_dataset_name, session_uid, manager.config_path)
58
  for _ in range(5):
59
  time.sleep(0.5)
60
  if config_path.exists():
 
61
  return (
62
  "✅ Config saved!",
63
  gr.update(value=str(config_path), visible=True, interactive=True),
64
  )
 
 
65
  return (
66
  "❌ Config generation failed.",
67
  gr.update(visible=False, interactive=False),
68
  )
69
 
 
70
  final_dataset = None
71
 
 
72
  def update_process_status(session_state: gr.State):
73
  """Update process status and include exit details if process has terminated"""
74
  if session_state is None:
@@ -79,17 +86,22 @@ def update_process_status(session_state: gr.State):
79
  return gr.update(value=False, label="Not running")
80
 
81
  is_running = manager.is_running()
82
-
83
  if not is_running:
84
  exit_code, exit_reason = manager.get_exit_details()
85
- status_text = f"Process Status: Stopped - {exit_reason}, exit code - {exit_code}" if exit_reason else "Process Status: Stopped"
 
 
 
 
86
  return gr.update(value=False, label=status_text)
87
-
88
  return gr.update(value=True, label="Process Status: Running")
89
 
 
90
  def prepare_task(session_uid: str, oauth_token: gr.OAuthToken | None, hf_dataset_name: str, _=None):
91
  if oauth_token is None and not is_running_locally():
92
- gr.Warning('You need to log in to use this Space')
93
  return
94
  new_env = os.environ.copy()
95
 
@@ -122,6 +134,7 @@ def switch_to_run_generation_tab():
122
  def enable_button(files):
123
  return gr.update(interactive=bool(files))
124
 
 
125
  def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name):
126
  # Test dataset existence
127
  eval_ds_name = f"{org_name}/{eval_name}"
@@ -136,13 +149,29 @@ def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_na
136
  status = asyncio.run(run_evaluations(eval_ds_name=eval_ds_name, org=org_name))
137
  # Create space
138
  from huggingface_hub import HfApi
 
139
  repo_id = f"{org_name}/leaderboard_yourbench_{eval_ds_name.replace('/', '_')}"
140
  api = HfApi()
141
 
142
  try:
143
- api.create_repo(repo_id=repo_id, repo_type="space", space_sdk="gradio", token=oauth_token.token)
144
- api.upload_folder(repo_id=repo_id, repo_type="space", folder_path="src/", token=oauth_token.token)
145
- api.add_space_secret(repo_id=repo_id, key="HF_TOKEN", value=oauth_token.token, token=oauth_token.token)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name, token=oauth_token.token)
147
  api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name, token=oauth_token.token)
148
  except Exception as e:
@@ -179,8 +208,6 @@ def init_session(profile: gr.OAuthProfile | None):
179
 
180
 
181
  with gr.Blocks(theme=gr.themes.Default()) as app:
182
- # We initialize the session state with the user randomly generated uuid
183
- # Using uuid4 makes collision cases extremely unlikely even for concurrent users
184
  session_state = gr.State()
185
 
186
  gr.Markdown(project_description)
@@ -190,12 +217,8 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
190
  with gr.Row():
191
  with gr.Accordion("Hugging Face Settings"):
192
  login_btn = gr.LoginButton()
193
- hf_org_dropdown = gr.Dropdown(
194
- choices=[], label="Organization", allow_custom_value=True
195
- )
196
- app.load(
197
- update_hf_org_dropdown, inputs=None, outputs=hf_org_dropdown
198
- )
199
 
200
  hf_dataset_name = gr.Textbox(
201
  label="Dataset name",
@@ -213,17 +236,36 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
213
  file_input.upload(
214
  save_files,
215
  inputs=[session_state, file_input],
216
- outputs = output,
217
  )
 
218
 
219
  preview_button = gr.Button("Generate New Config", interactive=False)
220
  log_message = gr.Textbox(label="Log Message", visible=True)
221
- download_button = gr.File(
222
- label="Download Config", visible=False, interactive=False
 
 
 
 
223
  )
224
 
225
  file_input.change(enable_button, inputs=file_input, outputs=preview_button)
226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  preview_button.click(
228
  generate_and_return,
229
  inputs=[hf_org_dropdown, hf_dataset_name, session_state],
@@ -234,66 +276,72 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
234
  inputs=None,
235
  outputs=tabs,
236
  )
237
-
238
  with gr.Tab("Run Generation", id=1):
239
- with gr.Row():
240
- start_button = gr.Button("Start Task")
241
- start_button.click(prepare_task, inputs=[session_state, login_btn, hf_dataset_name])
 
 
242
 
243
- stop_button = gr.Button("Stop Task")
244
  stop_button.click(MANAGERS.stop_process, inputs=session_state)
245
-
246
- kill_button = gr.Button("Kill Task")
247
  kill_button.click(MANAGERS.kill_process, inputs=session_state)
248
 
 
 
 
249
 
250
- with gr.Row():
251
- with gr.Column():
252
- with gr.Accordion("Log Output", open=True):
253
- log_output = gr.Code(language=None, lines=20, interactive=False)
254
-
255
- process_status = gr.Checkbox(label="Process Status", interactive=False)
256
- status_timer = gr.Timer(2.0, active=True)
257
- status_timer.tick(update_process_status, inputs=session_state, outputs=process_status)
258
-
259
- with gr.Column():
260
  with gr.Accordion("Stages", open=True):
261
  stages_table = gr.CheckboxGroup(
262
- choices=STAGES,
263
  value=[],
264
  label="Pipeline Stages Completed",
 
265
  interactive=False,
266
  )
267
 
268
- with gr.Accordion("Ingestion"):
269
- ingestion_df = gr.DataFrame()
270
-
271
- with gr.Accordion("Summarization"):
272
- summarization_df = gr.DataFrame()
273
-
274
- with gr.Accordion("Single-Hop"):
275
- single_hop = gr.DataFrame()
276
-
277
- with gr.Accordion("Answer Generation"):
278
- answers_df = gr.DataFrame()
279
-
280
- stages_table.change(
281
- update_dataset, inputs=[stages_table, hf_org_dropdown, hf_dataset_name], outputs=[ingestion_df, summarization_df, single_hop, answers_df]
282
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
 
284
- # TODO: this timer should only be active when the second tab is passed to active for the first time
285
- log_timer = gr.Timer(1.0, active=True)
286
- log_timer.tick(
287
- MANAGERS.read_and_get_output, inputs=session_state, outputs=[log_output, stages_table]
288
- )
289
  with gr.Tab("Evaluate", id=2, visible=False):
290
  with gr.Row():
291
  btn_launch_evals = gr.Button("Launch evaluations")
292
  status = gr.Textbox(label="Status")
293
-
294
  btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name], status)
295
 
296
  app.load(init_session, outputs=session_state)
297
 
298
-
299
- app.launch(allowed_paths=["/app"])
 
 
1
  import os
2
  import sys
3
  import time
 
4
  import uuid
5
+ import asyncio
6
+ from pathlib import Path
7
 
 
 
8
  from loguru import logger
 
9
 
10
+ import gradio as gr
11
+ from datasets import load_dataset
12
+ from huggingface_hub import whoami
13
  from yourbench_space.utils import (
14
+ STAGES,
15
  SubprocessManagerGroup,
16
  save_files,
17
  update_dataset,
18
+ map_stage_names,
19
+ is_running_locally,
20
  )
21
+ from yourbench_space.config import generate_and_save_config
22
+ from yourbench_space.evaluation import run_evaluations, create_eval_file
23
+
24
 
25
  project_description = """
26
+ # YourBench 🚀
27
  **Dynamic Benchmark Generation for Language Models**
28
 
29
  Quickly create zero-shot benchmarks from your documents – keeping models accurate and adaptable
30
+ - 📖 [FAQ](#)
31
  - 💻 [GitHub](https://github.com/huggingface/yourbench/tree/v0.2-alpha-space)
32
  """
33
 
 
36
 
37
  # Global to store all managers per session
38
  MANAGERS = SubprocessManagerGroup()
39
+ USER_ID_SESSION_MAP: dict[str, str] = {}
40
 
41
 
42
  docs_path = Path(__file__).parent / "docs.md"
 
46
  else "# Citation\n\nDocumentation file not found."
47
  )
48
 
49
+
50
  def generate_and_return(hf_org, hf_dataset_name, session_state: gr.State):
51
  manager = MANAGERS.get(session_state)
52
+ if manager is None: # should not be possible
53
  return (
54
+ "❌ Config generation failed.",
55
+ gr.update(visible=False, interactive=False),
56
+ )
57
+
58
  session_uid = session_state.value
59
  config_path = generate_and_save_config(hf_org, hf_dataset_name, session_uid, manager.config_path)
60
  for _ in range(5):
61
  time.sleep(0.5)
62
  if config_path.exists():
63
+ gr.Success("Config generated")
64
  return (
65
  "✅ Config saved!",
66
  gr.update(value=str(config_path), visible=True, interactive=True),
67
  )
68
+
69
+ gr.Error("Failed to generate config")
70
  return (
71
  "❌ Config generation failed.",
72
  gr.update(visible=False, interactive=False),
73
  )
74
 
75
+
76
  final_dataset = None
77
 
78
+
79
  def update_process_status(session_state: gr.State):
80
  """Update process status and include exit details if process has terminated"""
81
  if session_state is None:
 
86
  return gr.update(value=False, label="Not running")
87
 
88
  is_running = manager.is_running()
89
+
90
  if not is_running:
91
  exit_code, exit_reason = manager.get_exit_details()
92
+ status_text = (
93
+ f"Process Status: Stopped - {exit_reason}, exit code - {exit_code}"
94
+ if exit_reason
95
+ else "Process Status: Stopped"
96
+ )
97
  return gr.update(value=False, label=status_text)
98
+
99
  return gr.update(value=True, label="Process Status: Running")
100
 
101
+
102
  def prepare_task(session_uid: str, oauth_token: gr.OAuthToken | None, hf_dataset_name: str, _=None):
103
  if oauth_token is None and not is_running_locally():
104
+ gr.Warning("You need to log in to use this Space")
105
  return
106
  new_env = os.environ.copy()
107
 
 
134
  def enable_button(files):
135
  return gr.update(interactive=bool(files))
136
 
137
+
138
  def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_name):
139
  # Test dataset existence
140
  eval_ds_name = f"{org_name}/{eval_name}"
 
149
  status = asyncio.run(run_evaluations(eval_ds_name=eval_ds_name, org=org_name))
150
  # Create space
151
  from huggingface_hub import HfApi
152
+
153
  repo_id = f"{org_name}/leaderboard_yourbench_{eval_ds_name.replace('/', '_')}"
154
  api = HfApi()
155
 
156
  try:
157
+ api.create_repo(
158
+ repo_id=repo_id,
159
+ repo_type="space",
160
+ space_sdk="gradio",
161
+ token=oauth_token.token,
162
+ )
163
+ api.upload_folder(
164
+ repo_id=repo_id,
165
+ repo_type="space",
166
+ folder_path="src/",
167
+ token=oauth_token.token,
168
+ )
169
+ api.add_space_secret(
170
+ repo_id=repo_id,
171
+ key="HF_TOKEN",
172
+ value=oauth_token.token,
173
+ token=oauth_token.token,
174
+ )
175
  api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name, token=oauth_token.token)
176
  api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name, token=oauth_token.token)
177
  except Exception as e:
 
208
 
209
 
210
  with gr.Blocks(theme=gr.themes.Default()) as app:
 
 
211
  session_state = gr.State()
212
 
213
  gr.Markdown(project_description)
 
217
  with gr.Row():
218
  with gr.Accordion("Hugging Face Settings"):
219
  login_btn = gr.LoginButton()
220
+ hf_org_dropdown = gr.Dropdown(choices=[], label="Organization", allow_custom_value=True)
221
+ app.load(update_hf_org_dropdown, inputs=None, outputs=hf_org_dropdown)
 
 
 
 
222
 
223
  hf_dataset_name = gr.Textbox(
224
  label="Dataset name",
 
236
  file_input.upload(
237
  save_files,
238
  inputs=[session_state, file_input],
239
+ outputs=output,
240
  )
241
+ delete_button = gr.Button("Delete Uploaded Files", visible=False)
242
 
243
  preview_button = gr.Button("Generate New Config", interactive=False)
244
  log_message = gr.Textbox(label="Log Message", visible=True)
245
+ download_button = gr.File(label="Download Config", visible=False, interactive=False)
246
+
247
+ file_input.change(
248
+ lambda files: gr.update(visible=bool(files)),
249
+ inputs=file_input,
250
+ outputs=delete_button,
251
  )
252
 
253
  file_input.change(enable_button, inputs=file_input, outputs=preview_button)
254
 
255
+ def clean_and_confirm(uid):
256
+ MANAGERS.clean_workdir(uid)
257
+ return (
258
+ "Deleted all uploaded files.",
259
+ gr.update(value=None),
260
+ gr.update(interactive=False),
261
+ )
262
+
263
+ delete_button.click(
264
+ clean_and_confirm,
265
+ inputs=session_state,
266
+ outputs=[output, file_input, preview_button],
267
+ )
268
+
269
  preview_button.click(
270
  generate_and_return,
271
  inputs=[hf_org_dropdown, hf_dataset_name, session_state],
 
276
  inputs=None,
277
  outputs=tabs,
278
  )
279
+
280
  with gr.Tab("Run Generation", id=1):
281
+ with gr.Column():
282
+ with gr.Row():
283
+ start_button = gr.Button("Start Task")
284
+ stop_button = gr.Button("Stop Task")
285
+ kill_button = gr.Button("Kill Task")
286
 
287
+ start_button.click(prepare_task, inputs=[session_state, login_btn, hf_dataset_name])
288
  stop_button.click(MANAGERS.stop_process, inputs=session_state)
 
 
289
  kill_button.click(MANAGERS.kill_process, inputs=session_state)
290
 
291
+ process_status = gr.Checkbox(label="Process Status", interactive=False)
292
+ status_timer = gr.Timer(2.0, active=True)
293
+ status_timer.tick(update_process_status, inputs=session_state, outputs=process_status)
294
 
295
+ with gr.Row():
 
 
 
 
 
 
 
 
 
296
  with gr.Accordion("Stages", open=True):
297
  stages_table = gr.CheckboxGroup(
298
+ choices=map_stage_names(STAGES),
299
  value=[],
300
  label="Pipeline Stages Completed",
301
+ container=False,
302
  interactive=False,
303
  )
304
 
305
+ with gr.Row():
306
+ with gr.Column(scale=2):
307
+ with gr.Accordion("Ingestion Preview"):
308
+ ingestion_df = gr.DataFrame()
309
+
310
+ with gr.Accordion("Summarization Preview"):
311
+ summarization_df = gr.DataFrame()
312
+
313
+ with gr.Accordion("Single Shot Preview"):
314
+ single_shot_df = gr.DataFrame()
315
+
316
+ with gr.Accordion("Multi Hop Preview"):
317
+ multi_hop_df = gr.DataFrame()
318
+
319
+ with gr.Accordion("Lighteval Preview"):
320
+ lighteval_df = gr.DataFrame()
321
+
322
+ stages_table.change(
323
+ update_dataset,
324
+ inputs=[stages_table, hf_org_dropdown, hf_dataset_name],
325
+ outputs=[ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df],
326
+ )
327
+
328
+ with gr.Accordion("Log Output", open=False):
329
+ log_output = gr.Code(language=None, lines=20, interactive=False)
330
+
331
+ # TODO: this timer should only be active when the second tab is passed to active for the first time
332
+ log_timer = gr.Timer(1.0, active=True)
333
+ log_timer.tick(
334
+ MANAGERS.read_and_get_output,
335
+ inputs=session_state,
336
+ outputs=[log_output, stages_table],
337
+ )
338
 
 
 
 
 
 
339
  with gr.Tab("Evaluate", id=2, visible=False):
340
  with gr.Row():
341
  btn_launch_evals = gr.Button("Launch evaluations")
342
  status = gr.Textbox(label="Status")
 
343
  btn_launch_evals.click(run_evaluation_pipeline, [hf_org_dropdown, hf_dataset_name], status)
344
 
345
  app.load(init_session, outputs=session_state)
346
 
347
+ app.launch(allowed_paths=["/home/user/app"])
 
yourbench_space/config.py CHANGED
@@ -7,13 +7,14 @@ def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
7
  return {
8
  "hf_configuration": {
9
  "token": "$HF_TOKEN",
10
- "private": True,
11
  "hf_organization": hf_org,
 
12
  "hf_dataset_name": hf_dataset_name,
 
13
  },
14
  "model_list": [
15
  {
16
- "model_name": "meta-llama/Llama-3.3-70B-Instruct",
17
  "provider": "novita",
18
  "max_concurrent_requests": 32,
19
  },
@@ -21,63 +22,59 @@ def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
21
  "model_name": "Qwen/Qwen2.5-72B-Instruct",
22
  "provider": "novita",
23
  "max_concurrent_requests": 32,
24
- }
25
  ],
26
  "model_roles": {
27
- "ingestion": ["meta-llama/Llama-3.3-70B-Instruct"],
28
  "summarization": ["Qwen/Qwen2.5-72B-Instruct"],
29
- "single_shot_question_generation": ["meta-llama/Llama-3.3-70B-Instruct"],
30
- "multi_hop_question_generation": ["meta-llama/Llama-3.3-70B-Instruct"],
31
- "answer_generation": ["Qwen/Qwen2.5-72B-Instruct"],
32
- "judge_answers": ["meta-llama/Llama-3.3-70B-Instruct"],
33
  },
34
  "pipeline": {
35
  "ingestion": {
36
- "source_documents_dir": f"/app/{session_uid}/uploaded_files/",
37
- "output_dir": f"/app/{session_uid}/ingested",
38
  "run": True,
39
  },
40
  "upload_ingest_to_hub": {
41
- "source_documents_dir": f"/app/{session_uid}/ingested",
 
 
 
42
  "run": True,
43
  },
44
- "summarization": {"run": True},
45
  "chunking": {
 
46
  "chunking_configuration": {
47
  "l_min_tokens": 64,
48
  "l_max_tokens": 128,
49
- "tau_threshold": 0.3,
50
  "h_min": 2,
51
- "h_max": 4,
 
52
  },
53
- "run": True,
54
  },
55
  "single_shot_question_generation": {
56
- "diversification_seed": "24 year old adult",
57
  "run": True,
 
 
 
 
 
 
58
  },
59
- "multi_hop_question_generation": {"run": False},
60
- "answer_generation": {
61
- "question_type": "single_shot",
62
  "run": True,
63
- "strategies": [
64
- {
65
- "name": "zeroshot",
66
- "prompt": "ZEROSHOT_QA_USER_PROMPT",
67
- "model_name": "meta-llama/Llama-3.3-70B-Instruct",
68
- },
69
- {
70
- "name": "gold",
71
- "prompt": "GOLD_QA_USER_PROMPT",
72
- "model_name": "meta-llama/Llama-3.3-70B-Instruct",
73
- },
74
- ],
75
  },
76
- "judge_answers": {
77
- "run": False, # to change when fixed
78
- "comparing_strategies": [["zeroshot", "gold"]],
79
- "chunk_column_index": 0,
80
- "random_seed": 42,
81
  },
82
  },
83
  }
@@ -97,4 +94,3 @@ def generate_and_save_config(hf_org: str, hf_name: str, session_uid: str, config
97
  file_path = save_yaml_file(config, config_path)
98
  logger.success(f"Config saved at: {file_path}")
99
  return file_path
100
-
 
7
  return {
8
  "hf_configuration": {
9
  "token": "$HF_TOKEN",
 
10
  "hf_organization": hf_org,
11
+ "private": True,
12
  "hf_dataset_name": hf_dataset_name,
13
+ "concat_if_exist": False,
14
  },
15
  "model_list": [
16
  {
17
+ "model_name": "Qwen/Qwen2.5-VL-72B-Instruct",
18
  "provider": "novita",
19
  "max_concurrent_requests": 32,
20
  },
 
22
  "model_name": "Qwen/Qwen2.5-72B-Instruct",
23
  "provider": "novita",
24
  "max_concurrent_requests": 32,
25
+ },
26
  ],
27
  "model_roles": {
28
+ "ingestion": ["Qwen/Qwen2.5-VL-72B-Instruct"],
29
  "summarization": ["Qwen/Qwen2.5-72B-Instruct"],
30
+ "chunking": ["intfloat/multilingual-e5-large-instruct"],
31
+ "single_shot_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
32
+ "multi_hop_question_generation": ["Qwen/Qwen2.5-72B-Instruct"],
 
33
  },
34
  "pipeline": {
35
  "ingestion": {
36
+ "source_documents_dir": f"/home/user/app/{session_uid}/uploaded_files/",
37
+ "output_dir": f"/home/user/app/{session_uid}/ingested",
38
  "run": True,
39
  },
40
  "upload_ingest_to_hub": {
41
+ "source_documents_dir": f"/home/user/app/{session_uid}/ingested",
42
+ "run": True,
43
+ },
44
+ "summarization": {
45
  "run": True,
46
  },
 
47
  "chunking": {
48
+ "run": True,
49
  "chunking_configuration": {
50
  "l_min_tokens": 64,
51
  "l_max_tokens": 128,
52
+ "tau_threshold": 0.8,
53
  "h_min": 2,
54
+ "h_max": 5,
55
+ "num_multihops_factor": 2,
56
  },
 
57
  },
58
  "single_shot_question_generation": {
 
59
  "run": True,
60
+ "additional_instructions": "Generate questions to test a curious adult",
61
+ "chunk_sampling": {
62
+ "mode": "count",
63
+ "value": 5,
64
+ "random_seed": 123,
65
+ },
66
  },
67
+ "multi_hop_question_generation": {
 
 
68
  "run": True,
69
+ "additional_instructions": "Generate questions to test a curious adult",
70
+ "chunk_sampling": {
71
+ "mode": "percentage",
72
+ "value": 0.3,
73
+ "random_seed": 42,
74
+ },
 
 
 
 
 
 
75
  },
76
+ "lighteval": {
77
+ "run": True,
 
 
 
78
  },
79
  },
80
  }
 
94
  file_path = save_yaml_file(config, config_path)
95
  logger.success(f"Config saved at: {file_path}")
96
  return file_path
 
yourbench_space/evaluation.py CHANGED
@@ -1,12 +1,17 @@
1
- import asyncio, os
 
 
2
  from yourbench_space.leaderboard_space.env import INIT_MODELS
3
 
4
- ON_SPACES=os.environ.get("system") == "spaces"
 
5
  OUTPUT_DIR = "/data" if ON_SPACES else "."
6
 
 
7
  def create_eval_file(eval_ds_name):
8
  # TODO: replace by Nathan's call
9
- content = """
 
10
  from aenum import extend_enum
11
 
12
  from lighteval.metrics.metrics import Metrics
@@ -31,10 +36,11 @@ def prompt_function(line, task_name: str = None):
31
  gold_index=0,
32
  specific={"question": line["question"]},
33
  )
34
- """ + f"""
 
35
 
36
  hle = LightevalTaskConfig(
37
- name="{eval_ds_name.replace('/', '_')}",
38
  suite=["custom"],
39
  prompt_function=prompt_function,
40
  hf_repo="{eval_ds_name}",
@@ -52,38 +58,46 @@ hle = LightevalTaskConfig(
52
 
53
 
54
  TASKS_TABLE = [hle]
55
- """
56
-
 
57
  with open(f"{OUTPUT_DIR}/custom_task.py", "w") as f:
58
  f.write(content)
59
 
 
60
  async def run_process(args: list) -> dict:
61
  process = await asyncio.create_subprocess_exec(
62
- *args,
63
- stdout=asyncio.subprocess.PIPE,
64
- stderr=asyncio.subprocess.PIPE
65
  )
66
  await asyncio.wait_for(process.wait(), timeout=180)
67
  stdout = await process.stdout.read()
68
  stderr = await process.stderr.read()
69
- return {
70
- 'pid': process.pid,
71
- 'stdout': stdout.decode(),
72
- 'stderr': stderr.decode()
73
- }
74
 
75
  async def run_evaluations(eval_ds_name: str, org: str) -> list:
76
  tasks = []
77
  for model_name, provider in INIT_MODELS:
78
  args = [
79
- "lighteval",
80
- "endpoint", "inference-providers", f"model={model_name},provider={provider}",
81
- f"custom|{eval_ds_name.replace('/', '_')}|0|0", "--custom-tasks", f"{OUTPUT_DIR}/custom_task.py", "--max-samples", "10",
82
- "--output-dir", f"{OUTPUT_DIR}", "--save-details", "--results-org", org, "--push-to-hub"
 
 
 
 
 
 
 
 
 
 
 
83
  ]
84
  tasks.append(run_process(args))
85
  # Will capture the task if failed
86
  processes = await asyncio.gather(*tasks, return_exceptions=True)
87
  if all(not isinstance(result, Exception) for result in processes):
88
  return "✅"
89
- return "At least one model failed"
 
1
+ import os
2
+ import asyncio
3
+
4
  from yourbench_space.leaderboard_space.env import INIT_MODELS
5
 
6
+
7
+ ON_SPACES = os.environ.get("system") == "spaces"
8
  OUTPUT_DIR = "/data" if ON_SPACES else "."
9
 
10
+
11
  def create_eval_file(eval_ds_name):
12
  # TODO: replace by Nathan's call
13
+ content = (
14
+ """
15
  from aenum import extend_enum
16
 
17
  from lighteval.metrics.metrics import Metrics
 
36
  gold_index=0,
37
  specific={"question": line["question"]},
38
  )
39
+ """
40
+ + f"""
41
 
42
  hle = LightevalTaskConfig(
43
+ name="{eval_ds_name.replace("/", "_")}",
44
  suite=["custom"],
45
  prompt_function=prompt_function,
46
  hf_repo="{eval_ds_name}",
 
58
 
59
 
60
  TASKS_TABLE = [hle]
61
+ """
62
+ )
63
+
64
  with open(f"{OUTPUT_DIR}/custom_task.py", "w") as f:
65
  f.write(content)
66
 
67
+
68
  async def run_process(args: list) -> dict:
69
  process = await asyncio.create_subprocess_exec(
70
+ *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
 
 
71
  )
72
  await asyncio.wait_for(process.wait(), timeout=180)
73
  stdout = await process.stdout.read()
74
  stderr = await process.stderr.read()
75
+ return {"pid": process.pid, "stdout": stdout.decode(), "stderr": stderr.decode()}
76
+
 
 
 
77
 
78
  async def run_evaluations(eval_ds_name: str, org: str) -> list:
79
  tasks = []
80
  for model_name, provider in INIT_MODELS:
81
  args = [
82
+ "lighteval",
83
+ "endpoint",
84
+ "inference-providers",
85
+ f"model={model_name},provider={provider}",
86
+ f"custom|{eval_ds_name.replace('/', '_')}|0|0",
87
+ "--custom-tasks",
88
+ f"{OUTPUT_DIR}/custom_task.py",
89
+ "--max-samples",
90
+ "10",
91
+ "--output-dir",
92
+ f"{OUTPUT_DIR}",
93
+ "--save-details",
94
+ "--results-org",
95
+ org,
96
+ "--push-to-hub",
97
  ]
98
  tasks.append(run_process(args))
99
  # Will capture the task if failed
100
  processes = await asyncio.gather(*tasks, return_exceptions=True)
101
  if all(not isinstance(result, Exception) for result in processes):
102
  return "✅"
103
+ return "At least one model failed"
yourbench_space/utils.py CHANGED
@@ -1,15 +1,17 @@
1
  import io
2
  import os
3
  import re
4
- import pathlib
5
  import shutil
 
6
  import subprocess
7
- import gradio as gr
 
8
  import pandas as pd
9
- from collections import defaultdict
10
- from datasets import load_dataset
11
  from loguru import logger
12
- from typing import List, Union, Optional
 
 
 
13
 
14
  STAGES = [
15
  "ingestion",
@@ -17,12 +19,25 @@ STAGES = [
17
  "summarization",
18
  "chunking",
19
  "single_shot_question_generation",
20
- "answer_generation",
21
- #"evaluate_models",
22
- #"create_leaderboard"
23
- # "judge_answers", # to uncomment when fixed
24
  ]
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def is_running_locally() -> bool:
27
  """
28
  Returns True if Gradio is running locally, False if it's running in a Hugging Face Space.
@@ -33,7 +48,7 @@ def is_running_locally() -> bool:
33
  def save_files(oauth_token: gr.OAuthToken | None, session_state: gr.State, files: List[pathlib.Path]) -> str:
34
  """Save uploaded files to the UPLOAD_DIRECTORY/uuid safely"""
35
  if oauth_token is None and not is_running_locally():
36
- gr.Warning('You need to log in to use this Space')
37
  return
38
 
39
  saved_paths = []
@@ -41,7 +56,7 @@ def save_files(oauth_token: gr.OAuthToken | None, session_state: gr.State, files
41
  for file in [file.name for file in files]:
42
  try:
43
  source_path = pathlib.Path(file)
44
- upload_directory_uuid = pathlib.Path(f"/app/{session_state.value}/uploaded_files")
45
  # Ensure the upload directory exists
46
  upload_directory_uuid.mkdir(parents=True, exist_ok=True)
47
  destination_path = upload_directory_uuid / source_path.name
@@ -56,11 +71,8 @@ def save_files(oauth_token: gr.OAuthToken | None, session_state: gr.State, files
56
  except Exception as e:
57
  print(f"Error moving file {file}: {e}")
58
 
59
- return (
60
- f"Files saved to: {', '.join(saved_paths)}"
61
- if saved_paths
62
- else "No files were saved"
63
- )
64
 
65
  def update_dataset(stages: list, hf_org: str, hf_prefix: str, oauth_token: gr.OAuthToken):
66
  """
@@ -68,31 +80,57 @@ def update_dataset(stages: list, hf_org: str, hf_prefix: str, oauth_token: gr.OA
68
  """
69
  ingestion_df = pd.DataFrame()
70
  summarization_df = pd.DataFrame()
71
- single_hop_df = pd.DataFrame()
72
- answers_df = pd.DataFrame()
 
73
 
74
  # Construct dataset name from config
75
  dataset_name = f"{hf_org}/{hf_prefix}"
76
 
77
- if "ingestion" in stages:
78
- # TODO: why is the key "ingested" and not "ingestion"? (does not match the other splits)
79
- ingestion_ds = load_dataset(dataset_name, name="ingested", split="train", streaming=True, token=oauth_token.token).select_columns("document_text")
80
- ingestion_df = pd.DataFrame([next(iter(ingestion_ds)) for _ in range(1)]) # only one row
81
- if "summarization" in stages:
82
- summarization_ds = load_dataset(dataset_name, name="summarization", split="train", streaming=True, token=oauth_token.token).select_columns(['raw_document_summary', 'document_summary', 'summarization_model'])
83
- summarization_df = pd.DataFrame([next(iter(summarization_ds)) for _ in range(1)])
84
- if "single_shot_question_generation" in stages:
85
- single_hop_ds = load_dataset(dataset_name, name="single_shot_question_generation", split="train", streaming=True, token=oauth_token.token)
86
- single_hop_df = pd.DataFrame([next(iter(single_hop_ds)) for _ in range(5)])
87
- if "answer_generation" in stages:
88
- answers_ds = load_dataset(dataset_name, name="answer_generation", split="train", streaming=True, token=oauth_token.token)
89
- answers_df = pd.DataFrame([next(iter(answers_ds)) for _ in range(5)])
90
-
91
- return (ingestion_df, summarization_df, single_hop_df, answers_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
 
94
  class SubprocessManagerGroup:
95
  """Instanciates one manager per user (should be used as a singleton class)"""
 
96
  def __init__(self):
97
  self.managers: dict[str, SubprocessManager] = {}
98
 
@@ -115,8 +153,15 @@ class SubprocessManagerGroup:
115
  uid = SubprocessManagerGroup.grab_uuid(uid)
116
  if manager := self.managers.get(uid):
117
  manager.stop_process()
 
 
118
  del self.managers[uid]
119
 
 
 
 
 
 
120
  def start_process(self, uid: Union[str, gr.State], custom_env: dict | None):
121
  uid = SubprocessManagerGroup.grab_uuid(uid)
122
  self.managers[uid].start_process(custom_env=custom_env)
@@ -141,13 +186,14 @@ class SubprocessManagerGroup:
141
  return manager.is_running()
142
  return False
143
 
 
144
  class SubprocessManager:
145
  def __init__(self, session_uid: str):
146
  self.session_uid = session_uid
147
- self.path = pathlib.Path(f"/app/{session_uid}")
148
  self.path.mkdir(parents=True, exist_ok=True)
149
  self.config_path = pathlib.Path(f"{self.path}/config.yml")
150
- self.command = ["uv", "run", "yourbench", f"--config", str(self.config_path)]
151
  self.process = None
152
  self.output_stream = io.StringIO()
153
  self.exit_code = None
@@ -160,7 +206,7 @@ class SubprocessManager:
160
 
161
  self.output_stream = io.StringIO()
162
  self.exit_code = None
163
-
164
  try:
165
  logger.info(f"Starting process with command: {' '.join(self.command)}")
166
  self.process = subprocess.Popen(
@@ -195,9 +241,12 @@ class SubprocessManager:
195
  pass
196
 
197
  current_output = self.output_stream.getvalue()
198
- completed_stages = list(set(re.findall(r"Successfully completed stage: (\w+)", current_output)))
 
 
199
 
200
- return current_output, completed_stages
 
201
 
202
  def stop_process(self):
203
  """Terminate the subprocess."""
@@ -207,7 +256,7 @@ class SubprocessManager:
207
  logger.info("Sending SIGTERM to the Process")
208
  try:
209
  self.process.terminate()
210
- self.exit_code = self.process.wait(timeout=5) # Wait up to 5 seconds for process to terminate
211
  logger.info(f"Process terminated by user with exit code {self.exit_code}")
212
  except subprocess.TimeoutExpired:
213
  logger.warning("Process did not terminate within timeout, sending SIGKILL")
@@ -221,7 +270,7 @@ class SubprocessManager:
221
  logger.info("Sending SIGKILL to the Process")
222
  try:
223
  self.process.kill()
224
- self.exit_code = self.process.wait(timeout=5) # Wait up to 5 seconds for process to be killed
225
  logger.info(f"Process killed by user with exit code {self.exit_code}")
226
  except subprocess.TimeoutExpired:
227
  logger.error("Process could not be killed within timeout")
@@ -237,11 +286,11 @@ class SubprocessManager:
237
  """Return exit code and reason if process has terminated"""
238
  if self.process is None:
239
  return None, "Process was never started"
240
-
241
  if self.is_running():
242
  return None, "Process is still running"
243
-
244
- if not self.exit_code is None and self.exit_code != 0 :
245
  return self.exit_code, "Process exited abnormaly"
246
 
247
  return self.exit_code, "Process exited normaly"
@@ -250,3 +299,5 @@ class SubprocessManager:
250
  """Stop the process when object is deleted"""
251
  if self.process:
252
  self.process.kill()
 
 
 
1
  import io
2
  import os
3
  import re
 
4
  import shutil
5
+ import pathlib
6
  import subprocess
7
+ from typing import List, Union, Optional
8
+
9
  import pandas as pd
 
 
10
  from loguru import logger
11
+
12
+ import gradio as gr
13
+ from datasets import load_dataset
14
+
15
 
16
  STAGES = [
17
  "ingestion",
 
19
  "summarization",
20
  "chunking",
21
  "single_shot_question_generation",
22
+ "multi_hop_question_generation",
23
+ "lighteval",
 
 
24
  ]
25
 
26
# Maps internal pipeline stage ids to the labels shown in the Gradio UI.
STAGE_DISPLAY_MAP = {
    "ingestion": "Process Input Docs",
    "upload_ingest_to_hub": "Upload Dataset to Hub",
    "summarization": "Summarize Documents",
    "chunking": "Chunk Documents",
    "single_shot_question_generation": "Generate Single Shot Questions",
    "multi_hop_question_generation": "Generate Multi Hop Questions",
    "lighteval": "Generate Lighteval Subset",
}


def map_stage_names(stages: list[str]) -> list[str]:
    """Translate internal stage ids into their UI display labels.

    Stage ids without an entry in STAGE_DISPLAY_MAP pass through unchanged.
    """
    display_names = []
    for stage_id in stages:
        display_names.append(STAGE_DISPLAY_MAP.get(stage_id, stage_id))
    return display_names
39
+
40
+
41
  def is_running_locally() -> bool:
42
  """
43
  Returns True if Gradio is running locally, False if it's running in a Hugging Face Space.
 
48
  def save_files(oauth_token: gr.OAuthToken | None, session_state: gr.State, files: List[pathlib.Path]) -> str:
49
  """Save uploaded files to the UPLOAD_DIRECTORY/uuid safely"""
50
  if oauth_token is None and not is_running_locally():
51
+ gr.Warning("You need to log in to use this Space")
52
  return
53
 
54
  saved_paths = []
 
56
  for file in [file.name for file in files]:
57
  try:
58
  source_path = pathlib.Path(file)
59
+ upload_directory_uuid = pathlib.Path(f"/home/user/app/{session_state.value}/uploaded_files")
60
  # Ensure the upload directory exists
61
  upload_directory_uuid.mkdir(parents=True, exist_ok=True)
62
  destination_path = upload_directory_uuid / source_path.name
 
71
  except Exception as e:
72
  print(f"Error moving file {file}: {e}")
73
 
74
+ return f"Files saved to: {', '.join(saved_paths)}" if saved_paths else "No files were saved"
75
+
 
 
 
76
 
77
def update_dataset(stages: list, hf_org: str, hf_prefix: str, oauth_token: "gr.OAuthToken"):
    """Build preview DataFrames for every pipeline stage that has completed.

    Args:
        stages: Display names of completed stages (values of STAGE_DISPLAY_MAP).
        hf_org: Hugging Face organization that owns the generated dataset.
        hf_prefix: Dataset name, combined with hf_org as "org/prefix".
        oauth_token: Gradio OAuth token used to read the dataset; may be None
            when running locally without authentication (see save_files).

    Returns:
        A 5-tuple of DataFrames:
        (ingestion, summarization, single_shot, multi_hop, lighteval).
        Stages that have not completed yield an empty DataFrame.
    """
    ingestion_df = pd.DataFrame()
    summarization_df = pd.DataFrame()
    single_shot_df = pd.DataFrame()
    multi_hop_df = pd.DataFrame()
    lighteval_df = pd.DataFrame()

    # Nothing completed yet: skip all remote reads.
    if not stages:
        return (ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df)

    # Construct dataset name from config
    dataset_name = f"{hf_org}/{hf_prefix}"
    # Fix: the token may legitimately be absent on local runs; the original
    # `oauth_token.token` raised AttributeError in that case.
    token = oauth_token.token if oauth_token is not None else None

    def _take_rows(config_name: str, columns, num_rows: int) -> pd.DataFrame:
        # Stream the subset so only `num_rows` rows are downloaded.
        ds = load_dataset(
            dataset_name, name=config_name, split="train", streaming=True, token=token
        ).select_columns(columns)
        return pd.DataFrame(ds.take(num_rows))

    if STAGE_DISPLAY_MAP["upload_ingest_to_hub"] in stages:
        ingestion_df = _take_rows("ingested", "document_text", 1)

    if STAGE_DISPLAY_MAP["summarization"] in stages:
        summarization_df = _take_rows(
            "summarized",
            ["raw_document_summary", "document_summary", "summarization_model"],
            5,
        )

    if STAGE_DISPLAY_MAP["single_shot_question_generation"] in stages:
        single_shot_df = _take_rows(
            "single_shot_questions",
            ["question", "self_answer", "estimated_difficulty"],
            5,
        )

    if STAGE_DISPLAY_MAP["multi_hop_question_generation"] in stages:
        multi_hop_df = _take_rows(
            "multi_hop_questions",
            ["question", "self_answer", "estimated_difficulty"],
            5,
        )

    if STAGE_DISPLAY_MAP["lighteval"] in stages:
        lighteval_df = _take_rows(
            "lighteval",
            ["question", "ground_truth_answer", "question_category", "kind"],
            5,
        )

    return (ingestion_df, summarization_df, single_shot_df, multi_hop_df, lighteval_df)
129
 
130
 
131
  class SubprocessManagerGroup:
132
  """Instanciates one manager per user (should be used as a singleton class)"""
133
+
134
    def __init__(self):
        """Initialize the group with an empty per-session manager registry."""
        # Maps session uuid -> the SubprocessManager owned by that session.
        self.managers: dict[str, SubprocessManager] = {}
136
 
 
153
  uid = SubprocessManagerGroup.grab_uuid(uid)
154
  if manager := self.managers.get(uid):
155
  manager.stop_process()
156
+ manager.clean_workdir()
157
+
158
  del self.managers[uid]
159
 
160
+ def clean_workdir(self, uid: Union[str, gr.State]):
161
+ uid = SubprocessManagerGroup.grab_uuid(uid)
162
+ if manager := self.managers.get(uid):
163
+ manager.clean_workdir()
164
+
165
  def start_process(self, uid: Union[str, gr.State], custom_env: dict | None):
166
  uid = SubprocessManagerGroup.grab_uuid(uid)
167
  self.managers[uid].start_process(custom_env=custom_env)
 
186
  return manager.is_running()
187
  return False
188
 
189
+
190
  class SubprocessManager:
191
  def __init__(self, session_uid: str):
192
  self.session_uid = session_uid
193
+ self.path = pathlib.Path(f"/home/user/app/{session_uid}")
194
  self.path.mkdir(parents=True, exist_ok=True)
195
  self.config_path = pathlib.Path(f"{self.path}/config.yml")
196
+ self.command = ["uv", "run", "yourbench", "run", "--config", str(self.config_path)]
197
  self.process = None
198
  self.output_stream = io.StringIO()
199
  self.exit_code = None
 
206
 
207
  self.output_stream = io.StringIO()
208
  self.exit_code = None
209
+
210
  try:
211
  logger.info(f"Starting process with command: {' '.join(self.command)}")
212
  self.process = subprocess.Popen(
 
241
  pass
242
 
243
  current_output = self.output_stream.getvalue()
244
+ completed_stages = list(set(re.findall(r"Completed stage: '([^']*)'", current_output)))
245
+
246
+ return current_output, map_stage_names(completed_stages)
247
 
248
    def clean_workdir(self):
        """Delete this session's working directory tree (best-effort).

        ignore_errors covers the case where the directory is already gone
        or only partially removable.
        """
        shutil.rmtree(self.path, ignore_errors=True)
250
 
251
  def stop_process(self):
252
  """Terminate the subprocess."""
 
256
  logger.info("Sending SIGTERM to the Process")
257
  try:
258
  self.process.terminate()
259
+ self.exit_code = self.process.wait(timeout=5) # Wait up to 5 seconds for process to terminate
260
  logger.info(f"Process terminated by user with exit code {self.exit_code}")
261
  except subprocess.TimeoutExpired:
262
  logger.warning("Process did not terminate within timeout, sending SIGKILL")
 
270
  logger.info("Sending SIGKILL to the Process")
271
  try:
272
  self.process.kill()
273
+ self.exit_code = self.process.wait(timeout=5) # Wait up to 5 seconds for process to be killed
274
  logger.info(f"Process killed by user with exit code {self.exit_code}")
275
  except subprocess.TimeoutExpired:
276
  logger.error("Process could not be killed within timeout")
 
286
  """Return exit code and reason if process has terminated"""
287
  if self.process is None:
288
  return None, "Process was never started"
289
+
290
  if self.is_running():
291
  return None, "Process is still running"
292
+
293
+ if self.exit_code is not None and self.exit_code != 0:
294
  return self.exit_code, "Process exited abnormaly"
295
 
296
  return self.exit_code, "Process exited normaly"
 
299
  """Stop the process when object is deleted"""
300
  if self.process:
301
  self.process.kill()
302
+
303
+ self.clean_workdir()