Clémentine committed on
Commit
133c6d8
·
1 Parent(s): ab1227e

added a singleton-like class to manage all managers per session, plus session state management. Also fixes secret passing to the new leaderboard space

Browse files
yourbench_space/app.py CHANGED
@@ -3,6 +3,7 @@ import os
3
  import sys
4
  import time
5
  import gradio as gr
 
6
 
7
  from datasets import load_dataset
8
  from huggingface_hub import whoami
@@ -11,9 +12,7 @@ from pathlib import Path
11
 
12
  from yourbench_space.config import generate_and_save_config
13
  from yourbench_space.utils import (
14
- CONFIG_PATH,
15
- UPLOAD_DIRECTORY,
16
- SubprocessManager,
17
  save_files,
18
  update_dataset,
19
  STAGES,
@@ -30,14 +29,11 @@ Quickly create zero-shot benchmarks from your documents – keeping models accu
30
  - 💻 [GitHub](https://github.com/huggingface/yourbench/tree/v0.2-alpha-space)
31
  """
32
 
33
-
34
- UPLOAD_DIRECTORY.mkdir(parents=True, exist_ok=True)
35
-
36
  logger.remove()
37
  logger.add(sys.stderr, level="INFO")
38
 
39
- command = ["uv", "run", "yourbench", f"--config={CONFIG_PATH}"]
40
- manager = SubprocessManager(command)
41
 
42
  docs_path = Path(__file__).parent / "docs.md"
43
  citation_content = (
@@ -46,30 +42,27 @@ citation_content = (
46
  else "# Citation\n\nDocumentation file not found."
47
  )
48
 
49
-
50
- def generate_and_return(hf_org, hf_prefix):
51
- generate_and_save_config(hf_org, hf_prefix)
52
  for _ in range(5):
53
- if CONFIG_PATH.exists():
54
- break
55
  time.sleep(0.5)
56
-
 
 
 
 
57
  return (
58
- (
59
- "✅ Config saved!",
60
- gr.update(value=str(CONFIG_PATH), visible=True, interactive=True),
61
- )
62
- if CONFIG_PATH.exists()
63
- else (
64
- "❌ Config generation failed.",
65
- gr.update(visible=False, interactive=False),
66
- )
67
  )
68
 
69
  final_dataset = None
70
 
71
- def update_process_status():
72
  """Update process status and include exit details if process has terminated"""
 
 
73
  is_running = manager.is_running()
74
 
75
  if not is_running:
@@ -79,7 +72,8 @@ def update_process_status():
79
 
80
  return gr.update(value=True, label="Process Status: Running")
81
 
82
- def prepare_task(oauth_token: gr.OAuthToken | None, hf_dataset_name: str, _=None):
 
83
  new_env = os.environ.copy()
84
  if oauth_token:
85
  new_env["HF_TOKEN"] = oauth_token.token
@@ -127,17 +121,22 @@ def run_evaluation_pipeline(oauth_token: gr.OAuthToken | None, org_name, eval_na
127
  api = HfApi()
128
 
129
  try:
130
- api.create_repo(repo_id=repo_id, repo_type="space", space_sdk="gradio")
131
- api.upload_folder(repo_id=repo_id, repo_type="space", folder_path="src/")
132
- api.add_space_secret(repo_id=repo_id, key="HF_TOKEN", value=HF_TOKEN)
133
- api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name)
134
- api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name)
135
  except Exception as e:
136
  status = "Evaluation" + status + "\nLeaderboard creation:" + e
137
  return status
138
 
139
 
140
  with gr.Blocks(theme=gr.themes.Default()) as app:
 
 
 
 
 
141
  gr.Markdown(project_description)
142
 
143
  with gr.Tabs() as tabs:
@@ -166,7 +165,7 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
166
  )
167
  output = gr.Textbox(label="Log")
168
  file_input.upload(
169
- lambda files: save_files([file.name for file in files]),
170
  file_input,
171
  output,
172
  )
@@ -181,7 +180,7 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
181
 
182
  preview_button.click(
183
  generate_and_return,
184
- inputs=[hf_org_dropdown, hf_dataset_name],
185
  outputs=[log_message, download_button],
186
  )
187
  preview_button.click(
@@ -193,13 +192,13 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
193
  with gr.Tab("Run Generation", id=1):
194
  with gr.Row():
195
  start_button = gr.Button("Start Task")
196
- start_button.click(prepare_task, inputs=[login_btn, hf_dataset_name])
197
 
198
  stop_button = gr.Button("Stop Task")
199
- stop_button.click(manager.stop_process)
200
 
201
  kill_button = gr.Button("Kill Task")
202
- kill_button.click(manager.kill_process)
203
 
204
 
205
  with gr.Row():
@@ -209,7 +208,7 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
209
 
210
  process_status = gr.Checkbox(label="Process Status", interactive=False)
211
  status_timer = gr.Timer(1.0, active=True)
212
- status_timer.tick(update_process_status, outputs=process_status)
213
 
214
  with gr.Column():
215
  with gr.Accordion("Stages", open=True):
@@ -238,7 +237,7 @@ with gr.Blocks(theme=gr.themes.Default()) as app:
238
 
239
  log_timer = gr.Timer(1.0, active=True)
240
  log_timer.tick(
241
- manager.read_and_get_output, outputs=[log_output, stages_table]
242
  )
243
  with gr.Tab("Evaluate", id=2):
244
  with gr.Row():
 
3
  import sys
4
  import time
5
  import gradio as gr
6
+ import uuid
7
 
8
  from datasets import load_dataset
9
  from huggingface_hub import whoami
 
12
 
13
  from yourbench_space.config import generate_and_save_config
14
  from yourbench_space.utils import (
15
+ SubprocessManagerGroup,
 
 
16
  save_files,
17
  update_dataset,
18
  STAGES,
 
29
  - 💻 [GitHub](https://github.com/huggingface/yourbench/tree/v0.2-alpha-space)
30
  """
31
 
 
 
 
32
  logger.remove()
33
  logger.add(sys.stderr, level="INFO")
34
 
35
+ # Global to store all managers per session
36
+ MANAGERS = SubprocessManagerGroup()
37
 
38
  docs_path = Path(__file__).parent / "docs.md"
39
  citation_content = (
 
42
  else "# Citation\n\nDocumentation file not found."
43
  )
44
 
45
+ def generate_and_return(hf_org, hf_dataset_name, session_state: gr.State):
46
+ manager = MANAGERS.get(session_state.value)
47
+ config_path = generate_and_save_config(hf_org, hf_dataset_name, session_state.value, manager.config_path)
48
  for _ in range(5):
 
 
49
  time.sleep(0.5)
50
+ if config_path.exists():
51
+ return (
52
+ "✅ Config saved!",
53
+ gr.update(value=str(config_path), visible=True, interactive=True),
54
+ )
55
  return (
56
+ "❌ Config generation failed.",
57
+ gr.update(visible=False, interactive=False),
 
 
 
 
 
 
 
58
  )
59
 
60
  final_dataset = None
61
 
62
+ def update_process_status(session_state: gr.State):
63
  """Update process status and include exit details if process has terminated"""
64
+ manager = MANAGERS.get(session_state.value)
65
+
66
  is_running = manager.is_running()
67
 
68
  if not is_running:
 
72
 
73
  return gr.update(value=True, label="Process Status: Running")
74
 
75
+ def prepare_task(session_state: gr.State, oauth_token: gr.OAuthToken | None, hf_dataset_name: str, _=None):
76
+ manager = MANAGERS.get(session_state.value)
77
  new_env = os.environ.copy()
78
  if oauth_token:
79
  new_env["HF_TOKEN"] = oauth_token.token
 
121
  api = HfApi()
122
 
123
  try:
124
+ api.create_repo(repo_id=repo_id, repo_type="space", space_sdk="gradio", token=oauth_token.token)
125
+ api.upload_folder(repo_id=repo_id, repo_type="space", folder_path="src/", token=oauth_token.token)
126
+ api.add_space_secret(repo_id=repo_id, key="HF_TOKEN", value=oauth_token.token, token=oauth_token.token)
127
+ api.add_space_variable(repo_id=repo_id, key="TASK", value=eval_ds_name, token=oauth_token.token)
128
+ api.add_space_variable(repo_id=repo_id, key="ORG_NAME", value=org_name, token=oauth_token.token)
129
  except Exception as e:
130
  status = "Evaluation" + status + "\nLeaderboard creation:" + e
131
  return status
132
 
133
 
134
  with gr.Blocks(theme=gr.themes.Default()) as app:
135
+ # We initialize the session state with the user randomly generated uuid
136
+ # Using uuid4 makes collision cases extremely unlikely even for concurrent users
137
+ session_state = gr.State(uuid.uuid4(), delete_callback=lambda uid: MANAGERS.remove(uid))
138
+ MANAGERS.create(session_state.value)
139
+
140
  gr.Markdown(project_description)
141
 
142
  with gr.Tabs() as tabs:
 
165
  )
166
  output = gr.Textbox(label="Log")
167
  file_input.upload(
168
+ lambda files: save_files(session_state, [file.name for file in files]),
169
  file_input,
170
  output,
171
  )
 
180
 
181
  preview_button.click(
182
  generate_and_return,
183
+ inputs=[hf_org_dropdown, hf_dataset_name, session_state],
184
  outputs=[log_message, download_button],
185
  )
186
  preview_button.click(
 
192
  with gr.Tab("Run Generation", id=1):
193
  with gr.Row():
194
  start_button = gr.Button("Start Task")
195
+ start_button.click(prepare_task, inputs=[session_state, login_btn, hf_dataset_name])
196
 
197
  stop_button = gr.Button("Stop Task")
198
+ stop_button.click(MANAGERS.stop_process, inputs=session_state)
199
 
200
  kill_button = gr.Button("Kill Task")
201
+ kill_button.click(MANAGERS.kill_process, inputs=session_state)
202
 
203
 
204
  with gr.Row():
 
208
 
209
  process_status = gr.Checkbox(label="Process Status", interactive=False)
210
  status_timer = gr.Timer(1.0, active=True)
211
+ status_timer.tick(update_process_status, inputs=session_state, outputs=process_status)
212
 
213
  with gr.Column():
214
  with gr.Accordion("Stages", open=True):
 
237
 
238
  log_timer = gr.Timer(1.0, active=True)
239
  log_timer.tick(
240
+ MANAGERS.read_and_get_output, inputs=session_state, outputs=[log_output, stages_table]
241
  )
242
  with gr.Tab("Evaluate", id=2):
243
  with gr.Row():
yourbench_space/config.py CHANGED
@@ -3,14 +3,14 @@ from loguru import logger
3
  from yourbench_space.utils import CONFIG_PATH
4
 
5
 
6
- def generate_base_config(hf_org, hf_prefix):
7
  """Creates the base config dictionary"""
8
  return {
9
  "hf_configuration": {
10
  "token": "$HF_TOKEN",
11
  "private": True,
12
  "hf_organization": hf_org,
13
- "hf_dataset_name": hf_prefix,
14
  },
15
  "model_list": [
16
  {
@@ -34,12 +34,12 @@ def generate_base_config(hf_org, hf_prefix):
34
  },
35
  "pipeline": {
36
  "ingestion": {
37
- "source_documents_dir": "/app/uploaded_files",
38
- "output_dir": "/app/ingested",
39
  "run": True,
40
  },
41
  "upload_ingest_to_hub": {
42
- "source_documents_dir": "/app/ingested",
43
  "run": True,
44
  },
45
  "summarization": {"run": True},
@@ -84,18 +84,18 @@ def generate_base_config(hf_org, hf_prefix):
84
  }
85
 
86
 
87
- def save_yaml_file(config):
88
  """Saves the given config dictionary to a YAML file"""
89
- with open(CONFIG_PATH, "w") as file:
90
  yaml.dump(config, file, default_flow_style=False, sort_keys=False)
91
- return CONFIG_PATH
92
 
93
 
94
- def generate_and_save_config(hf_org, hf_prefix):
95
  """Generates and saves the YAML configuration file"""
96
- logger.debug(f"Generating config with org: {hf_org}, prefix: {hf_prefix}")
97
- config = generate_base_config(hf_org, hf_prefix)
98
- file_path = save_yaml_file(config)
99
  logger.success(f"Config saved at: {file_path}")
100
  return file_path
101
 
 
3
  from yourbench_space.utils import CONFIG_PATH
4
 
5
 
6
+ def generate_base_config(hf_org: str, hf_dataset_name: str, session_uid: str):
7
  """Creates the base config dictionary"""
8
  return {
9
  "hf_configuration": {
10
  "token": "$HF_TOKEN",
11
  "private": True,
12
  "hf_organization": hf_org,
13
+ "hf_dataset_name": hf_dataset_name,
14
  },
15
  "model_list": [
16
  {
 
34
  },
35
  "pipeline": {
36
  "ingestion": {
37
+ "source_documents_dir": f"/app/{session_uid}/uploaded_files/",
38
+ "output_dir": f"/app/{session_uid}/ingested",
39
  "run": True,
40
  },
41
  "upload_ingest_to_hub": {
42
+ "source_documents_dir": f"/app/{session_uid}/ingested",
43
  "run": True,
44
  },
45
  "summarization": {"run": True},
 
84
  }
85
 
86
 
87
+ def save_yaml_file(config: str, path: str):
88
  """Saves the given config dictionary to a YAML file"""
89
+ with open(path, "w") as file:
90
  yaml.dump(config, file, default_flow_style=False, sort_keys=False)
91
+ return path
92
 
93
 
94
+ def generate_and_save_config(hf_org: str, hf_name: str, session_uid: str, config_path: str):
95
  """Generates and saves the YAML configuration file"""
96
+ logger.debug(f"Generating config with org: {hf_org}, dataset name: {hf_name}")
97
+ config = generate_base_config(hf_org, hf_name, session_uid)
98
+ file_path = save_yaml_file(config, config_path)
99
  logger.success(f"Config saved at: {file_path}")
100
  return file_path
101
 
yourbench_space/utils.py CHANGED
@@ -4,16 +4,12 @@ import re
4
  import pathlib
5
  import shutil
6
  import subprocess
 
7
  import pandas as pd
8
- from datasets import load_dataset, get_dataset_config_names
 
9
  from loguru import logger
10
- from typing import List
11
-
12
- UPLOAD_DIRECTORY = pathlib.Path("/app/uploaded_files")
13
- CONFIG_PATH = pathlib.Path("/app/yourbench_config.yml")
14
-
15
- # Ensure the upload directory exists
16
- UPLOAD_DIRECTORY.mkdir(parents=True, exist_ok=True)
17
 
18
  STAGES = [
19
  "ingestion",
@@ -28,14 +24,18 @@ STAGES = [
28
  ]
29
 
30
 
31
- def save_files(files: List[pathlib.Path]) -> str:
32
- """Save uploaded files to the UPLOAD_DIRECTORY safely"""
 
33
  saved_paths = []
34
 
35
  for file in files:
36
  try:
37
  source_path = pathlib.Path(file)
38
- destination_path = UPLOAD_DIRECTORY / source_path.name
 
 
 
39
 
40
  if not source_path.exists():
41
  print(f"File not found: {source_path}")
@@ -65,13 +65,10 @@ def update_dataset(stages, hf_org, hf_prefix):
65
  # Construct dataset name from config
66
  dataset_name = f"{hf_org}/{hf_prefix}"
67
 
68
- # TODO: add cache dir
69
- # Will be able to group everything in one pass once the names get homogeneized
70
- # TODO: make sure the questions are loaded with a set
71
  if "ingestion" in stages:
72
  # TODO: why is the key "ingested" and not "ingestion"? (does not match the other splits)
73
  ingestion_ds = load_dataset(dataset_name, name="ingested", split="train").select_columns("document_text")
74
- ingestion_df = pd.DataFrame([next(iter(ingestion_ds)) for _ in range(5)])
75
  if "summarization" in stages:
76
  summarization_ds = load_dataset(dataset_name, name="summarization", split="train", streaming=True).select_columns(['raw_document_summary', 'document_summary', 'summarization_model'])
77
  summarization_df = pd.DataFrame([next(iter(summarization_ds)) for _ in range(5)])
@@ -84,9 +81,55 @@ def update_dataset(stages, hf_org, hf_prefix):
84
 
85
  return (ingestion_df, summarization_df, single_hop_df, answers_df)
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  class SubprocessManager:
88
- def __init__(self, command):
89
- self.command = command
 
 
 
 
90
  self.process = None
91
  self.output_stream = io.StringIO()
92
  self.exit_code = None
 
4
  import pathlib
5
  import shutil
6
  import subprocess
7
+ import gradio as gr
8
  import pandas as pd
9
+ from collections import defaultdict
10
+ from datasets import load_dataset
11
  from loguru import logger
12
+ from typing import List, Union
 
 
 
 
 
 
13
 
14
  STAGES = [
15
  "ingestion",
 
24
  ]
25
 
26
 
27
+ def save_files(session_state: gr.State, files: List[pathlib.Path]) -> str:
28
+ """Save uploaded files to the UPLOAD_DIRECTORY/uuid safely"""
29
+ uuid = session_state.value
30
  saved_paths = []
31
 
32
  for file in files:
33
  try:
34
  source_path = pathlib.Path(file)
35
+ upload_directory_uuid = pathlib.Path(f"/app/{uuid}/uploaded_files")
36
+ # Ensure the upload directory exists
37
+ upload_directory_uuid.mkdir(parents=True, exist_ok=True)
38
+ destination_path = upload_directory_uuid / source_path.name
39
 
40
  if not source_path.exists():
41
  print(f"File not found: {source_path}")
 
65
  # Construct dataset name from config
66
  dataset_name = f"{hf_org}/{hf_prefix}"
67
 
 
 
 
68
  if "ingestion" in stages:
69
  # TODO: why is the key "ingested" and not "ingestion"? (does not match the other splits)
70
  ingestion_ds = load_dataset(dataset_name, name="ingested", split="train").select_columns("document_text")
71
+ ingestion_df = pd.DataFrame(ingestion_ds[0]) # only one row
72
  if "summarization" in stages:
73
  summarization_ds = load_dataset(dataset_name, name="summarization", split="train", streaming=True).select_columns(['raw_document_summary', 'document_summary', 'summarization_model'])
74
  summarization_df = pd.DataFrame([next(iter(summarization_ds)) for _ in range(5)])
 
81
 
82
  return (ingestion_df, summarization_df, single_hop_df, answers_df)
83
 
84
+
85
+ class SubprocessManagerGroup:
86
+ """Instanciates one manager per user (should be used as a singleton class)"""
87
+ def __init__(self):
88
+ self.managers: dict[str, SubprocessManager] = {}
89
+
90
+ @staticmethod
91
+ def grab_uuid(uid: Union[str, gr.State]):
92
+ """If a gradio session state is provided, we pull the uuid from its value - else we assume the str is the uuid"""
93
+ if isinstance(uid, gr.State):
94
+ uid = uid.value
95
+ return uid
96
+
97
+ def create(self, uid: Union[str, gr.State]):
98
+ uid = SubprocessManagerGroup.grab_uuid(uid)
99
+ self.managers[uid] = SubprocessManager(uid)
100
+
101
+ def get(self, uid: Union[str, gr.State]) -> "SubprocessManager":
102
+ uid = SubprocessManagerGroup.grab_uuid(uid)
103
+ return self.managers[uid]
104
+
105
+ def remove(self, uid: Union[str, gr.State]):
106
+ uid = SubprocessManagerGroup.grab_uuid(uid)
107
+ del self.managers[uid]
108
+
109
+ def start_process(self, uid: Union[str, gr.State]):
110
+ uid = SubprocessManagerGroup.grab_uuid(uid)
111
+ self.managers[uid].start_process()
112
+
113
+ def stop_process(self, uid: Union[str, gr.State]):
114
+ uid = SubprocessManagerGroup.grab_uuid(uid)
115
+ self.managers[uid].stop_process()
116
+
117
+ def kill_process(self, uid: Union[str, gr.State]):
118
+ uid = SubprocessManagerGroup.grab_uuid(uid)
119
+ self.managers[uid].kill_process()
120
+
121
+ def read_and_get_output(self, uid: Union[str, gr.State]):
122
+ uid = SubprocessManagerGroup.grab_uuid(uid)
123
+ self.managers[uid].read_and_get_output()
124
+
125
+
126
  class SubprocessManager:
127
+ def __init__(self, session_uid: str):
128
+ self.session_uid = session_uid
129
+ self.path = pathlib.Path(f"/app/{session_uid}")
130
+ self.path.mkdir(parents=True, exist_ok=True)
131
+ self.config_path = pathlib.Path(f"/app/{session_uid}/config.yml")
132
+ self.command = ["uv", "run", "yourbench", f"--config", self.config_path]
133
  self.process = None
134
  self.output_stream = io.StringIO()
135
  self.exit_code = None