update production api url
Files changed:
- Dockerfile +3 -0
- backend/data/lighteval_results/lighteval_results.json +9 -9
- backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-28-57.341922.json +121 -0
- backend/routes/evaluation.py +2 -1
- backend/tasks/createBenchConfigFile.py +9 -7
- backend/tasks/evaluationTask.py +36 -30
- frontend/src/components/BenchmarkDisplay.jsx +2 -1
- frontend/src/components/BenchmarkEvaluation.jsx +15 -11
- frontend/src/components/BenchmarkGenerator.jsx +16 -11
- frontend/src/config/api.js +11 -0
- frontend/src/pages/BenchmarkDisplayPage.jsx +2 -1
Dockerfile
CHANGED
@@ -5,6 +5,9 @@ COPY frontend/package*.json ./
 RUN npm install
 COPY frontend/ ./
 
+# Set environment variable for production build
+ENV REACT_APP_NODE_ENV=production
+
 RUN npm run build
 
 # Build backend
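Note: Create React App inlines REACT_APP_* variables into the JavaScript bundle when npm run build executes, which is why the ENV instruction has to appear before the build step; exporting the variable at container runtime would have no effect on the already-compiled assets.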
backend/data/lighteval_results/lighteval_results.json
CHANGED
@@ -1,11 +1,4 @@
 [
-  {
-    "model": "deepseek-ai/DeepSeek-V3-0324",
-    "provider": "novita",
-    "accuracy": 1.0,
-    "execution_time": 54.32098197937012,
-    "status": "success"
-  },
   {
     "model": "Qwen/QwQ-32B",
     "provider": "sambanova",
@@ -14,14 +7,21 @@
     "status": "timeout"
   },
   {
-    "model": "
+    "model": "Qwen/Qwen2.5-72B-Instruct",
     "provider": "sambanova",
     "accuracy": 0.0,
     "execution_time": 60.0,
     "status": "timeout"
   },
   {
-    "model": "
+    "model": "deepseek-ai/DeepSeek-V3-0324",
+    "provider": "novita",
+    "accuracy": 0.0,
+    "execution_time": 60.0,
+    "status": "timeout"
+  },
+  {
+    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
     "provider": "sambanova",
     "accuracy": 0.0,
     "execution_time": 60.0,
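The regenerated results replace the earlier snapshot in which deepseek-ai/DeepSeek-V3-0324 completed in about 54 s with accuracy 1.0; in the new snapshot every listed model, including that one, is recorded as a timeout at the 60-second limit with accuracy 0.0.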
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-28-57.341922.json
ADDED
@@ -0,0 +1,121 @@
+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": null,
+    "max_samples": 30,
+    "job_id": 0,
+    "start_time": 191865.098197958,
+    "end_time": 191926.425937958,
+    "total_evaluation_time_secondes": "61.32774000000791",
+    "model_name": "deepseek-ai/DeepSeek-V3-0324",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": "",
+    "generation_parameters": {
+      "early_stopping": null,
+      "repetition_penalty": null,
+      "frequency_penalty": null,
+      "length_penalty": null,
+      "presence_penalty": null,
+      "max_new_tokens": null,
+      "min_new_tokens": null,
+      "seed": null,
+      "stop_tokens": null,
+      "temperature": null,
+      "top_k": null,
+      "min_p": null,
+      "top_p": null,
+      "truncate_prompt": null,
+      "response_format": null
+    }
+  },
+  "results": {
+    "custom|yourbench|0": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    },
+    "all": {
+      "accuracy": 1.0,
+      "accuracy_stderr": 0.0
+    }
+  },
+  "versions": {
+    "custom|yourbench|0": 0
+  },
+  "config_tasks": {
+    "custom|yourbench": {
+      "name": "yourbench",
+      "prompt_function": "yourbench_prompt",
+      "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
+      "hf_subset": "multi_hop_questions",
+      "metric": [
+        {
+          "metric_name": [
+            "accuracy"
+          ],
+          "higher_is_better": {
+            "accuracy": true
+          },
+          "category": "7",
+          "use_case": "1",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": {
+            "accuracy": "mean"
+          }
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train"
+      ],
+      "trust_dataset": true,
+      "evaluation_splits": [
+        "train"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 8192,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom"
+      ],
+      "original_num_docs": 34,
+      "effective_num_docs": 30,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|yourbench|0": {
+      "hashes": {
+        "hash_examples": "1b5afc5f13827f79",
+        "hash_full_prompts": "cd8c39c007643835",
+        "hash_input_tokens": "79ab129e9a18c6d6",
+        "hash_cont_tokens": "79ab129e9a18c6d6"
+      },
+      "truncated": 0,
+      "non_truncated": 30,
+      "padded": 0,
+      "non_padded": 30,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "b18e19e266a5bc51",
+      "hash_full_prompts": "1eaa15cbc4a17d04",
+      "hash_input_tokens": "05a66e44e190c178",
+      "hash_cont_tokens": "05a66e44e190c178"
+    },
+    "truncated": 0,
+    "non_truncated": 30,
+    "padded": 0,
+    "non_padded": 30,
+    "num_truncated_few_shots": 0
+  }
+}
backend/routes/evaluation.py
CHANGED
@@ -5,6 +5,7 @@ from tasks.evaluationTask import EvaluationTask
 from huggingface_hub import hf_hub_download
 import json
 from datetime import datetime
+import asyncio
 
 router = APIRouter(tags=["evaluation"])
 
@@ -51,7 +52,7 @@ async def evaluate_benchmark(data: Dict[str, Any]):
     active_evaluation_tasks[session_id] = evaluation_task
 
     # Start the evaluation asynchronously
-    evaluation_task.run()
+    asyncio.create_task(evaluation_task.run())
 
     # Retrieve the initial logs
     initial_logs = evaluation_task.get_logs()
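Replacing the blocking evaluation_task.run() call with asyncio.create_task schedules the coroutine on the already-running event loop (the route handler is async def) and returns at once, so the client receives the initial logs while the evaluation continues in the background. One caveat of this pattern: the loop keeps only a weak reference to the Task it creates, so if nothing else holds the Task object it can in principle be garbage-collected before it finishes. A minimal sketch of the safer variant, with illustrative names (background_tasks and start_in_background are not part of this codebase):

import asyncio

background_tasks = set()  # strong references to in-flight tasks (illustrative)

def start_in_background(coro):
    # Schedule the coroutine on the running loop and keep a reference
    # until it completes, then drop it via the done-callback.
    task = asyncio.create_task(coro)
    background_tasks.add(task)
    task.add_done_callback(background_tasks.discard)
    return task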
backend/tasks/createBenchConfigFile.py
CHANGED
@@ -278,13 +278,15 @@ class CreateBenchConfigTask:
         # Mark the task as running
         self.is_running_flag.set()
 
-        #
-
-
-
-
-
-
+        # Run the task directly without threading
+        try:
+            config_path = self._run_task(file_path)
+            return config_path
+        except Exception as e:
+            error_msg = f"Error generating configuration: {str(e)}"
+            self._add_log(f"[ERROR] {error_msg}")
+            self.mark_task_completed()
+            raise RuntimeError(error_msg)
 
     def is_running(self) -> bool:
         """
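Design note: with the threading removed, the surrounding method now blocks its caller until _run_task returns the config path; on failure the error is logged, mark_task_completed() is called so any log poller stops waiting, and the failure is re-raised as a RuntimeError for the caller to handle.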
backend/tasks/evaluationTask.py
CHANGED
@@ -13,6 +13,7 @@ import json
 from typing import List, Dict
 from tasks.get_model_providers import get_model_providers
 from huggingface_hub import HfApi
+import asyncio
 
 class EvaluationTask:
     """
@@ -60,7 +61,7 @@
         except Exception as e:
             print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to save results to Hub: {str(e)}")
 
-    def _run_lighteval(self, model_name: str, provider: str, dataset_name: str) -> dict:
+    async def _run_lighteval(self, model_name: str, provider: str, dataset_name: str) -> dict:
         start_time = time.time()
         print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation with {provider} provider for {model_name}")
 
@@ -88,21 +89,38 @@ TASKS_TABLE = [yourbench]
             temp_file_path,
             "--max-samples", "30",
             "--output-dir", "data/lighteval_results",
-            # "--save-details",
             "--no-push-to-hub"
         ]
 
         try:
-            # Run the command with environment variables and timeout of
-
-
-
+            # Run the command with environment variables and increased timeout of 300 seconds
+            process = await asyncio.create_subprocess_exec(
+                *cmd_args,
+                env=os.environ,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE
+            )
+
+            try:
+                await asyncio.wait_for(process.communicate(), timeout=60)
+            except asyncio.TimeoutError:
+                process.kill()
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
+                return {
+                    "model": model_name,
+                    "provider": provider,
+                    "accuracy": 0.0,
+                    "execution_time": 60.0,
+                    "status": "timeout"
+                }
+        except Exception as e:
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] Error running evaluation for {model_name}: {str(e)}")
             return {
                 "model": model_name,
                 "provider": provider,
                 "accuracy": 0.0,
-                "execution_time":
-                "status": "
+                "execution_time": time.time() - start_time,
+                "status": "error"
             }
 
         # Calculate execution time
@@ -138,12 +156,9 @@ TASKS_TABLE = [yourbench]
             "status": "parse_error"
         }
 
-    def run_parallel(self) -> List[Dict]:
+    async def run(self) -> None:
         """
-        Run the evaluation task
-
-        Returns:
-            List of results for each model
+        Run the evaluation task asynchronously
         """
         # Start global timer
         script_start_time = time.time()
@@ -164,14 +179,13 @@ TASKS_TABLE = [yourbench]
 
         print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting parallel evaluations")
 
-        # Run evaluations in parallel using
-
-
-
-
-
-
-        self.results = [future.result() for future in concurrent.futures.as_completed(futures)]
+        # Run evaluations in parallel using asyncio
+        tasks = []
+        for model_name, providers in model_providers:
+            if providers:  # Only run if providers are available
+                tasks.append(self._run_lighteval(model_name, providers[0], self.dataset_name))
+
+        self.results = await asyncio.gather(*tasks)
 
         # Calculate total script execution time
         total_time = time.time() - script_start_time
@@ -182,8 +196,6 @@ TASKS_TABLE = [yourbench]
 
         # Mark the task as completed
         self.is_completed = True
-
-        return self.results
 
     def get_logs(self) -> List[str]:
         """
@@ -201,10 +213,4 @@ TASKS_TABLE = [yourbench]
         Returns:
             True if completed, False otherwise
         """
-        return self.is_completed
-
-    def run(self) -> None:
-        """
-        Run the evaluation task (wrapper around run_parallel)
-        """
-        self.run_parallel()
+        return self.is_completed
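The switch from a thread pool to asyncio works because each evaluation is dominated by subprocess I/O: create_subprocess_exec yields to the event loop while the lighteval CLI runs, so asyncio.gather can drive all models concurrently in a single thread. Note a small inconsistency the diff carries: the comment mentions an increased timeout of 300 seconds, but wait_for is called with timeout=60, and the timeout branch reports a fixed execution_time of 60.0. A minimal standalone sketch of the pattern, with illustrative names rather than the module's actual code:

import asyncio
import os

async def run_with_timeout(cmd_args, timeout=60.0):
    # Spawn the CLI without blocking the event loop.
    process = await asyncio.create_subprocess_exec(
        *cmd_args,
        env=os.environ,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        process.kill()  # reap the child so it does not linger
        raise
    return stdout.decode()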
|
frontend/src/components/BenchmarkDisplay.jsx
CHANGED
@@ -16,6 +16,7 @@ import AssessmentIcon from "@mui/icons-material/Assessment";
|
|
16 |
import LinkIcon from "@mui/icons-material/Link";
|
17 |
import DownloadIcon from "@mui/icons-material/Download";
|
18 |
import CheckCircleIcon from "@mui/icons-material/CheckCircle";
|
|
|
19 |
|
20 |
/**
|
21 |
* Component to display benchmark information and evaluation button
|
@@ -67,7 +68,7 @@ const BenchmarkDisplay = ({
|
|
67 |
setIsDownloading(true);
|
68 |
try {
|
69 |
// Requête pour télécharger le dataset
|
70 |
-
const downloadUrl =
|
71 |
|
72 |
// Créer un élément a temporaire pour déclencher le téléchargement
|
73 |
const link = document.createElement("a");
|
|
|
16 |
import LinkIcon from "@mui/icons-material/Link";
|
17 |
import DownloadIcon from "@mui/icons-material/Download";
|
18 |
import CheckCircleIcon from "@mui/icons-material/CheckCircle";
|
19 |
+
import API_CONFIG from "../config/api";
|
20 |
|
21 |
/**
|
22 |
* Component to display benchmark information and evaluation button
|
|
|
68 |
setIsDownloading(true);
|
69 |
try {
|
70 |
// Requête pour télécharger le dataset
|
71 |
+
const downloadUrl = `${API_CONFIG.BASE_URL}/download-dataset/${sessionId}`;
|
72 |
|
73 |
// Créer un élément a temporaire pour déclencher le téléchargement
|
74 |
const link = document.createElement("a");
|
frontend/src/components/BenchmarkEvaluation.jsx
CHANGED
@@ -1,6 +1,7 @@
 import React, { useState, useEffect, useRef } from "react";
 import { Box, Typography, CircularProgress, Alert, Paper } from "@mui/material";
-import { useNavigate } from "react-router-dom";
+import { useNavigate, useSearchParams } from "react-router-dom";
+import API_CONFIG from "../config/api";
 
 // Starting messages with their timing
 const STARTING_MESSAGES = [
@@ -96,15 +97,18 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
 
     try {
       // Call API to start evaluation
-      const response = await fetch(
-
-
-
-
-
-
-
-
+      const response = await fetch(
+        `${API_CONFIG.BASE_URL}/evaluate-benchmark`,
+        {
+          method: "POST",
+          headers: {
+            "Content-Type": "application/json",
+          },
+          body: JSON.stringify({
+            session_id: sessionId,
+          }),
+        }
+      );
 
       const result = await response.json();
 
@@ -113,7 +117,7 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
       pollingIntervalRef.current = setInterval(async () => {
         try {
           const logsResponse = await fetch(
-
+            `${API_CONFIG.BASE_URL}/evaluation-logs/${sessionId}`
           );
 
           if (logsResponse.ok) {
frontend/src/components/BenchmarkGenerator.jsx
CHANGED
@@ -3,6 +3,8 @@ import { Box, Typography, CircularProgress, Alert, Paper } from "@mui/material";
 import PlayArrowIcon from "@mui/icons-material/PlayArrow";
 import AccessTimeIcon from "@mui/icons-material/AccessTime";
 import LogDisplay from "./LogDisplay";
+import { useNavigate, useSearchParams } from "react-router-dom";
+import API_CONFIG from "../config/api";
 
 // Define all benchmark steps in sequence
 const BENCHMARK_STEPS = [
@@ -172,15 +174,18 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
 
     try {
       // Call the API to generate the benchmark
-      const response = await fetch(
-
-
-
-
-
-
-
-
+      const response = await fetch(
+        `${API_CONFIG.BASE_URL}/generate-benchmark`,
+        {
+          method: "POST",
+          headers: {
+            "Content-Type": "application/json",
+          },
+          body: JSON.stringify({
+            session_id: sessionId,
+          }),
+        }
+      );
 
       const result = await response.json();
 
@@ -192,7 +197,7 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
       try {
         // Call the API to get the config logs
         const configLogsResponse = await fetch(
-
+          `${API_CONFIG.BASE_URL}/config-logs/${sessionId}`
         );
 
         if (configLogsResponse.ok) {
@@ -237,7 +242,7 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
      try {
        // Call the API to get the latest benchmark logs
        const logsResponse = await fetch(
-
+          `${API_CONFIG.BASE_URL}/benchmark-logs/${sessionId}`
        );
 
        if (logsResponse.ok) {
frontend/src/config/api.js
ADDED
@@ -0,0 +1,11 @@
+// API Configuration
+const API_CONFIG = {
+  // Use the current origin in production (Hugging Face Spaces)
+  // Fallback to localhost in development
+  BASE_URL:
+    process.env.REACT_APP_NODE_ENV === "production"
+      ? window.location.origin
+      : "http://localhost:3001",
+};
+
+export default API_CONFIG;
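Design note: this module centralizes the base-URL decision. With REACT_APP_NODE_ENV=production baked in at build time (the Dockerfile change above), every component resolves the API at window.location.origin, which on a Hugging Face Space is the same host that serves both the static frontend and the backend; during local development the frontend instead targets the backend on http://localhost:3001.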
frontend/src/pages/BenchmarkDisplayPage.jsx
CHANGED
@@ -3,6 +3,7 @@ import { Box, CircularProgress } from "@mui/material";
 import { useNavigate, useSearchParams, Navigate } from "react-router-dom";
 import Intro from "../components/Intro";
 import BenchmarkDisplay from "../components/BenchmarkDisplay";
+import API_CONFIG from "../config/api";
 
 function BenchmarkDisplayPage() {
   const navigate = useNavigate();
@@ -30,7 +31,7 @@ function BenchmarkDisplayPage() {
       sessionId
     );
     try {
-      const apiUrl =
+      const apiUrl = `${API_CONFIG.BASE_URL}/benchmark-questions/${sessionId}`;
       console.log("Appel API:", apiUrl);
 
       const response = await fetch(apiUrl);