Jofthomas committed on
Commit
3227abd
·
verified ·
1 Parent(s): 3f5f3ef

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +134 -126
main.py CHANGED
@@ -1,4 +1,3 @@
1
- # Import necessary libraries (ensure all required imports are at the top)
2
  import os
3
  import pandas as pd
4
  from fastapi import FastAPI, HTTPException, Body
@@ -28,7 +27,6 @@ filtered_dataset = None
28
 
29
  ALLOWED_CACHE_BASE = os.path.abspath("/app/.cache")
30
 
31
- # --- Define ErrorResponse if not already defined ---
32
  class ErrorResponse(BaseModel):
33
  detail: str
34
 
@@ -42,22 +40,19 @@ def load_questions():
42
  global filtered_dataset
43
  global questions_for_api
44
  global ground_truth_answers
45
- global task_file_paths # Declare modification of global
46
 
47
  tempo_filtered = []
48
- # Clear existing data from previous runs or restarts
49
  questions_for_api.clear()
50
  ground_truth_answers.clear()
51
- task_file_paths.clear() # Clear the file path mapping
52
 
53
  logger.info("Starting to load and filter GAIA dataset (validation split)...")
54
  try:
55
- # Load the specified split
56
  dataset = load_dataset("gaia-benchmark/GAIA", "2023_level1", split="validation", trust_remote_code=True)
57
  logger.info(f"GAIA dataset validation split loaded. Features: {dataset.features}")
58
  except Exception as e:
59
  logger.error(f"Failed to load GAIA dataset: {e}", exc_info=True)
60
- # Depending on requirements, you might want to exit or raise a more specific error
61
  raise RuntimeError("Could not load the primary GAIA dataset.") from e
62
 
63
  # --- Filtering Logic based on Annotator Metadata ---
@@ -72,44 +67,37 @@ def load_questions():
72
  try:
73
  num_tools = int(num_tools_str)
74
  num_steps = int(num_steps_str)
75
- # Apply filter conditions
76
  if num_tools < tool_threshold and num_steps < step_threshold:
77
- tempo_filtered.append(item) # Add the original item if it matches filter
78
  except ValueError:
79
  logger.warning(f"Skipping Task ID: {item.get('task_id', 'N/A')} - Could not convert tool/step count in metadata: tools='{num_tools_str}', steps='{num_steps_str}'.")
80
  else:
81
  logger.warning(f"Skipping Task ID: {item.get('task_id', 'N/A')} - 'Number of tools' or 'Number of steps' missing in Metadata.")
82
  else:
83
- # If metadata is essential for filtering, you might want to skip items without it
84
  logger.warning(f"Skipping Task ID: {item.get('task_id', 'N/A')} - Missing 'Annotator Metadata'.")
85
 
86
- filtered_dataset = tempo_filtered # Store the list of filtered original dataset items
87
  logger.info(f"Found {len(filtered_dataset)} questions matching the criteria (tools < {tool_threshold}, steps < {step_threshold}).")
88
 
89
  processed_count = 0
90
- # --- Process filtered items for API and File Mapping ---
91
  for item in filtered_dataset:
92
  # Extract data from the dataset item
93
  task_id = item.get('task_id')
94
  original_question_text = item.get('Question')
95
  final_answer = item.get('Final answer')
96
- local_file_path = item.get('file_path') # Server-local path from dataset
97
- file_name = item.get('file_name') # Filename from dataset
98
 
99
- # Validate essential fields needed for processing & ground truth
100
- # Note: We proceed even if file path/name are missing, just won't map the file.
101
  if task_id and original_question_text and final_answer is not None:
102
 
103
  # 1. Create the dictionary to be exposed via the API
104
  # (Includes 'file_name' for info, but excludes 'file_path')
105
  processed_item = {
106
  "task_id": str(task_id),
107
- "question": str(original_question_text), # Rename 'Question' -> 'question'
108
- # Include other desired fields, using .get() for safety
109
  "Level": item.get("Level"),
110
- "file_name": file_name, # Include filename for client info
111
  }
112
- # Optional: Remove keys with None values if you prefer cleaner JSON
113
  processed_item = {k: v for k, v in processed_item.items() if v is not None}
114
 
115
  questions_for_api.append(processed_item)
@@ -122,21 +110,15 @@ def load_questions():
122
  # Log if the path from the dataset isn't absolute (might indicate issues)
123
  if not os.path.isabs(local_file_path):
124
  logger.warning(f"Task {task_id}: Path '{local_file_path}' from dataset is not absolute. This might cause issues finding the file on the server.")
125
- # Depending on dataset guarantees, you might try making it absolute:
126
- # Assuming WORKDIR is /app as per Dockerfile if paths are relative
127
- # local_file_path = os.path.abspath(os.path.join("/app", local_file_path))
128
 
129
- # Check if the file actually exists at the path ON THE SERVER
130
  if os.path.exists(local_file_path) and os.path.isfile(local_file_path):
131
- # Path exists, store the mapping
132
  task_file_paths[str(task_id)] = local_file_path
133
  logger.debug(f"Stored file path mapping for task_id {task_id}: {local_file_path}")
134
  else:
135
- # Path does *not* exist or is not a file on server filesystem
136
  logger.warning(f"File path '{local_file_path}' for task_id {task_id} does NOT exist or is not a file on server. Mapping skipped.")
137
- # Log if file info was missing in the first place
138
- elif task_id: # Log only if we have a task_id to reference
139
- # Check which specific part was missing for better debugging
140
  if not local_file_path and not file_name:
141
  logger.debug(f"Task {task_id}: No 'file_path' or 'file_name' found in dataset item. No file mapping stored.")
142
  elif not local_file_path:
@@ -147,17 +129,14 @@ def load_questions():
147
 
148
  processed_count += 1
149
  else:
150
- # Log skipping due to missing core fields (task_id, Question, Final answer)
151
  logger.warning(f"Skipping item processing due to missing essential fields: task_id={task_id}, has_question={original_question_text is not None}, has_answer={final_answer is not None}")
152
 
153
- # Final summary logging
154
  logger.info(f"Successfully processed {processed_count} questions for the API.")
155
  logger.info(f"Stored file path mappings for {len(task_file_paths)} tasks.")
156
 
157
  if not questions_for_api:
158
  logger.error("CRITICAL: No valid questions were loaded after filtering and processing. API endpoints like /questions will fail.")
159
- # Consider raising an error if the application cannot function without questions
160
- # raise RuntimeError("Failed to load mandatory question data after filtering.")
161
 
162
 
163
 
@@ -165,8 +144,7 @@ class Question(BaseModel):
165
  task_id: str
166
  question: str
167
  Level: Optional[str] = None
168
- file_name: Optional[str] = None # Keep filename for info
169
- # file_path: Optional[str] = None # REMOVE file_path from the response model
170
 
171
 
172
  # --- The rest of your Pydantic models remain the same ---
@@ -176,7 +154,7 @@ class AnswerItem(BaseModel):
176
 
177
  class Submission(BaseModel):
178
  username: str = Field(..., description="Hugging Face username", min_length=1)
179
- agent_code: str = Field(..., description="The Python class code for the agent", min_length=10) # Basic check
180
  answers: List[AnswerItem] = Field(..., description="List of answers submitted by the agent")
181
 
182
  class ScoreResponse(BaseModel):
@@ -190,15 +168,13 @@ class ScoreResponse(BaseModel):
190
  class ErrorResponse(BaseModel):
191
  detail: str
192
 
193
- # Keep other models as they are (AnswerItem, Submission, ScoreResponse, ErrorResponse)
194
- # ... (rest of the Pydantic models remain the same) ...
195
  class AnswerItem(BaseModel):
196
  task_id: str
197
  submitted_answer: str = Field(..., description="The agent's answer for the task_id")
198
 
199
  class Submission(BaseModel):
200
  username: str = Field(..., description="Hugging Face username", min_length=1)
201
- agent_code: str = Field(..., description="The Python class code for the agent", min_length=10) # Basic check
202
  answers: List[AnswerItem] = Field(..., description="List of answers submitted by the agent")
203
 
204
  class ScoreResponse(BaseModel):
@@ -231,18 +207,16 @@ async def startup_event():
231
  logger.info(f"Successfully loaded {len(questions_for_api)} questions.")
232
  except Exception as e:
233
  logger.error(f"CRITICAL ERROR DURING STARTUP while loading questions: {e}", exc_info=True)
234
- # import sys
235
- # sys.exit(1) # Consider exiting if questions are critical
236
 
237
 
238
- # --- Your Endpoints ---
239
  @app.get("/files/{task_id}",
240
  summary="Get Associated File by Task ID",
241
  description="Downloads the file associated with the given task_id, if one exists and is mapped.",
242
  responses={
243
  200: {
244
  "description": "File content.",
245
- "content": {"*/*": {}} # Indicates response can be any file type
246
  },
247
  403: {"model": ErrorResponse, "description": "Access denied (e.g., path traversal attempt)."},
248
  404: {"model": ErrorResponse, "description": "Task ID not found, no file associated, or file missing on server."},
@@ -265,80 +239,93 @@ async def get_task_file(task_id: str):
265
 
266
  # --- CRUCIAL SECURITY CHECK ---
267
  try:
268
- # Resolve to absolute paths to prevent '..' tricks
269
- # --- local_file_path IS NOW DEFINED before being used ---
270
  abs_file_path = os.path.abspath(local_file_path)
271
  abs_base_path = ALLOWED_CACHE_BASE # Already absolute
272
 
273
- # Check if the resolved file path starts with the allowed base directory
274
  if not abs_file_path.startswith(abs_base_path):
275
  logger.error(f"SECURITY ALERT: Path traversal attempt denied for task_id '{task_id}'. Path '{local_file_path}' resolves outside base '{abs_base_path}'.")
276
  raise HTTPException(status_code=403, detail="File access denied.")
277
 
278
- # Check if the file exists at the resolved, validated path
279
  if not os.path.exists(abs_file_path) or not os.path.isfile(abs_file_path):
280
  logger.error(f"File not found on server for task_id '{task_id}' at expected path: {abs_file_path}")
281
  raise HTTPException(status_code=404, detail=f"File associated with task_id {task_id} not found on server disk.")
282
 
283
  except HTTPException as http_exc:
284
- raise http_exc # Re-raise our own security/404 exceptions
285
  except Exception as path_err:
286
  logger.error(f"Error resolving or checking path '{local_file_path}' for task_id '{task_id}': {path_err}", exc_info=True)
287
  raise HTTPException(status_code=500, detail="Server error validating file path.")
288
- # --- END SECURITY CHECK ---
289
 
290
- # Determine MIME type for the Content-Type header
291
- mime_type, _ = mimetypes.guess_type(abs_file_path) # Ensure 'import mimetypes' is at the top
292
- media_type = mime_type if mime_type else "application/octet-stream" # Default if unknown
293
 
294
- # Extract filename for the Content-Disposition header (suggests filename to browser/client)
 
 
295
  file_name_for_download = os.path.basename(abs_file_path)
296
 
297
  logger.info(f"Serving file '{file_name_for_download}' (type: {media_type}) for task_id '{task_id}' from path: {abs_file_path}")
298
 
299
- # Use FileResponse to efficiently stream the file
300
  return FileResponse(path=abs_file_path, media_type=media_type, filename=file_name_for_download)
301
- def update_huggingface_dataset(username: str, score: float):
302
- """Loads the dataset, updates the score if higher, and pushes back."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  try:
304
- # 1. Load the dataset
305
- logger.info(f"Loading dataset '{HF_DATASET_ID}'...")
 
 
 
 
 
 
 
 
306
  ds_dict = None
 
307
  try:
308
- # Use hf_hub_download to check if the parquet file exists, avoiding full dataset load error if empty
309
- # This assumes the dataset uses the default 'train' split and parquet format. Adjust if needed.
310
- hf_hub_download(repo_id=HF_DATASET_ID, filename="data/train-00000-of-00001.parquet", repo_type="dataset")
311
- ds_dict = load_dataset(HF_DATASET_ID)
312
  logger.info("Dataset loaded successfully.")
313
- if "train" not in ds_dict:
314
- logger.warning(f"Dataset '{HF_DATASET_ID}' does not contain a 'train' split. Creating one.")
315
- df = pd.DataFrame({'username': pd.Series(dtype='str'),
316
- 'score': pd.Series(dtype='float'),
317
- 'timestamp': pd.Series(dtype='str')})
318
  else:
319
- # Convert the 'train' split to a pandas DataFrame for easier manipulation
320
- df = ds_dict['train'].to_pandas()
321
-
322
- except Exception as load_error: # Catch broad exception for file not found or other loading issues
323
- logger.warning(f"Could not load dataset '{HF_DATASET_ID}' or it might be empty/new ({load_error}). Creating structure.")
324
- # Create an empty DataFrame with the correct schema
325
- df = pd.DataFrame({'username': pd.Series(dtype='str'),
326
- 'score': pd.Series(dtype='float'),
327
- 'timestamp': pd.Series(dtype='str')})
328
-
329
-
330
- # Ensure columns exist, add if they don't
331
- for col, dtype in [('username', 'str'), ('score', 'float'), ('timestamp', 'str')]:
332
- if col not in df.columns:
333
- logger.warning(f"Column '{col}' not found in dataset. Adding it.")
334
- df[col] = pd.Series(dtype=dtype)
335
-
336
-
337
- # Convert score column to numeric, coercing errors
338
- df['score'] = pd.to_numeric(df['score'], errors='coerce')
339
- # Fill potential NaN values in score with 0.0 before comparison/aggregation
340
- df['score'] = df['score'].fillna(0.0)
341
-
342
 
343
  # 2. Find existing score for the user
344
  existing_entries = df[df['username'] == username]
@@ -347,51 +334,75 @@ def update_huggingface_dataset(username: str, score: float):
347
 
348
  if not existing_entries.empty:
349
  # User exists, find their highest score
350
- # Handle potential NaN scores from coercion or previous bad data (though fillna above should help)
351
- max_existing_score = existing_entries['score'].max()
352
  if score > max_existing_score:
353
- logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating.")
354
- # Remove old entries for this user
355
- df = df[df['username'] != username]
356
- # Add new entry
357
- new_entry = pd.DataFrame([{'username': username, 'score': score, 'timestamp': current_timestamp}])
 
 
 
 
 
358
  df = pd.concat([df, new_entry], ignore_index=True)
359
  needs_update = True
360
  else:
361
  logger.info(f"New score {score} is not higher than existing max {max_existing_score} for {username}. No update needed.")
362
  else:
363
  # User does not exist, add them
364
- logger.info(f"User {username} not found. Adding new entry.")
365
- new_entry = pd.DataFrame([{'username': username, 'score': score, 'timestamp': current_timestamp}])
 
 
 
 
 
366
  df = pd.concat([df, new_entry], ignore_index=True)
367
  needs_update = True
368
 
369
  # 3. Push updated data back to Hugging Face Hub if changes were made
370
  if needs_update:
371
- logger.info(f"Pushing updated dataset to '{HF_DATASET_ID}'...")
372
- # Convert potentially modified DataFrame back to a Dataset object
373
- # Ensure the schema matches if columns were added/modified.
374
- # Use 'train' split convention.
375
- # Make sure the dtypes are correct before creating the Dataset
376
- df['username'] = df['username'].astype(str)
377
- df['score'] = df['score'].astype(float)
378
- df['timestamp'] = df['timestamp'].astype(str)
379
-
380
- updated_ds = DatasetDict({'train': Dataset.from_pandas(df)})
381
- logger.info(f"Dataset to push: {updated_ds}") # Log the dataset structure
382
- updated_ds.push_to_hub(HF_DATASET_ID) # Uncomment this line to enable leaderboard updates
383
- logger.warning("Dataset push to hub is currently commented out. Uncomment the line above to enable leaderboard updates.") # REMINDER
384
- logger.info("Dataset push simulated/attempted.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385
  return True
386
  else:
 
387
  return False # No update was pushed
388
 
389
  except Exception as e:
390
  logger.error(f"Error interacting with Hugging Face dataset '{HF_DATASET_ID}': {e}", exc_info=True)
391
- # Re-raise the exception to be caught by the endpoint handler
 
392
  raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}")
393
 
394
- # --- API Endpoints (Modified response_model) ---
395
 
396
  @app.get("/questions",
397
  # Return a list of dictionaries with arbitrary keys/values
@@ -464,27 +475,26 @@ async def submit_answers(submission: Submission = Body(...)):
464
 
465
  correct_count = 0
466
  total_attempted_in_payload = len(submission.answers)
467
- valid_attempted_count = 0 # Count attempts where task_id was valid
468
  processed_ids = set()
469
 
470
  for answer_item in submission.answers:
471
- task_id = str(answer_item.task_id) # Ensure string comparison
472
- submitted = str(answer_item.submitted_answer) # Ensure string comparison
473
 
474
- # Prevent duplicate task_id submissions in the same request
475
  if task_id in processed_ids:
476
  logger.warning(f"Duplicate task_id '{task_id}' in submission from {submission.username}. Skipping.")
477
- continue # Don't count this as an attempt for scoring
478
  processed_ids.add(task_id)
479
 
480
 
481
- # Check if task_id is valid (exists in our loaded ground truth)
482
  if task_id not in ground_truth_answers:
483
  logger.warning(f"Task ID '{task_id}' submitted by {submission.username} not found in ground truth list. Skipping this answer.")
484
- # Don't count this as a valid attempt for score calculation
485
  continue
486
 
487
- # If we reach here, the task_id is valid
488
  valid_attempted_count += 1
489
  ground_truth = ground_truth_answers[task_id]
490
  # Compare answers (case-insensitive, strip whitespace)
@@ -494,8 +504,6 @@ async def submit_answers(submission: Submission = Body(...)):
494
  else:
495
  logger.debug(f"Incorrect answer for {task_id} from {submission.username}. Submitted: '{submitted}', Expected: '{ground_truth}'")
496
 
497
-
498
- # Calculate score based on valid attempts AND total number of questions available
499
  if valid_attempted_count == 0:
500
  score = 0.0
501
  message = f"Submission received, but no valid/matching task IDs were found in the {total_attempted_in_payload} answers provided."
@@ -515,7 +523,7 @@ async def submit_answers(submission: Submission = Body(...)):
515
 
516
  # Update Hugging Face dataset
517
  try:
518
- updated = update_huggingface_dataset(submission.username, score)
519
  if updated:
520
  message += " High score updated on leaderboard."
521
  logger.info(f"Leaderboard updated for {submission.username}.")
 
 
1
  import os
2
  import pandas as pd
3
  from fastapi import FastAPI, HTTPException, Body
 
27
 
28
  ALLOWED_CACHE_BASE = os.path.abspath("/app/.cache")
29
 
 
30
  class ErrorResponse(BaseModel):
31
  detail: str
32
 
 
40
  global filtered_dataset
41
  global questions_for_api
42
  global ground_truth_answers
43
+ global task_file_paths
44
 
45
  tempo_filtered = []
 
46
  questions_for_api.clear()
47
  ground_truth_answers.clear()
48
+ task_file_paths.clear()
49
 
50
  logger.info("Starting to load and filter GAIA dataset (validation split)...")
51
  try:
 
52
  dataset = load_dataset("gaia-benchmark/GAIA", "2023_level1", split="validation", trust_remote_code=True)
53
  logger.info(f"GAIA dataset validation split loaded. Features: {dataset.features}")
54
  except Exception as e:
55
  logger.error(f"Failed to load GAIA dataset: {e}", exc_info=True)
 
56
  raise RuntimeError("Could not load the primary GAIA dataset.") from e
57
 
58
  # --- Filtering Logic based on Annotator Metadata ---
 
67
  try:
68
  num_tools = int(num_tools_str)
69
  num_steps = int(num_steps_str)
 
70
  if num_tools < tool_threshold and num_steps < step_threshold:
71
+ tempo_filtered.append(item)
72
  except ValueError:
73
  logger.warning(f"Skipping Task ID: {item.get('task_id', 'N/A')} - Could not convert tool/step count in metadata: tools='{num_tools_str}', steps='{num_steps_str}'.")
74
  else:
75
  logger.warning(f"Skipping Task ID: {item.get('task_id', 'N/A')} - 'Number of tools' or 'Number of steps' missing in Metadata.")
76
  else:
 
77
  logger.warning(f"Skipping Task ID: {item.get('task_id', 'N/A')} - Missing 'Annotator Metadata'.")
78
 
79
+ filtered_dataset = tempo_filtered
80
  logger.info(f"Found {len(filtered_dataset)} questions matching the criteria (tools < {tool_threshold}, steps < {step_threshold}).")
81
 
82
  processed_count = 0
 
83
  for item in filtered_dataset:
84
  # Extract data from the dataset item
85
  task_id = item.get('task_id')
86
  original_question_text = item.get('Question')
87
  final_answer = item.get('Final answer')
88
+ local_file_path = item.get('file_path')
89
+ file_name = item.get('file_name')
90
 
 
 
91
  if task_id and original_question_text and final_answer is not None:
92
 
93
  # 1. Create the dictionary to be exposed via the API
94
  # (Includes 'file_name' for info, but excludes 'file_path')
95
  processed_item = {
96
  "task_id": str(task_id),
97
+ "question": str(original_question_text),
 
98
  "Level": item.get("Level"),
99
+ "file_name": file_name,
100
  }
 
101
  processed_item = {k: v for k, v in processed_item.items() if v is not None}
102
 
103
  questions_for_api.append(processed_item)
 
110
  # Log if the path from the dataset isn't absolute (might indicate issues)
111
  if not os.path.isabs(local_file_path):
112
  logger.warning(f"Task {task_id}: Path '{local_file_path}' from dataset is not absolute. This might cause issues finding the file on the server.")
113
+
 
 
114
 
 
115
  if os.path.exists(local_file_path) and os.path.isfile(local_file_path):
 
116
  task_file_paths[str(task_id)] = local_file_path
117
  logger.debug(f"Stored file path mapping for task_id {task_id}: {local_file_path}")
118
  else:
 
119
  logger.warning(f"File path '{local_file_path}' for task_id {task_id} does NOT exist or is not a file on server. Mapping skipped.")
120
+ elif task_id:
121
+
 
122
  if not local_file_path and not file_name:
123
  logger.debug(f"Task {task_id}: No 'file_path' or 'file_name' found in dataset item. No file mapping stored.")
124
  elif not local_file_path:
 
129
 
130
  processed_count += 1
131
  else:
 
132
  logger.warning(f"Skipping item processing due to missing essential fields: task_id={task_id}, has_question={original_question_text is not None}, has_answer={final_answer is not None}")
133
 
 
134
  logger.info(f"Successfully processed {processed_count} questions for the API.")
135
  logger.info(f"Stored file path mappings for {len(task_file_paths)} tasks.")
136
 
137
  if not questions_for_api:
138
  logger.error("CRITICAL: No valid questions were loaded after filtering and processing. API endpoints like /questions will fail.")
139
+
 
140
 
141
 
142
 
 
144
  task_id: str
145
  question: str
146
  Level: Optional[str] = None
147
+ file_name: Optional[str] = None
 
148
 
149
 
150
  # --- The rest of your Pydantic models remain the same ---
 
154
 
155
  class Submission(BaseModel):
156
  username: str = Field(..., description="Hugging Face username", min_length=1)
157
+ agent_code: str = Field(..., description="The Python class code for the agent")
158
  answers: List[AnswerItem] = Field(..., description="List of answers submitted by the agent")
159
 
160
  class ScoreResponse(BaseModel):
 
168
  class ErrorResponse(BaseModel):
169
  detail: str
170
 
 
 
171
  class AnswerItem(BaseModel):
172
  task_id: str
173
  submitted_answer: str = Field(..., description="The agent's answer for the task_id")
174
 
175
  class Submission(BaseModel):
176
  username: str = Field(..., description="Hugging Face username", min_length=1)
177
+ agent_code: str = Field(..., description="The Python class code for the agent", min_length=10)
178
  answers: List[AnswerItem] = Field(..., description="List of answers submitted by the agent")
179
 
180
  class ScoreResponse(BaseModel):
 
207
  logger.info(f"Successfully loaded {len(questions_for_api)} questions.")
208
  except Exception as e:
209
  logger.error(f"CRITICAL ERROR DURING STARTUP while loading questions: {e}", exc_info=True)
210
+
 
211
 
212
 
 
213
  @app.get("/files/{task_id}",
214
  summary="Get Associated File by Task ID",
215
  description="Downloads the file associated with the given task_id, if one exists and is mapped.",
216
  responses={
217
  200: {
218
  "description": "File content.",
219
+ "content": {"*/*": {}}
220
  },
221
  403: {"model": ErrorResponse, "description": "Access denied (e.g., path traversal attempt)."},
222
  404: {"model": ErrorResponse, "description": "Task ID not found, no file associated, or file missing on server."},
 
239
 
240
  # --- CRUCIAL SECURITY CHECK ---
241
  try:
242
+
 
243
  abs_file_path = os.path.abspath(local_file_path)
244
  abs_base_path = ALLOWED_CACHE_BASE # Already absolute
245
 
 
246
  if not abs_file_path.startswith(abs_base_path):
247
  logger.error(f"SECURITY ALERT: Path traversal attempt denied for task_id '{task_id}'. Path '{local_file_path}' resolves outside base '{abs_base_path}'.")
248
  raise HTTPException(status_code=403, detail="File access denied.")
249
 
 
250
  if not os.path.exists(abs_file_path) or not os.path.isfile(abs_file_path):
251
  logger.error(f"File not found on server for task_id '{task_id}' at expected path: {abs_file_path}")
252
  raise HTTPException(status_code=404, detail=f"File associated with task_id {task_id} not found on server disk.")
253
 
254
  except HTTPException as http_exc:
255
+ raise http_exc
256
  except Exception as path_err:
257
  logger.error(f"Error resolving or checking path '{local_file_path}' for task_id '{task_id}': {path_err}", exc_info=True)
258
  raise HTTPException(status_code=500, detail="Server error validating file path.")
 
259
 
 
 
 
260
 
261
+ mime_type, _ = mimetypes.guess_type(abs_file_path)
262
+ media_type = mime_type if mime_type else "application/octet-stream"
263
+
264
  file_name_for_download = os.path.basename(abs_file_path)
265
 
266
  logger.info(f"Serving file '{file_name_for_download}' (type: {media_type}) for task_id '{task_id}' from path: {abs_file_path}")
267
 
 
268
  return FileResponse(path=abs_file_path, media_type=media_type, filename=file_name_for_download)
269
+
270
+ def update_huggingface_dataset(username: str, score: float, code_link: str):
271
+ """
272
+ Loads the dataset, updates the score and code link if the score is higher,
273
+ and pushes back to the Hugging Face Hub.
274
+
275
+ Args:
276
+ username: The username of the participant.
277
+ score: The new score achieved by the participant.
278
+ code_link: The link to the code submission associated with this score.
279
+
280
+ Returns:
281
+ True if the dataset was updated and pushed, False otherwise.
282
+
283
+ Raises:
284
+ HTTPException: If there's an error interacting with the dataset.
285
+ """
286
  try:
287
+ # Define the expected schema including the 'code' column
288
+ expected_columns = {
289
+ 'username': 'str',
290
+ 'score': 'float',
291
+ 'timestamp': 'str',
292
+ 'code': 'str' # Added the code column
293
+ }
294
+
295
+ # 1. Attempt to load the dataset
296
+ logger.info(f"Attempting to load dataset '{HF_DATASET_ID}'...")
297
  ds_dict = None
298
+ df = None
299
  try:
300
+ # Try downloading a file first to check existence without loading full dataset if large
301
+ # This might not be necessary if load_dataset handles non-existence gracefully
302
+ # hf_hub_download(repo_id=HF_DATASET_ID, filename="data/train-00000-of-00001.parquet", repo_type="dataset")
303
+ ds_dict = load_dataset(HF_DATASET_ID, trust_remote_code=True) # Added trust_remote_code=True if needed
304
  logger.info("Dataset loaded successfully.")
305
+ if "train" in ds_dict:
306
+ df = ds_dict['train'].to_pandas()
 
 
 
307
  else:
308
+ logger.warning(f"Dataset '{HF_DATASET_ID}' loaded but no 'train' split found. Creating structure.")
309
+ df = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in expected_columns.items()})
310
+
311
+ except Exception as load_error:
312
+ logger.warning(f"Could not load dataset '{HF_DATASET_ID}' or it's empty/new ({load_error}). Will create structure.")
313
+ # Create an empty DataFrame with the correct schema if loading failed
314
+ df = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in expected_columns.items()})
315
+
316
+ # Ensure all expected columns exist, add if they don't
317
+ for col, dtype in expected_columns.items():
318
+ if col not in df.columns:
319
+ logger.warning(f"Column '{col}' not found in loaded data. Adding it.")
320
+ # Use appropriate default based on dtype if needed, though concat handles it
321
+ df[col] = pd.Series(dtype=dtype)
322
+
323
+ # Convert score column to numeric, coercing errors, and fill NaNs
324
+ df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0.0)
325
+ # Ensure other columns have correct types, fill NaNs for string columns
326
+ df['username'] = df['username'].astype(str).fillna('')
327
+ df['timestamp'] = df['timestamp'].astype(str).fillna('')
328
+ df['code'] = df['code'].astype(str).fillna('') # Ensure code column is string
 
 
329
 
330
  # 2. Find existing score for the user
331
  existing_entries = df[df['username'] == username]
 
334
 
335
  if not existing_entries.empty:
336
  # User exists, find their highest score
337
+ max_existing_score = existing_entries['score'].max() # Already numeric
 
338
  if score > max_existing_score:
339
+ logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating entry.")
340
+ # Remove *all* old entries for this user to replace with the single best one
341
+ df = df[df['username'] != username].copy() # Use .copy() to avoid SettingWithCopyWarning
342
+ # Add new entry with score and code link
343
+ new_entry = pd.DataFrame([{
344
+ 'username': username,
345
+ 'score': score,
346
+ 'timestamp': current_timestamp,
347
+ 'code': code_link # Add the code link here
348
+ }])
349
  df = pd.concat([df, new_entry], ignore_index=True)
350
  needs_update = True
351
  else:
352
  logger.info(f"New score {score} is not higher than existing max {max_existing_score} for {username}. No update needed.")
353
  else:
354
  # User does not exist, add them
355
+ logger.info(f"User {username} not found. Adding new entry with score {score}.")
356
+ new_entry = pd.DataFrame([{
357
+ 'username': username,
358
+ 'score': score,
359
+ 'timestamp': current_timestamp,
360
+ 'code': code_link # Add the code link here
361
+ }])
362
  df = pd.concat([df, new_entry], ignore_index=True)
363
  needs_update = True
364
 
365
  # 3. Push updated data back to Hugging Face Hub if changes were made
366
  if needs_update:
367
+ logger.info(f"Preparing to push updated dataset to '{HF_DATASET_ID}'...")
368
+
369
+ # Ensure final DataFrame columns match the expected schema exactly before converting
370
+ # Select and order columns just in case
371
+ df = df[list(expected_columns.keys())]
372
+ # Explicitly cast types again before creating Dataset object
373
+ for col, dtype in expected_columns.items():
374
+ # Handle potential pandas nullable types if necessary, default to standard types
375
+ if dtype == 'str':
376
+ df[col] = df[col].astype(str).fillna('')
377
+ elif dtype == 'float':
378
+ df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0) # Ensure float conversion
379
+ # Add other type handling if needed
380
+
381
+ logger.info(f"Final DataFrame columns and types:\n{df.dtypes}")
382
+ logger.info(f"Sample data before push:\n{df.head().to_string()}")
383
+
384
+ # Create the Dataset object from the final DataFrame
385
+ updated_ds = Dataset.from_pandas(df)
386
+ # Wrap it in a DatasetDict (standard practice)
387
+ final_ds_dict = DatasetDict({'train': updated_ds})
388
+
389
+ logger.info(f"Dataset structure to push: {final_ds_dict}")
390
+
391
+ # *** UNCOMMENT THIS LINE TO ACTUALLY PUSH THE DATA ***
392
+ # final_ds_dict.push_to_hub(HF_DATASET_ID)
393
+ # logger.info(f"Successfully pushed updated dataset to '{HF_DATASET_ID}'.")
394
+ logger.warning("Dataset push to hub is currently commented out in the code. Uncomment the 'push_to_hub' line to enable leaderboard updates.")
395
  return True
396
  else:
397
+ logger.info("No changes needed, dataset not pushed.")
398
  return False # No update was pushed
399
 
400
  except Exception as e:
401
  logger.error(f"Error interacting with Hugging Face dataset '{HF_DATASET_ID}': {e}", exc_info=True)
402
+ # Re-raise the exception to be caught by the endpoint handler or calling function
403
+ # Adjust the exception type if not using FastAPI's HTTPException
404
  raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}")
405
 
 
406
 
407
  @app.get("/questions",
408
  # Return a list of dictionaries with arbitrary keys/values
 
475
 
476
  correct_count = 0
477
  total_attempted_in_payload = len(submission.answers)
478
+ valid_attempted_count = 0
479
  processed_ids = set()
480
 
481
  for answer_item in submission.answers:
482
+ task_id = str(answer_item.task_id)
483
+ submitted = str(answer_item.submitted_answer)
484
 
485
+
486
  if task_id in processed_ids:
487
  logger.warning(f"Duplicate task_id '{task_id}' in submission from {submission.username}. Skipping.")
488
+ continue
489
  processed_ids.add(task_id)
490
 
491
 
492
+
493
  if task_id not in ground_truth_answers:
494
  logger.warning(f"Task ID '{task_id}' submitted by {submission.username} not found in ground truth list. Skipping this answer.")
 
495
  continue
496
 
497
+
498
  valid_attempted_count += 1
499
  ground_truth = ground_truth_answers[task_id]
500
  # Compare answers (case-insensitive, strip whitespace)
 
504
  else:
505
  logger.debug(f"Incorrect answer for {task_id} from {submission.username}. Submitted: '{submitted}', Expected: '{ground_truth}'")
506
 
 
 
507
  if valid_attempted_count == 0:
508
  score = 0.0
509
  message = f"Submission received, but no valid/matching task IDs were found in the {total_attempted_in_payload} answers provided."
 
523
 
524
  # Update Hugging Face dataset
525
  try:
526
+ updated = update_huggingface_dataset(submission.username, score, submission.agent_code)
527
  if updated:
528
  message += " High score updated on leaderboard."
529
  logger.info(f"Leaderboard updated for {submission.username}.")