Update main.py
main.py CHANGED
@@ -1,4 +1,3 @@
-# Import necessary libraries (ensure all required imports are at the top)
 import os
 import pandas as pd
 from fastapi import FastAPI, HTTPException, Body
@@ -28,7 +27,6 @@ filtered_dataset = None

 ALLOWED_CACHE_BASE = os.path.abspath("/app/.cache")

-# --- Define ErrorResponse if not already defined ---
 class ErrorResponse(BaseModel):
     detail: str

@@ -42,22 +40,19 @@ def load_questions():
     global filtered_dataset
     global questions_for_api
     global ground_truth_answers
     global task_file_paths

     tempo_filtered = []
-    # Clear existing data from previous runs or restarts
     questions_for_api.clear()
     ground_truth_answers.clear()
     task_file_paths.clear()

     logger.info("Starting to load and filter GAIA dataset (validation split)...")
     try:
-        # Load the specified split
         dataset = load_dataset("gaia-benchmark/GAIA", "2023_level1", split="validation", trust_remote_code=True)
         logger.info(f"GAIA dataset validation split loaded. Features: {dataset.features}")
     except Exception as e:
         logger.error(f"Failed to load GAIA dataset: {e}", exc_info=True)
-        # Depending on requirements, you might want to exit or raise a more specific error
         raise RuntimeError("Could not load the primary GAIA dataset.") from e

     # --- Filtering Logic based on Annotator Metadata ---
@@ -72,44 +67,37 @@ def load_questions():
                 try:
                     num_tools = int(num_tools_str)
                     num_steps = int(num_steps_str)
-                    # Apply filter conditions
                     if num_tools < tool_threshold and num_steps < step_threshold:
                         tempo_filtered.append(item)
                 except ValueError:
                     logger.warning(f"Skipping Task ID: {item.get('task_id', 'N/A')} - Could not convert tool/step count in metadata: tools='{num_tools_str}', steps='{num_steps_str}'.")
             else:
                 logger.warning(f"Skipping Task ID: {item.get('task_id', 'N/A')} - 'Number of tools' or 'Number of steps' missing in Metadata.")
         else:
-            # If metadata is essential for filtering, you might want to skip items without it
             logger.warning(f"Skipping Task ID: {item.get('task_id', 'N/A')} - Missing 'Annotator Metadata'.")

     filtered_dataset = tempo_filtered
     logger.info(f"Found {len(filtered_dataset)} questions matching the criteria (tools < {tool_threshold}, steps < {step_threshold}).")

     processed_count = 0
-    # --- Process filtered items for API and File Mapping ---
     for item in filtered_dataset:
         # Extract data from the dataset item
         task_id = item.get('task_id')
         original_question_text = item.get('Question')
         final_answer = item.get('Final answer')
         local_file_path = item.get('file_path')
         file_name = item.get('file_name')

-        # Validate essential fields needed for processing & ground truth
-        # Note: We proceed even if file path/name are missing, just won't map the file.
         if task_id and original_question_text and final_answer is not None:

             # 1. Create the dictionary to be exposed via the API
             # (Includes 'file_name' for info, but excludes 'file_path')
             processed_item = {
                 "task_id": str(task_id),
                 "question": str(original_question_text),
-                # Include other desired fields, using .get() for safety
                 "Level": item.get("Level"),
                 "file_name": file_name,
             }
-            # Optional: Remove keys with None values if you prefer cleaner JSON
             processed_item = {k: v for k, v in processed_item.items() if v is not None}

             questions_for_api.append(processed_item)
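Not part of the commit: a minimal, self-contained sketch of the metadata filter this hunk implements, assuming each GAIA item carries an 'Annotator Metadata' dict with 'Number of tools' and 'Number of steps' stored as strings (as the warning messages above suggest). The threshold values are made-up defaults; the real `tool_threshold` and `step_threshold` are defined elsewhere in the file and are not shown in this diff.

```python
# Illustrative only -- mirrors the filtering branch above; thresholds are assumed values.
def passes_filter(item: dict, tool_threshold: int = 3, step_threshold: int = 5) -> bool:
    metadata = item.get("Annotator Metadata")
    if not metadata:
        return False  # items without metadata are skipped, as in the hunk
    num_tools_str = metadata.get("Number of tools")
    num_steps_str = metadata.get("Number of steps")
    if num_tools_str is None or num_steps_str is None:
        return False  # missing counts are skipped with a warning in the real code
    try:
        return int(num_tools_str) < tool_threshold and int(num_steps_str) < step_threshold
    except ValueError:
        return False  # non-numeric counts are skipped with a warning in the real code
```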
@@ -122,21 +110,15 @@ def load_questions():
             # Log if the path from the dataset isn't absolute (might indicate issues)
             if not os.path.isabs(local_file_path):
                 logger.warning(f"Task {task_id}: Path '{local_file_path}' from dataset is not absolute. This might cause issues finding the file on the server.")

-                # Assuming WORKDIR is /app as per Dockerfile if paths are relative
-                # local_file_path = os.path.abspath(os.path.join("/app", local_file_path))

-            # Check if the file actually exists at the path ON THE SERVER
             if os.path.exists(local_file_path) and os.path.isfile(local_file_path):
-                # Path exists, store the mapping
                 task_file_paths[str(task_id)] = local_file_path
                 logger.debug(f"Stored file path mapping for task_id {task_id}: {local_file_path}")
             else:
-                # Path does *not* exist or is not a file on server filesystem
                 logger.warning(f"File path '{local_file_path}' for task_id {task_id} does NOT exist or is not a file on server. Mapping skipped.")
-
-
-            # Check which specific part was missing for better debugging
+        elif task_id:
+
             if not local_file_path and not file_name:
                 logger.debug(f"Task {task_id}: No 'file_path' or 'file_name' found in dataset item. No file mapping stored.")
             elif not local_file_path:
@@ -147,17 +129,14 @@ def load_questions():

             processed_count += 1
         else:
-            # Log skipping due to missing core fields (task_id, Question, Final answer)
             logger.warning(f"Skipping item processing due to missing essential fields: task_id={task_id}, has_question={original_question_text is not None}, has_answer={final_answer is not None}")

-    # Final summary logging
     logger.info(f"Successfully processed {processed_count} questions for the API.")
     logger.info(f"Stored file path mappings for {len(task_file_paths)} tasks.")

     if not questions_for_api:
         logger.error("CRITICAL: No valid questions were loaded after filtering and processing. API endpoints like /questions will fail.")

-        # raise RuntimeError("Failed to load mandatory question data after filtering.")



@@ -165,8 +144,7 @@ class Question(BaseModel):
     task_id: str
     question: str
     Level: Optional[str] = None
     file_name: Optional[str] = None
-    # file_path: Optional[str] = None # REMOVE file_path from the response model


 # --- The rest of your Pydantic models remain the same ---
@@ -176,7 +154,7 @@ class AnswerItem(BaseModel):

 class Submission(BaseModel):
     username: str = Field(..., description="Hugging Face username", min_length=1)
-    agent_code: str = Field(..., description="The Python class code for the agent"
+    agent_code: str = Field(..., description="The Python class code for the agent")
     answers: List[AnswerItem] = Field(..., description="List of answers submitted by the agent")

 class ScoreResponse(BaseModel):
@@ -190,15 +168,13 @@ class ScoreResponse(BaseModel):
 class ErrorResponse(BaseModel):
     detail: str

-# Keep other models as they are (AnswerItem, Submission, ScoreResponse, ErrorResponse)
-# ... (rest of the Pydantic models remain the same) ...
 class AnswerItem(BaseModel):
     task_id: str
     submitted_answer: str = Field(..., description="The agent's answer for the task_id")

 class Submission(BaseModel):
     username: str = Field(..., description="Hugging Face username", min_length=1)
     agent_code: str = Field(..., description="The Python class code for the agent", min_length=10)
     answers: List[AnswerItem] = Field(..., description="List of answers submitted by the agent")

 class ScoreResponse(BaseModel):
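Not part of the commit: the only functional change to these models is the closing parenthesis restored on `agent_code` in the previous hunk. Note that the models are defined twice in this file; at runtime the later definitions (the ones with `min_length=10`) shadow the earlier ones, so they are what FastAPI actually validates against. A minimal sketch of exercising the corrected models, assuming Pydantic is installed; the payload values are made up.

```python
# Illustrative only: exercises the corrected Submission/AnswerItem models.
from typing import List
from pydantic import BaseModel, Field

class AnswerItem(BaseModel):
    task_id: str
    submitted_answer: str = Field(..., description="The agent's answer for the task_id")

class Submission(BaseModel):
    username: str = Field(..., description="Hugging Face username", min_length=1)
    agent_code: str = Field(..., description="The Python class code for the agent", min_length=10)
    answers: List[AnswerItem] = Field(..., description="List of answers submitted by the agent")

payload = {
    "username": "example-user",             # hypothetical
    "agent_code": "class Agent:\n    ...",  # long enough to satisfy min_length=10
    "answers": [{"task_id": "abc123", "submitted_answer": "42"}],
}
print(Submission(**payload))
```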
@@ -231,18 +207,16 @@ async def startup_event():
         logger.info(f"Successfully loaded {len(questions_for_api)} questions.")
     except Exception as e:
         logger.error(f"CRITICAL ERROR DURING STARTUP while loading questions: {e}", exc_info=True)

-        # sys.exit(1) # Consider exiting if questions are critical


-# --- Your Endpoints ---
 @app.get("/files/{task_id}",
          summary="Get Associated File by Task ID",
          description="Downloads the file associated with the given task_id, if one exists and is mapped.",
          responses={
              200: {
                  "description": "File content.",
                  "content": {"*/*": {}}
              },
              403: {"model": ErrorResponse, "description": "Access denied (e.g., path traversal attempt)."},
              404: {"model": ErrorResponse, "description": "Task ID not found, no file associated, or file missing on server."},
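Also not part of the commit: a small client-side sketch of calling the `/files/{task_id}` endpoint declared here. The base URL and task id are placeholders, and `requests` is assumed to be available.

```python
# Illustrative only: downloads the file mapped to a task_id, if any.
import requests

BASE_URL = "http://localhost:7860"  # placeholder; use the actual Space URL
task_id = "abc123"                  # hypothetical task id

resp = requests.get(f"{BASE_URL}/files/{task_id}", timeout=30)
if resp.status_code == 200:
    with open("downloaded_task_file", "wb") as f:
        f.write(resp.content)
elif resp.status_code in (403, 404):
    print("No file served:", resp.json().get("detail"))
else:
    resp.raise_for_status()
```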
@@ -265,80 +239,93 @@ async def get_task_file(task_id: str):

     # --- CRUCIAL SECURITY CHECK ---
     try:

-        # --- local_file_path IS NOW DEFINED before being used ---
         abs_file_path = os.path.abspath(local_file_path)
         abs_base_path = ALLOWED_CACHE_BASE # Already absolute

-        # Check if the resolved file path starts with the allowed base directory
         if not abs_file_path.startswith(abs_base_path):
             logger.error(f"SECURITY ALERT: Path traversal attempt denied for task_id '{task_id}'. Path '{local_file_path}' resolves outside base '{abs_base_path}'.")
             raise HTTPException(status_code=403, detail="File access denied.")

-        # Check if the file exists at the resolved, validated path
         if not os.path.exists(abs_file_path) or not os.path.isfile(abs_file_path):
             logger.error(f"File not found on server for task_id '{task_id}' at expected path: {abs_file_path}")
             raise HTTPException(status_code=404, detail=f"File associated with task_id {task_id} not found on server disk.")

     except HTTPException as http_exc:
         raise http_exc
     except Exception as path_err:
         logger.error(f"Error resolving or checking path '{local_file_path}' for task_id '{task_id}': {path_err}", exc_info=True)
         raise HTTPException(status_code=500, detail="Server error validating file path.")
-    # --- END SECURITY CHECK ---

-    # Determine MIME type for the Content-Type header
-    mime_type, _ = mimetypes.guess_type(abs_file_path) # Ensure 'import mimetypes' is at the top
-    media_type = mime_type if mime_type else "application/octet-stream" # Default if unknown
+    mime_type, _ = mimetypes.guess_type(abs_file_path)
+    media_type = mime_type if mime_type else "application/octet-stream"

     file_name_for_download = os.path.basename(abs_file_path)

     logger.info(f"Serving file '{file_name_for_download}' (type: {media_type}) for task_id '{task_id}' from path: {abs_file_path}")

-    # Use FileResponse to efficiently stream the file
     return FileResponse(path=abs_file_path, media_type=media_type, filename=file_name_for_download)


-def update_huggingface_dataset(username: str, score: float):
+def update_huggingface_dataset(username: str, score: float, code_link: str):
+    """
+    Loads the dataset, updates the score and code link if the score is higher,
+    and pushes back to the Hugging Face Hub.
+
+    Args:
+        username: The username of the participant.
+        score: The new score achieved by the participant.
+        code_link: The link to the code submission associated with this score.
+
+    Returns:
+        True if the dataset was updated and pushed, False otherwise.
+
+    Raises:
+        HTTPException: If there's an error interacting with the dataset.
+    """
     try:
+        # Define the expected schema including the 'code' column
+        expected_columns = {
+            'username': 'str',
+            'score': 'float',
+            'timestamp': 'str',
+            'code': 'str'  # Added the code column
+        }
+
+        # 1. Attempt to load the dataset
+        logger.info(f"Attempting to load dataset '{HF_DATASET_ID}'...")
         ds_dict = None
+        df = None
         try:
-            hf_hub_download(repo_id=HF_DATASET_ID, filename="data/train-00000-of-00001.parquet", repo_type="dataset")
-            ds_dict = load_dataset(HF_DATASET_ID)
+            # Try downloading a file first to check existence without loading full dataset if large
+            # This might not be necessary if load_dataset handles non-existence gracefully
+            # hf_hub_download(repo_id=HF_DATASET_ID, filename="data/train-00000-of-00001.parquet", repo_type="dataset")
+            ds_dict = load_dataset(HF_DATASET_ID, trust_remote_code=True) # Added trust_remote_code=True if needed
             logger.info("Dataset loaded successfully.")
-            if "train"
-                df = pd.DataFrame({'username': pd.Series(dtype='str'),
-                                   'score': pd.Series(dtype='float'),
-                                   'timestamp': pd.Series(dtype='str')})
+            if "train" in ds_dict:
+                df = ds_dict['train'].to_pandas()
             else:
+                logger.warning(f"Dataset '{HF_DATASET_ID}' loaded but no 'train' split found. Creating structure.")
+                df = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in expected_columns.items()})

         except Exception as load_error:
-            # Create an empty DataFrame with the correct schema
-            df = pd.DataFrame({
-        df['score'] = df['score'].fillna(0.0)
+            logger.warning(f"Could not load dataset '{HF_DATASET_ID}' or it's empty/new ({load_error}). Will create structure.")
+            # Create an empty DataFrame with the correct schema if loading failed
+            df = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in expected_columns.items()})
+
+        # Ensure all expected columns exist, add if they don't
+        for col, dtype in expected_columns.items():
+            if col not in df.columns:
+                logger.warning(f"Column '{col}' not found in loaded data. Adding it.")
+                # Use appropriate default based on dtype if needed, though concat handles it
+                df[col] = pd.Series(dtype=dtype)
+
+        # Convert score column to numeric, coercing errors, and fill NaNs
+        df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0.0)
+        # Ensure other columns have correct types, fill NaNs for string columns
+        df['username'] = df['username'].astype(str).fillna('')
+        df['timestamp'] = df['timestamp'].astype(str).fillna('')
+        df['code'] = df['code'].astype(str).fillna('') # Ensure code column is string

         # 2. Find existing score for the user
         existing_entries = df[df['username'] == username]
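A side note on the security check above (not from the commit): the handler contains the path inside the allowed cache directory with `os.path.abspath` plus a `startswith` prefix test. A minimal standalone sketch of that technique follows; `os.path.commonpath` is a common alternative that also rules out the edge case where a sibling directory merely shares the prefix string (e.g. `/app/.cache2`).

```python
# Illustrative only: the containment check used in get_task_file, plus a commonpath variant.
import os

ALLOWED_CACHE_BASE = os.path.abspath("/app/.cache")

def is_inside_allowed_base(candidate: str) -> bool:
    abs_path = os.path.abspath(candidate)
    return abs_path.startswith(ALLOWED_CACHE_BASE)  # as in the diff

def is_inside_allowed_base_strict(candidate: str) -> bool:
    abs_path = os.path.abspath(candidate)
    return os.path.commonpath([ALLOWED_CACHE_BASE, abs_path]) == ALLOWED_CACHE_BASE

print(is_inside_allowed_base("/app/.cache/task/file.csv"))   # True
print(is_inside_allowed_base("/app/.cache/../secrets.txt"))  # False (abspath collapses the ..)
```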
@@ -347,51 +334,75 @@ def update_huggingface_dataset(username: str, score: float):

         if not existing_entries.empty:
             # User exists, find their highest score
-
-            max_existing_score = existing_entries['score'].max()
+            max_existing_score = existing_entries['score'].max() # Already numeric
             if score > max_existing_score:
-                logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating.")
-                # Remove old entries for this user
-                df = df[df['username'] != username]
-                # Add new entry
-                new_entry = pd.DataFrame([{
+                logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating entry.")
+                # Remove *all* old entries for this user to replace with the single best one
+                df = df[df['username'] != username].copy() # Use .copy() to avoid SettingWithCopyWarning
+                # Add new entry with score and code link
+                new_entry = pd.DataFrame([{
+                    'username': username,
+                    'score': score,
+                    'timestamp': current_timestamp,
+                    'code': code_link # Add the code link here
+                }])
                 df = pd.concat([df, new_entry], ignore_index=True)
                 needs_update = True
             else:
                 logger.info(f"New score {score} is not higher than existing max {max_existing_score} for {username}. No update needed.")
         else:
             # User does not exist, add them
-            logger.info(f"User {username} not found. Adding new entry.")
-            new_entry = pd.DataFrame([{
+            logger.info(f"User {username} not found. Adding new entry with score {score}.")
+            new_entry = pd.DataFrame([{
+                'username': username,
+                'score': score,
+                'timestamp': current_timestamp,
+                'code': code_link # Add the code link here
+            }])
             df = pd.concat([df, new_entry], ignore_index=True)
             needs_update = True

         # 3. Push updated data back to Hugging Face Hub if changes were made
         if needs_update:
-            # Ensure the schema
+            logger.info(f"Preparing to push updated dataset to '{HF_DATASET_ID}'...")
+
+            # Ensure final DataFrame columns match the expected schema exactly before converting
+            # Select and order columns just in case
+            df = df[list(expected_columns.keys())]
+            # Explicitly cast types again before creating Dataset object
+            for col, dtype in expected_columns.items():
+                # Handle potential pandas nullable types if necessary, default to standard types
+                if dtype == 'str':
+                    df[col] = df[col].astype(str).fillna('')
+                elif dtype == 'float':
+                    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0) # Ensure float conversion
+                # Add other type handling if needed
+
+            logger.info(f"Final DataFrame columns and types:\n{df.dtypes}")
+            logger.info(f"Sample data before push:\n{df.head().to_string()}")
+
+            # Create the Dataset object from the final DataFrame
+            updated_ds = Dataset.from_pandas(df)
+            # Wrap it in a DatasetDict (standard practice)
+            final_ds_dict = DatasetDict({'train': updated_ds})
+
+            logger.info(f"Dataset structure to push: {final_ds_dict}")
+
+            # *** UNCOMMENT THIS LINE TO ACTUALLY PUSH THE DATA ***
+            # final_ds_dict.push_to_hub(HF_DATASET_ID)
+            # logger.info(f"Successfully pushed updated dataset to '{HF_DATASET_ID}'.")
+            logger.warning("Dataset push to hub is currently commented out in the code. Uncomment the 'push_to_hub' line to enable leaderboard updates.")
             return True
         else:
+            logger.info("No changes needed, dataset not pushed.")
             return False # No update was pushed

     except Exception as e:
         logger.error(f"Error interacting with Hugging Face dataset '{HF_DATASET_ID}': {e}", exc_info=True)
-        # Re-raise the exception to be caught by the endpoint handler
+        # Re-raise the exception to be caught by the endpoint handler or calling function
+        # Adjust the exception type if not using FastAPI's HTTPException
        raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}")

-# --- API Endpoints (Modified response_model) ---

 @app.get("/questions",
          # Return a list of dictionaries with arbitrary keys/values
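Not part of the commit: the push path added in this hunk ends in `Dataset.from_pandas` → `DatasetDict` → `push_to_hub` (with the push itself still commented out). A minimal sketch of that flow, assuming the `datasets` library is installed, you are authenticated via `huggingface_hub`, and `your-username/agents-course-scores` is a placeholder repo id standing in for `HF_DATASET_ID`.

```python
# Illustrative only: mirrors the leaderboard push flow added in this hunk.
import pandas as pd
from datasets import Dataset, DatasetDict

df = pd.DataFrame([
    {"username": "example-user", "score": 80.0,
     "timestamp": "2024-01-01T00:00:00Z", "code": "https://example.com/agent"},
])

leaderboard = DatasetDict({"train": Dataset.from_pandas(df)})
# leaderboard.push_to_hub("your-username/agents-course-scores")  # placeholder repo id; requires a write token
print(leaderboard)
```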
@@ -464,27 +475,26 @@ async def submit_answers(submission: Submission = Body(...)):

     correct_count = 0
     total_attempted_in_payload = len(submission.answers)
     valid_attempted_count = 0
     processed_ids = set()

     for answer_item in submission.answers:
         task_id = str(answer_item.task_id)
         submitted = str(answer_item.submitted_answer)


         if task_id in processed_ids:
             logger.warning(f"Duplicate task_id '{task_id}' in submission from {submission.username}. Skipping.")
             continue
         processed_ids.add(task_id)



         if task_id not in ground_truth_answers:
             logger.warning(f"Task ID '{task_id}' submitted by {submission.username} not found in ground truth list. Skipping this answer.")
-            # Don't count this as a valid attempt for score calculation
             continue


         valid_attempted_count += 1
         ground_truth = ground_truth_answers[task_id]
         # Compare answers (case-insensitive, strip whitespace)
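The comparison line itself sits outside this hunk; the comment above only says it is case-insensitive and whitespace-stripped. The following is an assumed sketch of what such a check typically looks like, not the file's literal code.

```python
# Illustrative only: one plausible exact-match normalization for grading answers.
def answers_match(submitted: str, ground_truth: str) -> bool:
    return submitted.strip().lower() == ground_truth.strip().lower()

assert answers_match("  Paris ", "paris")
assert not answers_match("42", "43")
```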
@@ -494,8 +504,6 @@ async def submit_answers(submission: Submission = Body(...)):
         else:
             logger.debug(f"Incorrect answer for {task_id} from {submission.username}. Submitted: '{submitted}', Expected: '{ground_truth}'")

-
-    # Calculate score based on valid attempts AND total number of questions available
     if valid_attempted_count == 0:
         score = 0.0
         message = f"Submission received, but no valid/matching task IDs were found in the {total_attempted_in_payload} answers provided."
@@ -515,7 +523,7 @@ async def submit_answers(submission: Submission = Body(...)):

     # Update Hugging Face dataset
     try:
-        updated = update_huggingface_dataset(submission.username, score)
+        updated = update_huggingface_dataset(submission.username, score, submission.agent_code)
         if updated:
             message += " High score updated on leaderboard."
             logger.info(f"Leaderboard updated for {submission.username}.")