import time
import pandas as pd
from datasets import load_dataset
from fastapi import HTTPException
import logging

logger = logging.getLogger(__name__)

DATASET_NAME = "agents-course/unit4-students-scores"
CACHE_DURATION_SECONDS = 60  # Cache data for 60 seconds

# Simple in-memory cache
cached_data = None
last_cache_time = 0

def get_sorted_leaderboard_data():
    """
    Loads data from Hugging Face dataset, sorts it, and caches the result.
    Returns the sorted data as a list of dictionaries.
    """
    global cached_data, last_cache_time
    current_time = time.time()

    # Check cache validity
    if cached_data is not None and (current_time - last_cache_time) < CACHE_DURATION_SECONDS:
        logger.info("Returning cached leaderboard data.")
        return cached_data

    logger.info(f"Cache expired or empty. Fetching fresh data from {DATASET_NAME}...")
    try:
        # Load the dataset
        dataset = load_dataset(DATASET_NAME, split="train")

        # Convert to pandas DataFrame for easier sorting
        df = pd.DataFrame(dataset)

        # Ensure required columns exist
        required_columns = ['username', 'score', 'timestamp', 'code']
        if not all(col in df.columns for col in required_columns):
            missing = [col for col in required_columns if col not in df.columns]
            raise ValueError(f"Dataset missing required columns: {missing}")

        # Convert timestamp to datetime objects for proper sorting
        # Handle potential errors during conversion
        df['timestamp_dt'] = pd.to_datetime(df['timestamp'], errors='coerce')

        # Drop rows where timestamp conversion failed
        df.dropna(subset=['timestamp_dt'], inplace=True)

        # Sort by score (descending) and then by timestamp (ascending)
        df_sorted = df.sort_values(by=['score', 'timestamp_dt'], ascending=[False, True])

        # Select only the columns needed for the frontend + code
        # Convert DataFrame to list of dictionaries (JSON serializable)
        # Use original timestamp string for display consistency if needed,
        # but sorting was done on datetime objects.
        leaderboard = df_sorted[['username', 'score', 'timestamp', 'code']].to_dict(orient='records')

        # Update cache
        cached_data = leaderboard
        last_cache_time = current_time
        logger.info(f"Successfully fetched and cached data. {len(leaderboard)} entries.")

        return cached_data

    except Exception as e:
        logger.error(f"Error loading or processing dataset {DATASET_NAME}: {e}", exc_info=True)
        # Re-raise as HTTPException so FastAPI returns a proper error response
        raise HTTPException(status_code=500, detail=f"Failed to load or process leaderboard data: {e}")

# Optional: Add an __init__.py file in the app directory
# Create an empty file named app/__init__.py
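
# Example wiring (a minimal sketch): because get_sorted_leaderboard_data() raises
# HTTPException on failure, it can be called directly from a FastAPI route handler.
# The module path "app.leaderboard" and the "/leaderboard" route below are
# assumptions for illustration, not part of this file:
#
#   from fastapi import FastAPI
#   from app.leaderboard import get_sorted_leaderboard_data  # hypothetical module path
#
#   app = FastAPI()
#
#   @app.get("/leaderboard")
#   def read_leaderboard():
#       # FastAPI serializes the returned list of dicts to JSON automatically.
#       return get_sorted_leaderboard_data()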