import os
import json
import zipfile
from pathlib import Path

import pandas as pd
import torch
import requests
from datasets import Dataset
from torch.utils.data import DataLoader
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)
from sklearn.preprocessing import MultiLabelBinarizer

from .config import TAG_NAMES, DEVICE, SPACE_URL, EVAL_LIMIT
from .globals import global_model, global_tokenizer
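
# Assumed contract of .config (inferred from how the names are used below):
#   TAG_NAMES  - ordered list of label names used for multi-hot encoding
#   DEVICE     - torch device string such as "cuda" or "cpu"
#   SPACE_URL  - base URL of the hosted evaluation Space
#   EVAL_LIMIT - default cap on the number of test files to load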

def _load_data(test_data_path):
    # The EVAL_LIM environment variable overrides the configured default limit
    eval_limit = os.getenv("EVAL_LIM") or EVAL_LIMIT
    test_data_path = Path(__file__).parent / test_data_path
    # Check file existence
    if not os.path.exists(test_data_path):
        raise FileNotFoundError(
            f"ZIP file not found at {test_data_path}. "
            f"Current directory: {os.listdir(Path(__file__).parent)}"
        )
    
    if not zipfile.is_zipfile(test_data_path):
        raise zipfile.BadZipFile(f"File is not a valid zip archive: {test_data_path}")

    data = []
    features = ["prob_desc_description", "prob_desc_input_spec", "prob_desc_output_spec"]
    cols = features + ["tags"]
    
    try:
        with zipfile.ZipFile(test_data_path, 'r') as zip_file:
            # Verify zip contents
            names = zip_file.namelist()
            if not names:
                raise ValueError("Empty zip archive - no files found")
                
            # Process up to eval_limit entries, skipping the first name
            # (assumed to be the archive's root directory entry)
            for name in names[1:1 + int(eval_limit)]:
                try:
                    with zip_file.open(name) as f:
                        content = f.read()
                        d = json.loads(content)
                        
                        # Validate required fields
                        if not all(col in d for col in cols):
                            missing = [col for col in cols if col not in d]
                            raise KeyError(f"Missing required fields in {name}: {missing}")
                            
                        row = [d[c] for c in cols]
                        data.append(row)
                        
                except json.JSONDecodeError as e:
                    raise ValueError(f"Invalid JSON in file {name}: {e}") from e
                except Exception as e:
                    raise RuntimeError(f"Error processing {name}: {e}") from e

    except zipfile.BadZipFile as e:
        raise zipfile.BadZipFile(f"Corrupted zip file: {e}") from e
    except Exception as e:
        raise RuntimeError(f"Unexpected error loading data: {e}") from e

    if not data:
        raise ValueError("No valid data files found in zip archive")
        
    return pd.DataFrame(data, columns=cols)
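
# For reference, a minimal record that _load_data accepts could look like the
# following (hypothetical values, shown only to document the expected schema):
#
#   {
#       "prob_desc_description": "Given an array of n integers ...",
#       "prob_desc_input_spec": "The first line contains an integer n ...",
#       "prob_desc_output_spec": "Print a single integer ...",
#       "tags": ["math", "greedy"]
#   }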

def _preprocessing(df):
    mlb = MultiLabelBinarizer(classes=TAG_NAMES)
    tags_to_encode = ['math', 'graphs', 'strings', 'number theory',
                      'trees', 'geometry', 'games', 'probabilities']

    # Keep only the tags we classify; rows left with no tags become ['other']
    df['tags_filtered'] = [[tag for tag in tags if tag in tags_to_encode] for tags in df["tags"]]
    empty_mask = df['tags_filtered'].apply(len) == 0
    df.loc[empty_mask, 'tags_filtered'] = df.loc[empty_mask, 'tags_filtered'].apply(lambda _: ['other'])
    encoded_tags = mlb.fit_transform(df['tags_filtered'])

    # Create a new DataFrame with one-hot encoded columns
    encoded_df = pd.DataFrame(encoded_tags, columns=mlb.classes_)

    # Concatenate the encoded tags with the original DataFrame
    df = pd.concat([df, encoded_df], axis=1)
    texts = df["prob_desc_description"].values.tolist()
    labels = df[TAG_NAMES].values.tolist()

    # Resulting shape:
    #   texts  = ["text1", "text2", ...]                          # list of problem descriptions
    #   labels = [[0,1,0,0,1,0,1,1,0], [0,1,1,0,0,0,0,0,0], ...]  # multi-hot label vectors

    df = pd.DataFrame({'text': texts, 'labels': labels})
    return df
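
# A minimal sketch of the encoding step above, assuming TAG_NAMES is the eight
# filtered tags followed by 'other' (that ordering is an assumption made for
# illustration):
#
#   mlb = MultiLabelBinarizer(classes=['math', 'graphs', 'strings',
#                                      'number theory', 'trees', 'geometry',
#                                      'games', 'probabilities', 'other'])
#   mlb.fit_transform([['math', 'graphs'], ['other']])
#   # -> array([[1, 1, 0, 0, 0, 0, 0, 0, 0],
#   #           [0, 0, 0, 0, 0, 0, 0, 0, 1]])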



def evaluate_batch(file_path, hf_repo, backend="local", hf_token=None):
    if backend == "local":
        return _evaluate_local(file_path, hf_repo)
    elif backend == "hf":
        return _evaluate_hf_api(file_path, hf_token)
    else:
        raise ValueError(f"Unknown backend: {backend}")
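
# Example usage (repo id and file names are hypothetical, for illustration):
#
#   metrics = evaluate_batch("test_data.zip", "your-user/qwen-classifier",
#                            backend="local")
#   metrics = evaluate_batch("test_data.zip", hf_repo=None, backend="hf",
#                            hf_token=os.getenv("HF_TOKEN"))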

def _evaluate_local(test_data_path, hf_repo):
    global global_model, global_tokenizer
    
    # Lazy-loading to avoid slow startup
    if global_model is None:
        from .model import QwenClassifier
        from transformers import AutoTokenizer
        
        global_model = QwenClassifier.from_pretrained(hf_repo).eval()
        global_tokenizer = AutoTokenizer.from_pretrained(hf_repo)
    df = _load_data(test_data_path)
    df = _preprocessing(df)

    hf_dataset = Dataset.from_pandas(df)

    # Tokenize each description, padding/truncating to a fixed 512-token window
    def tokenize_function(examples):
        return global_tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    # map() adds 'input_ids' and 'attention_mask' columns alongside the originals
    dataset = hf_dataset.map(tokenize_function, batched=True)

    # Expose only the columns the model consumes, as torch tensors
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Order doesn't matter for evaluation, so leave shuffling off
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    global_model.eval()
    all_preds = []
    all_labels = []

    # Move the model to the target device once, before the loop
    global_model = global_model.to(DEVICE)
    print(f"EVALUATION RUNNING ON {global_model.device}")

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            labels = batch["labels"].type(torch.float32)

            logits = global_model(batch["input_ids"], batch["attention_mask"])

            preds = torch.sigmoid(logits).cpu().numpy() > 0.5
            labels = labels.cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels)

    val_acc = accuracy_score(all_labels, all_preds)
    val_prec = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    val_rec = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    val_f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)
    val_prec_per_class = precision_score(all_labels, all_preds, average=None, zero_division=0)
    val_rec_per_class = recall_score(all_labels, all_preds, average=None, zero_division=0)
    val_f1_per_class = f1_score(all_labels, all_preds, average=None, zero_division=0)

    # Report whole-number percentages (int() truncates toward zero)
    metrics = {
        'Accuracy': int(100 * val_acc),
        'Precision': int(100 * val_prec),
        'Recall': int(100 * val_rec),
        'F1': int(100 * val_f1),
        'Precision_per_class': (100 * val_prec_per_class).astype(int),
        'Recall_per_class': (100 * val_rec_per_class).astype(int),
        'F1_per_class': (100 * val_f1_per_class).astype(int),
    }
    # report = classification_report(all_labels, all_preds, target_names=TAG_NAMES, zero_division=0)

    return metrics
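
# The returned metrics are whole-number percentages, e.g. (values illustrative):
#   {'Accuracy': 71, 'Precision': 68, 'Recall': 64, 'F1': 66,
#    'Precision_per_class': array([70, 66, 63, 72, 61, 58, 67, 60, 74]), ...}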


def _evaluate_hf_api(file_path, hf_token=None):
    try:
        headers = {"Content-Type": "application/json"}
        if hf_token:
            headers["Authorization"] = f"Bearer {hf_token}"
        response = requests.post(
            f"{SPACE_URL}/evaluate",
            json={"file_path": file_path},  # matches the server-side Pydantic model
            headers=headers,
            timeout=180
        )
        response.raise_for_status()  # Raise HTTP errors
        return response.json()
    except requests.exceptions.RequestException as e:
        # e.response may be None (e.g. connection errors), so guard before .text
        detail = e.response.text if getattr(e, 'response', None) is not None else ''
        raise ValueError(f"API Error: {e}\nResponse: {detail}") from e
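
# The Space is assumed to expose POST {SPACE_URL}/evaluate, accepting a JSON
# body of the form {"file_path": "..."} and returning the metrics dict that
# _evaluate_local produces. This contract is inferred from the client code
# above; verify it against the Space's server implementation.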