File size: 3,907 Bytes
6a1e686
 
8524cf7
2f3df87
 
 
8524cf7
 
 
 
 
2f3df87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8524cf7
 
 
2f3df87
8524cf7
 
 
 
 
 
 
6a1e686
8524cf7
 
 
 
 
 
 
 
 
 
b0cd906
 
8524cf7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import zipfile
import json
import pandas as pd
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from .config import TAG_NAMES, DEVICE
from .predict import predict_batch

def load_data(test_data_path):
    """Load every JSON sample from a zip archive into a DataFrame.

    Each non-directory member of the archive is parsed as one JSON object
    and contributes one row to the result.

    Args:
        test_data_path: Path to the zip archive. When falsy (``None`` or
            ``""``), the historical default
            ``'code_classification_dataset.zip'`` is used.

    Returns:
        A ``pandas.DataFrame`` with the columns ``prob_desc_description``,
        ``prob_desc_input_spec``, ``prob_desc_output_spec`` and ``tags``,
        one row per archive member (empty if the archive has no files).

    Raises:
        KeyError: If a sample lacks one of the expected keys.
        json.JSONDecodeError: If a member is not valid JSON.
    """
    archive_path = test_data_path or 'code_classification_dataset.zip'

    features = ["prob_desc_description", "prob_desc_input_spec", "prob_desc_output_spec"]
    cols = features + ["tags"]

    rows = []
    # Context managers guarantee the archive and each member handle are closed.
    with zipfile.ZipFile(archive_path) as archive:
        for member in archive.infolist():
            # Directory entries (e.g. the top-level folder) carry no sample.
            if member.is_dir():
                continue
            with archive.open(member) as handle:
                sample = json.loads(handle.read())
            rows.append([sample[c] for c in cols])

    return pd.DataFrame(rows, columns=cols)

def preprocessing(df):
    """Convert a raw sample frame into the ``text`` / ``labels`` layout.

    Args:
        df: DataFrame with a ``prob_desc_description`` column and either
            one column per entry of ``TAG_NAMES`` (already multi-hot
            encoded) or a single ``tags`` column holding, per row, the
            collection of tag names for that sample.

    Returns:
        A DataFrame with two columns: ``text`` (the problem description)
        and ``labels`` (a list with one value per ``TAG_NAMES`` entry, in
        ``TAG_NAMES`` order).

    Raises:
        KeyError: If neither the per-tag columns nor ``tags`` are present.
    """
    texts = df["prob_desc_description"].tolist()
    tag_names = list(TAG_NAMES)

    if all(tag in df.columns for tag in tag_names):
        # Frame already carries one indicator column per tag.
        labels = df[tag_names].values.tolist()
    elif "tags" in df.columns:
        # Build multi-hot vectors from the raw tag collections.
        # Assumes each cell is a list/tuple/set of tag-name strings;
        # anything else (e.g. NaN for a missing value) counts as "no tags".
        labels = []
        for sample_tags in df["tags"]:
            if isinstance(sample_tags, (list, tuple, set, frozenset)):
                present = set(sample_tags)
            else:
                present = set()
            labels.append([int(tag in present) for tag in tag_names])
    else:
        raise KeyError(
            "DataFrame must contain either one column per tag in TAG_NAMES "
            "or a 'tags' column"
        )

    return pd.DataFrame({'text': texts, 'labels': labels})



def evaluate_batch(text, hf_repo, backend="local", hf_token=None):
    """Run an evaluation with the selected backend.

    Args:
        text: Forwarded unchanged to the backend. For the ``"local"``
            backend it is passed as the ``test_data_path`` argument of
            ``_evaluate_local`` (the name ``text`` is kept for
            compatibility with existing callers).
        hf_repo: Repository identifier handed to the local backend for
            loading the model and tokenizer.
        backend: ``"local"`` or ``"hf"``.
        hf_token: Token forwarded to the ``"hf"`` backend.

    Returns:
        Whatever the selected backend returns.

    Raises:
        NotImplementedError: If ``backend == "hf"`` but no
            ``_evaluate_hf_api`` function exists in this module.
        ValueError: If ``backend`` is not a known backend name.
    """
    if backend == "local":
        return _evaluate_local(text, hf_repo)

    if backend == "hf":
        # Resolve the helper lazily so a missing implementation produces a
        # clear error instead of a bare NameError.
        hf_evaluator = globals().get("_evaluate_hf_api")
        if hf_evaluator is None:
            raise NotImplementedError(
                "The 'hf' evaluation backend is not implemented in this module"
            )
        return hf_evaluator(text, hf_token)

    raise ValueError(f"Unknown backend: {backend}")

# Module-level cache so the model and tokenizer are loaded at most once per
# process. They must exist before `_evaluate_local` reads them via `global`.
local_model = None
local_tokenizer = None


def _evaluate_local(test_data_path, hf_repo):
    """Evaluate the locally loaded classifier on a zipped test set.

    The model and tokenizer are loaded from ``hf_repo`` on first use and
    cached in module globals; later calls reuse the cached objects even if
    a different ``hf_repo`` is passed.

    Args:
        test_data_path: Path of the test archive, forwarded to ``load_data``.
        hf_repo: Identifier passed to ``QwenClassifier.from_pretrained`` and
            ``AutoTokenizer.from_pretrained``.

    Returns:
        A ``(metrics, report)`` tuple. ``metrics`` is a dict with the keys
        ``val_acc``, ``val_prec``, ``val_rec``, ``val_f1`` (macro-averaged
        scalars) and ``val_prec_per_class``, ``val_rec_per_class``,
        ``val_f1_per_class`` (arrays, one entry per label). ``report`` is
        the text produced by ``classification_report``.

    Raises:
        ValueError: If the test set yields no samples.
    """
    global local_model, local_tokenizer

    # Lazy-loading to avoid slow startup.
    if local_model is None or local_tokenizer is None:
        from .model import QwenClassifier
        from transformers import AutoTokenizer

        # Batches are moved to DEVICE below, so the model must live there too.
        # Assumes QwenClassifier is a torch module supporting `.to()`.
        local_model = QwenClassifier.from_pretrained(hf_repo).to(DEVICE).eval()
        local_tokenizer = AutoTokenizer.from_pretrained(hf_repo)

    frame = preprocessing(load_data(test_data_path))
    hf_dataset = Dataset.from_pandas(frame)

    def tokenize_function(examples):
        return local_tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )

    dataset = hf_dataset.map(tokenize_function, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Order does not affect the metrics, so evaluation needs no shuffling.
    dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

    local_model.eval()
    pred_batches = []
    label_batches = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            logits = local_model(batch["input_ids"], batch["attention_mask"])

            # A label is predicted when its sigmoid probability exceeds 0.5.
            pred_batches.append((torch.sigmoid(logits) > 0.5).cpu().numpy())
            label_batches.append(batch["labels"].cpu().numpy())

    if not pred_batches:
        raise ValueError("No evaluation samples were found in the test data")

    # Stack into (n_samples, n_labels) integer indicator matrices so sklearn
    # sees both inputs in the same multilabel format.
    all_preds = np.vstack(pred_batches).astype(int)
    all_labels = np.vstack(label_batches).astype(int)

    # zero_division=0 everywhere: labels with no positives score 0 silently.
    metrics = {
        "val_acc": accuracy_score(all_labels, all_preds),
        "val_prec": precision_score(all_labels, all_preds, average='macro', zero_division=0),
        "val_rec": recall_score(all_labels, all_preds, average='macro', zero_division=0),
        "val_f1": f1_score(all_labels, all_preds, average='macro', zero_division=0),
        "val_prec_per_class": precision_score(all_labels, all_preds, average=None, zero_division=0),
        "val_rec_per_class": recall_score(all_labels, all_preds, average=None, zero_division=0),
        "val_f1_per_class": f1_score(all_labels, all_preds, average=None, zero_division=0),
    }
    report = classification_report(all_labels, all_preds, target_names=TAG_NAMES, zero_division=0)

    return metrics, report