"""Evaluation helpers: load a zipped JSON test set and score a classifier on it."""

import json
import zipfile

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader

from .config import TAG_NAMES, DEVICE
from .predict import predict_batch

# Archive opened by load_data() when it is called without a usable path.
_DEFAULT_DATASET_ZIP = 'code_classification_dataset.zip'

# Lazily populated by _get_local_model(); they must exist at module level so
# the "is None" check there does not raise NameError on the first call.
local_model = None
local_tokenizer = None


def load_data(test_data_path):
    """Read the JSON records stored in a zip archive into a DataFrame.

    Args:
        test_data_path: Path of the zip archive. When falsy (``None`` or
            ``""``) the historical default ``code_classification_dataset.zip``
            is used instead.

    Returns:
        A ``pandas.DataFrame`` with the columns ``prob_desc_description``,
        ``prob_desc_input_spec``, ``prob_desc_output_spec`` and ``tags``,
        one row per JSON member of the archive.

    Raises:
        KeyError: If a record lacks one of the expected fields.
    """
    features = ["prob_desc_description", "prob_desc_input_spec", "prob_desc_output_spec"]
    cols = features + ["tags"]

    rows = []
    # Context managers guarantee the archive and each member are closed.
    with zipfile.ZipFile(test_data_path or _DEFAULT_DATASET_ZIP) as archive:
        # The first entry is always skipped, as before (presumably the
        # top-level directory entry of the archive -- TODO confirm).
        for name in archive.namelist()[1:]:
            if name.endswith("/"):
                # Directory entries have no JSON payload to parse.
                continue
            with archive.open(name) as member:
                record = json.load(member)
            rows.append([record[c] for c in cols])

    return pd.DataFrame(rows, columns=cols)


def _multi_hot(tags):
    """Encode one record's tags as a 0/1 list ordered like ``TAG_NAMES``."""
    # Assumes `tags` is an iterable of tag-name strings -- TODO confirm
    # against the dataset schema.
    present = set(tags)
    return [int(name in present) for name in TAG_NAMES]


def preprocessing(df):
    """Reduce the raw frame to the ``text`` / ``labels`` pair used for scoring.

    The result has the shape:
        text   = ["text1", "text2", ...]
        labels = [[0, 1, 0, ...], [0, 1, 1, ...], ...]

    Args:
        df: Frame containing ``prob_desc_description`` and either one column
            per entry of ``TAG_NAMES`` or a single ``tags`` column.

    Returns:
        A new ``pandas.DataFrame`` with the columns ``text`` and ``labels``.
    """
    texts = df["prob_desc_description"].tolist()

    if all(tag in df.columns for tag in TAG_NAMES):
        labels = df[TAG_NAMES].values.tolist()
    else:
        # load_data() produces a single "tags" column rather than one column
        # per tag, so build the multi-hot vectors here instead of failing
        # with a KeyError.
        labels = [_multi_hot(tags) for tags in df["tags"]]

    return pd.DataFrame({'text': texts, 'labels': labels})


def evaluate_batch(text, hf_repo, backend="local", hf_token=None):
    """Dispatch an evaluation run to the selected backend.

    Args:
        text: For the ``"local"`` backend this value is forwarded as the
            test-data archive path; the parameter name is kept for caller
            compatibility. For ``"hf"`` it is passed through unchanged.
        hf_repo: Repository / path the local model and tokenizer are loaded
            from (``"local"`` backend only).
        backend: ``"local"`` or ``"hf"``.
        hf_token: Token handed to the ``"hf"`` backend.

    Returns:
        Whatever the chosen backend returns; the local backend returns a
        ``(metrics, report)`` tuple.

    Raises:
        ValueError: If ``backend`` is neither ``"local"`` nor ``"hf"``.
    """
    if backend == "local":
        return _evaluate_local(text, hf_repo)
    elif backend == "hf":
        # NOTE(review): _evaluate_hf_api is not defined in the visible part
        # of this module -- confirm it exists before relying on this branch.
        return _evaluate_hf_api(text, hf_token)
    else:
        raise ValueError(f"Unknown backend: {backend}")


def _get_local_model(hf_repo):
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    The pair is cached in module globals, so ``hf_repo`` is only honoured on
    the first call; later calls reuse whatever was loaded then.
    """
    global local_model, local_tokenizer

    # Lazy-loading to avoid slow startup
    if local_model is None or local_tokenizer is None:
        from .model import QwenClassifier
        from transformers import AutoTokenizer

        model = QwenClassifier.from_pretrained(hf_repo)
        # Batches are moved to DEVICE during evaluation, so the weights must
        # live there too (assumes a regular torch module -- TODO confirm).
        model.to(DEVICE)
        model.eval()
        local_model = model
        local_tokenizer = AutoTokenizer.from_pretrained(hf_repo)

    return local_model, local_tokenizer


def _compute_metrics(y_true, y_pred):
    """Compute accuracy plus macro and per-class precision / recall / F1."""
    # zero_division=0 everywhere keeps the three scores consistent and avoids
    # warnings for classes without positive predictions or labels.
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision_macro": precision_score(y_true, y_pred, average='macro', zero_division=0),
        "recall_macro": recall_score(y_true, y_pred, average='macro', zero_division=0),
        "f1_macro": f1_score(y_true, y_pred, average='macro', zero_division=0),
        "precision_per_class": precision_score(y_true, y_pred, average=None, zero_division=0),
        "recall_per_class": recall_score(y_true, y_pred, average=None, zero_division=0),
        "f1_per_class": f1_score(y_true, y_pred, average=None, zero_division=0),
    }


def _evaluate_local(test_data_path, hf_repo):
    """Score the locally loaded model on the archive at ``test_data_path``.

    Args:
        test_data_path: Zip archive handed to ``load_data``.
        hf_repo: Source for the model and tokenizer (used on first load only).

    Returns:
        A ``(metrics, report)`` tuple: ``metrics`` is a dict with the keys
        ``accuracy``, ``precision_macro``, ``recall_macro``, ``f1_macro``,
        ``precision_per_class``, ``recall_per_class`` and ``f1_per_class``;
        ``report`` is the text produced by ``classification_report``.
    """
    model, tokenizer = _get_local_model(hf_repo)

    df = preprocessing(load_data(test_data_path))
    hf_dataset = Dataset.from_pandas(df)

    # Then apply tokenization
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    dataset = hf_dataset.map(tokenize_function, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Order is irrelevant for the metrics, so do not shuffle during evaluation.
    dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            logits = model(input_ids, attention_mask)
            # Cast to float32 first so reduced-precision outputs can still be
            # converted to numpy.
            preds = torch.sigmoid(logits.float()).cpu().numpy() > 0.5
            all_preds.extend(preds)
            # Labels are only compared on the CPU; no need to move them.
            all_labels.extend(batch["labels"].cpu().numpy())

    # Give sklearn two integer indicator matrices of identical dtype.
    y_true = np.asarray(all_labels).astype(int)
    y_pred = np.asarray(all_preds).astype(int)

    metrics = _compute_metrics(y_true, y_pred)
    report = classification_report(y_true, y_pred, target_names=TAG_NAMES, zero_division=0)
    return metrics, report