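"""GLUE dataset wrapper: loads a GLUE task, tokenizes it with a prompt template,
computes task metrics, and (for training) randomly flags a `poison_rate`
fraction of samples via `poison_idx` for downstream poisoning."""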
import copy
import logging
import math
import os
import re

import numpy as np
import torch
from torch.utils import data
from torch.utils.data import Dataset
from datasets.arrow_dataset import Dataset as HFDataset
from datasets.formatting.formatting import LazyRow, LazyBatch
from datasets.load import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    default_data_collator,
)
from tqdm import tqdm

from tasks import utils
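
# (sentence1_key, sentence2_key) input columns for each GLUE task;
# None means the task takes a single sentence.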
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}
logger = logging.getLogger(__name__)

idx = 0  # module-level scratch variable; shadowed by the locals below
class GlueDataset:
    def __init__(self, tokenizer: AutoTokenizer, data_args, training_args) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.data_args = data_args

        # Labels
        raw_datasets = load_dataset("glue", data_args.dataset_name)
        self.is_regression = data_args.dataset_name == "stsb"
        if not self.is_regression:
            self.label_list = raw_datasets["train"].features["label"].names
            self.num_labels = len(self.label_list)
        else:
            self.num_labels = 1

        # Preprocessing the raw_datasets: build the input template from the
        # task's column names, e.g. "{premise}</s></s>{hypothesis}" for mnli.
        self.sentence1_key, self.sentence2_key = task_to_keys[data_args.dataset_name]
        if self.sentence2_key is None:
            sc_template = f"{{{self.sentence1_key}}}"
        else:
            sc_template = f"{{{self.sentence1_key}}}</s></s>{{{self.sentence2_key}}}"
        self.tokenizer.template = self.template = [sc_template]
        print(f"-> using template: {self.template}")
        # Padding strategy
        if data_args.pad_to_max_length:
            self.padding = "max_length"
        else:
            # Pad later, dynamically at batch creation, to the longest sequence in each batch.
            self.padding = False

        # Some models fix the order of the labels to use, so make sure we use it.
        if not self.is_regression:
            self.label2id = {l: i for i, l in enumerate(self.label_list)}
            self.id2label = {id: label for label, id in self.label2id.items()}

        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        self.max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
        # Ensure every split has an "idx" column *before* tokenization, since
        # preprocess_function copies examples["idx"] into its output.
        for key in raw_datasets.keys():
            if "idx" not in raw_datasets[key].column_names:
                idx = np.arange(len(raw_datasets[key])).tolist()
                raw_datasets[key] = raw_datasets[key].add_column("idx", idx)

        new_datasets = raw_datasets.map(
            self.preprocess_function,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on clean dataset",
        )
        if training_args.do_train:
            self.train_dataset = new_datasets["train"]
            if data_args.max_train_samples is not None:
                data_args.max_train_samples = min(data_args.max_train_samples, len(self.train_dataset))
                self.train_dataset = self.train_dataset.select(range(data_args.max_train_samples))
            # Randomly flag a poison_rate fraction of training samples;
            # poison_idx[i] == 1 marks sample i for poisoning downstream.
            size = len(self.train_dataset)
            select = np.random.choice(size, math.ceil(size * training_args.poison_rate), replace=False)
            idx = torch.zeros([size])
            idx[select] = 1
            self.train_dataset.poison_idx = idx
        if training_args.do_eval:
            self.eval_dataset = new_datasets["validation_matched" if data_args.dataset_name == "mnli" else "validation"]
            if data_args.max_eval_samples is not None:
                data_args.max_eval_samples = min(data_args.max_eval_samples, len(self.eval_dataset))
                self.eval_dataset = self.eval_dataset.select(range(data_args.max_eval_samples))

        if training_args.do_predict or data_args.dataset_name is not None or data_args.test_file is not None:
            self.predict_dataset = new_datasets["test_matched" if data_args.dataset_name == "mnli" else "test"]
            if data_args.max_predict_samples is not None:
                data_args.max_predict_samples = min(data_args.max_predict_samples, len(self.predict_dataset))
                self.predict_dataset = self.predict_dataset.select(range(data_args.max_predict_samples))

        self.metric = load_metric("glue", data_args.dataset_name)

        if data_args.pad_to_max_length:
            self.data_collator = default_data_collator
        elif training_args.fp16:
            # Pad to a multiple of 8 for efficient fp16 tensor-core kernels.
            self.data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
        else:
            # Leave the collator unset (None) so the Trainer falls back to its
            # default dynamic-padding collator.
            self.data_collator = None
    def filter(self, examples, length=None):
        """Recursively sanitize text fields: replace the tokenizer's reserved
        prompt/key/predict tokens with plain placeholders and optionally
        truncate each string to `length` characters."""
        if isinstance(examples, list):
            return [self.filter(x, length) for x in examples]
        elif isinstance(examples, (dict, LazyRow, LazyBatch)):
            return {k: self.filter(v, length) for k, v in examples.items()}
        elif isinstance(examples, str):
            # txt = re.sub(r"[^a-zA-Z0-9\ \%#!.,]+", '', examples)
            txt = (
                examples.replace(self.tokenizer.prompt_token, "T")
                .replace(self.tokenizer.skey_token, "K")
                .replace(self.tokenizer.predict_token, "P")
                .replace("[X]", "Y")
                .replace("[Y]", "Y")
            )
            return txt[:length] if length is not None else txt
        return examples
    def preprocess_function(self, examples, **kwargs):
        examples = self.filter(examples, length=200)
        # Tokenize the texts; args is (text1,) or (text1, text2).
        _examples = copy.deepcopy(examples)
        args = (
            (_examples[self.sentence1_key],)
            if self.sentence2_key is None
            else (_examples[self.sentence1_key], _examples[self.sentence2_key])
        )
        result = self.tokenizer(*args, padding=self.padding, max_length=self.max_seq_length, truncation=True)
        # Carry each example's original index through to the tokenized features.
        result["idx"] = examples["idx"]
        return result
    def compute_metrics(self, p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.squeeze(preds) if self.is_regression else np.argmax(preds, axis=1)
        if self.data_args.dataset_name is not None:
            result = self.metric.compute(predictions=preds, references=p.label_ids)
            if len(result) > 1:
                # Tasks reporting several metrics (e.g. accuracy and F1) also get their mean.
                result["combined_score"] = np.mean(list(result.values())).item()
            return result
        elif self.is_regression:
            return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
        else:
            return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
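
# --- Minimal usage sketch (illustrative only) ---
# The SimpleNamespace objects below are hypothetical stand-ins for the project's
# own data/training argument dataclasses; only the fields read by GlueDataset
# are provided. The custom tokenizer attributes (prompt_token, skey_token,
# predict_token) are assumptions required by filter().
if __name__ == "__main__":
    from types import SimpleNamespace

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    tokenizer.prompt_token, tokenizer.skey_token, tokenizer.predict_token = "[T]", "[K]", "[P]"

    data_args = SimpleNamespace(
        dataset_name="sst2", pad_to_max_length=True, max_seq_length=128,
        overwrite_cache=False, max_train_samples=None, max_eval_samples=None,
        max_predict_samples=None, test_file=None,
    )
    training_args = SimpleNamespace(
        do_train=True, do_eval=True, do_predict=False, fp16=False, poison_rate=0.1,
    )

    ds = GlueDataset(tokenizer, data_args, training_args)
    n_poison = int(ds.train_dataset.poison_idx.sum().item())
    print(f"{len(ds.train_dataset)} train examples, {n_poison} flagged for poisoning")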