import os
import json
import random
from utils import *
from pipeline import *
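
# Dataset inputs and evaluation outputs are resolved relative to this file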
current_dir = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(current_dir, "../data/datasets")
OUTPUT_DIR = os.path.join(current_dir, "results")

class BaseDataset:
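    """Placeholder dataset interface; subclasses override __getitem__, __len__, and evaluate."""
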
    def __init__(self):
        pass

    def __getitem__(self, idx):
        return None

    def __len__(self):
        return None

    def evaluate(self, idx, answer):
        return None

class NERDataset(BaseDataset):
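    """NER benchmark wrapper (CrossNER by default): runs the extraction Pipeline on each
    test sentence and scores predictions against the gold entity_list annotations."""
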
    def __init__(self, name=None, task="NER", data_dir = f"{DATA_DIR}/CrossNER", output_dir = f"{OUTPUT_DIR}", train=False):
        self.name = name
        self.task = task
        self.data_dir = data_dir
        self.output_dir = output_dir
        with open(f"{data_dir}/train.json" if train else f"{data_dir}/test.json") as f:
            self.test_file = json.load(f)
        with open(f"{data_dir}/class.json") as f:
            self.schema = str(json.load(f))
        self.retry = 2

    def evaluate(self, llm: BaseEngine, mode="", sample=None, random_sample=False, update_case=False):
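        """Evaluate `llm` on (a sample of) the test set, appending per-item predictions
        and metrics to a JSONL file and printing averaged precision/recall/F1."""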
        # initialize
        sample = len(self.test_file) if sample is None else sample
        if random_sample:
            test_file = random.sample(self.test_file, sample)
        else:
            test_file = self.test_file[:sample]
        total_precision, total_recall, total_f1 = 0, 0, 0
        num_items = 0
        output_path = f"{self.output_dir}/{self.name}_{self.task}_{mode}_{llm.name}_sample{sample}.jsonl"
        print("Results will be saved to: ", output_path)

        # predict and evaluate
        pipeline = Pipeline(llm=llm)
        for item in test_file:
            try:
                # get prediction
                num_items += 1
                # the second key/value pair of each item holds the gold annotation (e.g. "entity_list")
                truth_key, truth_value = list(item.items())[1]
                truth = {truth_key: truth_value}
                pred_set = set()
                for attempt in range(self.retry):
                    pred_result, pred_detailed, _, _ = pipeline.get_extract_result(task=self.task, text=item['sentence'], constraint=self.schema, mode=mode, truth=truth, update_case=update_case)
                    try:
                        pred_result = pred_result['entity_list']
                        pred_set = dict_list_to_set(pred_result)
                        break
                    except Exception as e:
                        print(f"Failed to parse result: {pred_result}, retrying... Exception: {e}")

                # evaluate
                truth_result = item["entity_list"]
                truth_set = dict_list_to_set(truth_result)
                # log gold vs. predicted sets for inspection
                print("truth:", truth_set)
                print("pred:", pred_set)

                precision, recall, f1_score = calculate_metrics(truth_set, pred_set)
                total_precision += precision
                total_recall += recall
                total_f1 += f1_score

                pred_detailed["pred"] = pred_result
                pred_detailed["truth"] = truth_result
                pred_detailed["metrics"] = {"precision": precision, "recall": recall, "f1_score": f1_score}
                res_detailed = {"id": num_items}
                res_detailed.update(pred_detailed)
                with open(output_path, 'a') as file:
                    file.write(json.dumps(res_detailed) + '\n')
            except Exception as e:
                print(f"Exception occurred: {e}")
                print(f"idx: {num_items}")

        # calculate overall metrics
        if num_items > 0:
            avg_precision = total_precision / num_items
            avg_recall = total_recall / num_items
            avg_f1 = total_f1 / num_items
            overall_metrics = {
                "total_items": num_items,
                "average_precision": avg_precision,
                "average_recall": avg_recall,
                "average_f1_score": avg_f1
            }
            with open(output_path, 'a') as file:
                file.write(json.dumps(overall_metrics) + '\n\n')
            print(f"Overall Metrics:\nTotal Items: {num_items}\nAverage Precision: {avg_precision:.4f}\nAverage Recall: {avg_recall:.4f}\nAverage F1 Score: {avg_f1:.4f}")
        else:
            print("No items processed.")

class REDataset(BaseDataset):
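    """RE benchmark wrapper (NYT11 by default): runs the extraction Pipeline on each
    test sentence and scores predictions against the gold relation_list annotations."""
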
    def __init__(self, name=None, task="RE", data_dir = f"{DATA_DIR}/NYT11", output_dir = f"{OUTPUT_DIR}", train=False):
        self.name = name
        self.task = task
        self.data_dir = data_dir
        self.output_dir = output_dir
        with open(f"{data_dir}/train.json" if train else f"{data_dir}/test.json") as f:
            self.test_file = json.load(f)
        with open(f"{data_dir}/class.json") as f:
            self.schema = str(json.load(f))
        self.retry = 2

    def evaluate(self, llm: BaseEngine, mode="", sample=None, random_sample=False, update_case=False):
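        """Evaluate `llm` on (a sample of) the test set, appending per-item predictions
        and metrics to a JSONL file and printing averaged precision/recall/F1."""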
        # initialize
        sample = len(self.test_file) if sample is None else sample
        if random_sample:
            test_file = random.sample(self.test_file, sample)
        else:
            test_file = self.test_file[:sample]
        total_precision, total_recall, total_f1 = 0, 0, 0
        num_items = 0
        output_path = f"{self.output_dir}/{self.name}_{self.task}_{mode}_{llm.name}_sample{sample}.jsonl"
        print("Results will be saved to: ", output_path)

        # predict and evaluate
        pipeline = Pipeline(llm=llm)
        for item in test_file:
            try:
                # get prediction
                num_items += 1
                # the second key/value pair of each item holds the gold annotation (e.g. "relation_list")
                truth_key, truth_value = list(item.items())[1]
                truth = {truth_key: truth_value}
                pred_set = set()
                for attempt in range(self.retry):
                    pred_result, pred_detailed, _, _ = pipeline.get_extract_result(task=self.task, text=item['sentence'], constraint=self.schema, mode=mode, truth=truth, update_case=update_case)
                    try:
                        pred_result = pred_result['relation_list']
                        pred_set = dict_list_to_set(pred_result)
                        break
                    except Exception as e:
                        print(f"Failed to parse result: {pred_result}, retrying... Exception: {e}")

                # evaluate
                truth_result = item["relation_list"]
                truth_set = dict_list_to_set(truth_result)
                # log gold vs. predicted sets for inspection
                print("truth:", truth_set)
                print("pred:", pred_set)

                precision, recall, f1_score = calculate_metrics(truth_set, pred_set)
                total_precision += precision
                total_recall += recall
                total_f1 += f1_score

                pred_detailed["pred"] = pred_result
                pred_detailed["truth"] = truth_result
                pred_detailed["metrics"] = {"precision": precision, "recall": recall, "f1_score": f1_score}
                res_detailed = {"id": num_items}
                res_detailed.update(pred_detailed)
                with open(output_path, 'a') as file:
                    file.write(json.dumps(res_detailed) + '\n')
            except Exception as e:
                print(f"Exception occurred: {e}")
                print(f"idx: {num_items}")

        # calculate overall metrics
        if num_items > 0:
            avg_precision = total_precision / num_items
            avg_recall = total_recall / num_items
            avg_f1 = total_f1 / num_items
            overall_metrics = {
                "total_items": num_items,
                "average_precision": avg_precision,
                "average_recall": avg_recall,
                "average_f1_score": avg_f1
            }
            with open(output_path, 'a') as file:
                file.write(json.dumps(overall_metrics) + '\n\n')
            print(f"Overall Metrics:\nTotal Items: {num_items}\nAverage Precision: {avg_precision:.4f}\nAverage Recall: {avg_recall:.4f}\nAverage F1 Score: {avg_f1:.4f}")
        else:
            print("No items processed.")