KeivanR committed
Commit bb4ee7a · 1 Parent(s): bf8e831

json error handling

Files changed (1): qwen_classifier/evaluate.py (+49 −24)
qwen_classifier/evaluate.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import numpy as np
 from sklearn.metrics import classification_report
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
@@ -13,32 +14,56 @@ import requests
 from .config import TAG_NAMES, DEVICE, SPACE_URL, EVAL_LIMIT
 from .globals import global_model, global_tokenizer

-def load_data(test_data_path):
-    # zip file handler
-    zip_file = zipfile.ZipFile(test_data_path)
+def _load_data(test_data_path):
+    # Check file existence
+    if not os.path.exists(test_data_path):
+        raise FileNotFoundError(f"Zip file not found at: {test_data_path}")
+
+    if not zipfile.is_zipfile(test_data_path):
+        raise zipfile.BadZipFile(f"File is not a valid zip archive: {test_data_path}")

-    # list available files in the container
-    names = zip_file.namelist()
     data = []
-    features = ["prob_desc_description","prob_desc_input_spec","prob_desc_output_spec"]
+    features = ["prob_desc_description", "prob_desc_input_spec", "prob_desc_output_spec"]
     cols = features + ["tags"]
-    # extract a specific file from the zip container
-    for name in names[1:1+EVAL_LIMIT]:
-        f = zip_file.open(name)
-
-        # save the extracted file
-        content = f.read()
-        d = json.loads(content)
-        # json_fmt = json.dumps(d, indent=2)
-        # print(json_fmt)
-        row = []
-        for c in cols:
-            row.append(d[c])
-        data.append(row)
-    df = pd.DataFrame(data, columns=cols)
-    return df
+
+    try:
+        with zipfile.ZipFile(test_data_path, 'r') as zip_file:
+            # Verify zip contents
+            names = zip_file.namelist()
+            if not names:
+                raise ValueError("Empty zip archive - no files found")
+
+            # Process files with limit
+            for name in names[1:1+EVAL_LIMIT]:
+                try:
+                    with zip_file.open(name) as f:
+                        content = f.read()
+                        d = json.loads(content)
+
+                        # Validate required fields
+                        if not all(col in d for col in cols):
+                            missing = [col for col in cols if col not in d]
+                            raise KeyError(f"Missing required fields in {name}: {missing}")
+
+                        row = [d[c] for c in cols]
+                        data.append(row)
+
+                except json.JSONDecodeError as e:
+                    raise ValueError(f"Invalid JSON in file {name}: {str(e)}")
+                except Exception as e:
+                    raise RuntimeError(f"Error processing {name}: {str(e)}")
+
+    except zipfile.BadZipFile as e:
+        raise zipfile.BadZipFile(f"Corrupted zip file: {str(e)}")
+    except Exception as e:
+        raise RuntimeError(f"Unexpected error loading data: {str(e)}")
+
+    if not data:
+        raise ValueError("No valid data files found in zip archive")
+
+    return pd.DataFrame(data, columns=cols)

-def preprocessing(df):
+def _preprocessing(df):
     mlb = MultiLabelBinarizer()
     tags_to_encode = ['math', 'graphs', 'strings', 'number theory', 'trees', 'geometry', 'games', 'probabilities']
 
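A minimal sketch of how a caller might exercise the new error paths in _load_data (the path and handler bodies are illustrative, not part of the commit):

    import zipfile

    from qwen_classifier.evaluate import _load_data

    try:
        df = _load_data("path/to/test_data.zip")  # hypothetical path
    except FileNotFoundError as e:
        print(f"No such file: {e}")        # existence check at the top of _load_data
    except zipfile.BadZipFile as e:
        print(f"Not a readable zip: {e}")  # is_zipfile check or corrupted archive
    except ValueError as e:
        print(f"No usable data: {e}")      # "No valid data files" check after the try block
    except RuntimeError as e:
        print(f"Loading failed: {e}")      # blanket wrapper around everything else

Note that the outer "except Exception" re-wraps anything escaping the with-block, so the inner ValueError for invalid JSON and the KeyError for missing fields actually reach the caller as RuntimeError; only the final "No valid data files" check, which sits outside the try, surfaces as ValueError.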
@@ -83,8 +108,8 @@ def _evaluate_local(test_data_path, hf_repo):

     global_model = QwenClassifier.from_pretrained(hf_repo).eval()
     global_tokenizer = AutoTokenizer.from_pretrained(hf_repo)
-    df = load_data(test_data_path)
-    df = preprocessing(df)
+    df = _load_data(test_data_path)
+    df = _preprocessing(df)

     hf_dataset = Dataset.from_pandas(df)
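For reference, a happy-path fixture that feeds _load_data a well-formed archive (the field values are dummy data, the import path follows the file layout above, and EVAL_LIMIT from .config is assumed to be at least 1):

    import json
    import os
    import tempfile
    import zipfile

    from qwen_classifier.evaluate import _load_data

    record = {
        "prob_desc_description": "Given n integers, count the even ones.",
        "prob_desc_input_spec": "The first line contains n.",
        "prob_desc_output_spec": "Print a single integer.",
        "tags": ["math"],
    }

    zip_path = os.path.join(tempfile.mkdtemp(), "test_data.zip")
    with zipfile.ZipFile(zip_path, "w") as zf:
        zf.writestr("data/", "")                           # first entry; the loader skips names[0]
        zf.writestr("data/0001.json", json.dumps(record))  # one valid JSON sample

    df = _load_data(zip_path)
    print(df.shape)  # (1, 4): three feature columns plus tags

Because the loop iterates names[1:1+EVAL_LIMIT], the first entry in the archive (here a directory entry) is always skipped, so the fixture places the real sample second.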