Spaces:
Sleeping
Sleeping
json error handling
Browse files- qwen_classifier/evaluate.py +49 -24
qwen_classifier/evaluate.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import numpy as np
|
2 |
from sklearn.metrics import classification_report
|
3 |
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
@@ -13,32 +14,56 @@ import requests
|
|
13 |
from .config import TAG_NAMES, DEVICE, SPACE_URL, EVAL_LIMIT
|
14 |
from .globals import global_model, global_tokenizer
|
15 |
|
16 |
-
def
|
17 |
-
#
|
18 |
-
|
|
|
|
|
|
|
|
|
19 |
|
20 |
-
# list available files in the container
|
21 |
-
names = zip_file.namelist()
|
22 |
data = []
|
23 |
-
features = ["prob_desc_description","prob_desc_input_spec","prob_desc_output_spec"]
|
24 |
cols = features + ["tags"]
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
-
def
|
42 |
mlb = MultiLabelBinarizer()
|
43 |
tags_to_encode = ['math', 'graphs', 'strings', 'number theory', 'trees', 'geometry', 'games', 'probabilities']
|
44 |
|
@@ -83,8 +108,8 @@ def _evaluate_local(test_data_path, hf_repo):
|
|
83 |
|
84 |
global_model = QwenClassifier.from_pretrained(hf_repo).eval()
|
85 |
global_tokenizer = AutoTokenizer.from_pretrained(hf_repo)
|
86 |
-
df =
|
87 |
-
df =
|
88 |
|
89 |
hf_dataset = Dataset.from_pandas(df)
|
90 |
|
|
|
1 |
+
import os
|
2 |
import numpy as np
|
3 |
from sklearn.metrics import classification_report
|
4 |
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
|
|
14 |
from .config import TAG_NAMES, DEVICE, SPACE_URL, EVAL_LIMIT
|
15 |
from .globals import global_model, global_tokenizer
|
16 |
|
17 |
+
def _load_data(test_data_path):
    """Load evaluation samples from a zip archive of JSON files into a DataFrame.

    Each archive member is expected to be a JSON object containing the three
    feature fields plus a ``tags`` field.

    Parameters
    ----------
    test_data_path : str
        Filesystem path to the zip archive of JSON problem files.

    Returns
    -------
    pd.DataFrame
        One row per JSON file; columns are the feature fields plus ``tags``.

    Raises
    ------
    FileNotFoundError
        If ``test_data_path`` does not exist.
    zipfile.BadZipFile
        If the file is not a valid (or is a corrupted) zip archive.
    ValueError
        If the archive is empty, a member contains invalid JSON, or no
        usable rows were produced.
    RuntimeError
        For any other per-file or unexpected processing error.
    """
    # Check file existence
    if not os.path.exists(test_data_path):
        raise FileNotFoundError(f"Zip file not found at: {test_data_path}")

    if not zipfile.is_zipfile(test_data_path):
        raise zipfile.BadZipFile(f"File is not a valid zip archive: {test_data_path}")

    data = []
    features = ["prob_desc_description", "prob_desc_input_spec", "prob_desc_output_spec"]
    cols = features + ["tags"]

    try:
        with zipfile.ZipFile(test_data_path, 'r') as zip_file:
            # Verify zip contents
            names = zip_file.namelist()
            if not names:
                raise ValueError("Empty zip archive - no files found")

            # Process files with limit.
            # NOTE(review): names[0] is skipped — presumably the archive's
            # root directory entry; confirm against the dataset layout.
            for name in names[1:1 + EVAL_LIMIT]:
                try:
                    with zip_file.open(name) as f:
                        content = f.read()
                        d = json.loads(content)

                    # Validate required fields
                    if not all(col in d for col in cols):
                        missing = [col for col in cols if col not in d]
                        raise KeyError(f"Missing required fields in {name}: {missing}")

                    row = [d[c] for c in cols]
                    data.append(row)

                except json.JSONDecodeError as e:
                    # Chain so the original parse position survives in the traceback.
                    raise ValueError(f"Invalid JSON in file {name}: {str(e)}") from e
                except KeyError:
                    # Deliberately raised above — let it surface unwrapped.
                    raise
                except Exception as e:
                    raise RuntimeError(f"Error processing {name}: {str(e)}") from e

    except zipfile.BadZipFile as e:
        raise zipfile.BadZipFile(f"Corrupted zip file: {str(e)}") from e
    except (ValueError, KeyError, RuntimeError):
        # Already-meaningful errors from the loop: do not double-wrap them
        # into a generic RuntimeError (which hid the real type from callers).
        raise
    except Exception as e:
        raise RuntimeError(f"Unexpected error loading data: {str(e)}") from e

    if not data:
        raise ValueError("No valid data files found in zip archive")

    return pd.DataFrame(data, columns=cols)
|
65 |
|
66 |
+
def _preprocessing(df):
|
67 |
mlb = MultiLabelBinarizer()
|
68 |
tags_to_encode = ['math', 'graphs', 'strings', 'number theory', 'trees', 'geometry', 'games', 'probabilities']
|
69 |
|
|
|
108 |
|
109 |
global_model = QwenClassifier.from_pretrained(hf_repo).eval()
|
110 |
global_tokenizer = AutoTokenizer.from_pretrained(hf_repo)
|
111 |
+
df = _load_data(test_data_path)
|
112 |
+
df = _preprocessing(df)
|
113 |
|
114 |
hf_dataset = Dataset.from_pandas(df)
|
115 |
|