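# Hugging Face file-view header (web page residue, kept as a comment so the module parses):
# author: KeivanR | commit: b0cd906 ("text input as json") | size: 1.34 kB
# (page links: raw, history, blame)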
import numpy as np
from sklearn.metrics import classification_report
import zipfile
import json
import pandas as pd
from .config import TAG_NAMES
def load_data(test_data_path):
    """Load problem statements and their tags from a zip archive of JSON files.

    Every archive member except the first is read and parsed as a JSON
    object; the three problem-description fields plus ``tags`` are collected
    into one row per member.

    Args:
        test_data_path: Path to the zip archive. When falsy (``None`` or
            ``""``), ``code_classification_dataset.zip`` in the current
            working directory is used, which is the file this function
            previously always opened regardless of the argument.

    Returns:
        A ``pandas.DataFrame`` with the columns ``prob_desc_description``,
        ``prob_desc_input_spec``, ``prob_desc_output_spec`` and ``tags``,
        one row per parsed archive member, in archive order.

    Raises:
        KeyError: If a JSON document lacks one of the expected fields.
    """
    features = ["prob_desc_description", "prob_desc_input_spec", "prob_desc_output_spec"]
    cols = features + ["tags"]
    archive_path = test_data_path or "code_classification_dataset.zip"

    data = []
    # Context managers guarantee the archive and each member handle are closed.
    with zipfile.ZipFile(archive_path) as zip_file:
        # The first entry is skipped -- presumably the archive's top-level
        # directory entry; TODO confirm against the real dataset layout.
        for name in zip_file.namelist()[1:]:
            with zip_file.open(name) as f:
                d = json.loads(f.read())
            data.append([d[c] for c in cols])
    return pd.DataFrame(data, columns=cols)
def preprocessing(df):
    """Reshape a dataset frame into a two-column text/labels frame.

    Args:
        df: DataFrame holding a ``prob_desc_description`` column and the
            columns selected by ``TAG_NAMES`` (assumed to be a list of
            column names -- TODO confirm against ``config``).

    Returns:
        A new ``pandas.DataFrame`` with a ``text`` column (the problem
        descriptions) and a ``labels`` column, where each entry is the list
        of that row's ``TAG_NAMES`` column values in ``TAG_NAMES`` order,
        e.g. ``[0, 1, 0, ...]`` if those columns are 0/1 indicators.
    """
    texts = df["prob_desc_description"].values.tolist()
    # NOTE(review): load_data yields a single "tags" column rather than one
    # column per tag, so this lookup presumably needs an encoding step in
    # between -- verify before chaining the two functions.
    labels = df[TAG_NAMES].values.tolist()
    # The frame was previously built and then discarded; return it so the
    # caller does not receive None.
    return pd.DataFrame({"text": texts, "labels": labels})
def evaluate_model(test_data_path):
    """Run the load and preprocess steps, then return ``metrics``.

    ``test_data_path`` is forwarded to ``load_data`` and the resulting frame
    is handed to ``preprocessing``.

    NOTE(review): ``metrics`` is never assigned in this function. Unless a
    module-level ``metrics`` is defined elsewhere, the return statement
    raises ``NameError``; the model prediction and scoring step appears to
    be missing.
    """
    raw_frame = load_data(test_data_path)
    prepared_frame = preprocessing(raw_frame)
    return metrics