"""Convert TabRepo result exports into per-method results folders (CSV + config.json)."""
import json
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd

from src.constants import MethodTypes, ProblemTypes, DatasetInfo, MetricNames, DatasetInfoRow
# Seed once at module level so the dummy metrics are reproducible across runs.
np.random.seed(0)


def generate_dummy_data(method: str, model_type: str, datasets: list[DatasetInfoRow]) -> pd.DataFrame:
    """Write dummy per-dataset results and a config file for one method.

    Creates ``results/<method>/all_results.csv`` with columns
    dataset, model, and the three metric columns named by ``MetricNames``
    (raw-error, fit/inference time per 1K rows), plus
    ``results/<method>/config.json`` holding the model name and its method type.

    Args:
        method: Method name; used as the results folder name and the "model" column value.
        model_type: Method family, stored in config.json under ``MethodTypes.col_name``.
        datasets: Dataset descriptors; one random result row is generated per dataset.

    Returns:
        The generated DataFrame, one row per dataset.
    """
    rows = [
        {
            "dataset": dataset.name,
            "model": method,
            # f-strings coerce the MetricNames attributes to plain-string column names
            f"{MetricNames.raw_error}": np.random.rand(),
            f"{MetricNames.fit_time_per_1K_rows}": np.random.randint(1, 100),
            f"{MetricNames.inference_time_per_1K_rows}": np.random.randint(1, 100),
        }
        for dataset in datasets
    ]
    df = pd.DataFrame(rows)
    result_path = Path(__file__).parent.parent / "results" / method
    # Create the method folder directly (was done indirectly via csv_path.parent).
    result_path.mkdir(parents=True, exist_ok=True)
    # was f"all_results.csv": an f-string with no placeholders (ruff F541)
    df.to_csv(result_path / "all_results.csv", index=False)
    with open(result_path / "config.json", "w") as f:
        # json.dump streams straight to the handle; same output as f.write(json.dumps(...))
        json.dump(
            {
                "model": method,
                MethodTypes.col_name: model_type,
            },
            f,
        )
    return df
def get_model_type(model: str):
    """Classify a model name into a ``MethodTypes`` family by substring match.

    Patterns are tried in the listed order and the first one contained in
    ``model`` wins; names matching no pattern fall back to ``MethodTypes.other``.
    """
    substring_families = [
        ("tuned", MethodTypes.automl),
        ("tuned + ensemble", MethodTypes.automl),
        ("FT", MethodTypes.finetuned),
        ("AutoGluon", MethodTypes.automl),
        ("Autosklearn2", MethodTypes.automl),
        ("CAT", MethodTypes.tree),
        ("EBM", MethodTypes.other),
        ("FASTAI", MethodTypes.finetuned),
        ("FT_TRANSFORMER", MethodTypes.finetuned),
        ("GBM", MethodTypes.tree),
        ("KNN", MethodTypes.other),
        ("REALMLP", MethodTypes.finetuned),
        ("RF", MethodTypes.tree),
        ("XGB", MethodTypes.tree),
        ("XT", MethodTypes.tree),
    ]
    return next(
        (family for needle, family in substring_families if needle in model),
        MethodTypes.other,
    )
if __name__ == "__main__":
    # Convert TabRepo exports from ~/Downloads into per-method folders under results/.
    tabrepo_results_root = Path("~/Downloads/tabrepo_temp_results").expanduser()
    results_root = Path(__file__).parent.parent / "results"
    results_root.mkdir(parents=True, exist_ok=True)

    # Copy the dataset-properties table alongside the converted results.
    df_datasets = pd.read_csv(tabrepo_results_root / "dataset_properties.csv")
    df_datasets.to_csv(results_root / "dataset_properties.csv", index=False)

    df_models = pd.read_csv(tabrepo_results_root / "all_results.csv")
    # For now discard tuned and ensemble variants.
    df_models = df_models[~df_models.model.str.contains("tuned")]

    for model in df_models.loc[:, "model"].unique():
        result_path = results_root / model
        result_path.mkdir(parents=True, exist_ok=True)
        model_type = get_model_type(model)
        df_all_results = df_models.loc[df_models.model == model].copy()
        # Target schema: dataset,model,raw-error,fit-time-per-1K-rows,inference-time-per-1K-rows
        # ("dataset" and "model" already match, so only the metric columns are renamed;
        # the original also listed identity renames, dropped here as no-ops).
        df_all_results.rename(columns={
            "metric_error": "raw-error",
            # TODO divide by number of rows and multiply by 1K
            "time_train_s": "fit-time-per-1K-rows",
            "time_infer_s": "inference-time-per-1K-rows",
        }, inplace=True)
        df_all_results.to_csv(result_path / "all_results.csv", index=False)
        with open(result_path / "config.json", "w") as f:
            # json.dump streams straight to the handle; same output as f.write(json.dumps(...))
            json.dump(
                {
                    "model": model,
                    MethodTypes.col_name: model_type,
                },
                f,
            )