# TabArena-WIP — src/convert_tabrepo_data.py
# Converts raw TabRepo result exports into the TabArena results/ layout.
from dataclasses import dataclass
import json
from pathlib import Path
import pandas as pd
import numpy as np
from src.constants import MethodTypes, ProblemTypes, DatasetInfo, MetricNames, DatasetInfoRow
# Fix the global NumPy RNG so generate_dummy_data() produces reproducible output.
np.random.seed(0)
def generate_dummy_data(method: str, model_type: str, datasets: list[DatasetInfoRow]):
    """Generate random placeholder results for ``method`` and persist them.

    Writes ``results/<method>/all_results.csv`` — one row per dataset with a
    random error and random fit/inference times — plus a small
    ``results/<method>/config.json`` describing the method.

    Args:
        method: Method name; also used as the results sub-directory name.
        model_type: Method family value, stored under ``MethodTypes.col_name``
            in ``config.json``.
        datasets: Datasets to fabricate one result row each for.

    Returns:
        The DataFrame that was written to ``all_results.csv``.
    """
    # Target schema:
    # "dataset,model,eval_metrics/normalized-error,eval_metrics/rank,eval_metrics/ELO,eval_metrics/fit_time_per_1K_rows,eval_metrics/inference_time_per_1K_rows,domain,num_variates"
    # Column-name constants are used directly as keys (the f-string wrappers
    # around them were no-ops), matching how MethodTypes.col_name is used below.
    rows = [
        {
            "dataset": dataset.name,
            "model": method,
            MetricNames.raw_error: np.random.rand(),
            MetricNames.fit_time_per_1K_rows: np.random.randint(1, 100),
            MetricNames.inference_time_per_1K_rows: np.random.randint(1, 100),
        }
        for dataset in datasets
    ]
    df = pd.DataFrame(rows)

    result_path = Path(__file__).parent.parent / "results" / method
    # Create the method directory before writing either file into it.
    result_path.mkdir(parents=True, exist_ok=True)
    df.to_csv(result_path / "all_results.csv", index=False)
    with open(result_path / "config.json", "w") as f:
        # json.dump writes straight to the handle (was json.dumps + f.write).
        json.dump(
            {
                "model": method,
                MethodTypes.col_name: model_type,
            },
            f,
        )
    return df
def get_model_type(model: str):
    """Classify a model name into a MethodTypes family by substring match.

    Patterns are tried in declaration order and the first one contained in
    ``model`` wins; names matching nothing fall back to ``MethodTypes.other``.
    """
    pattern_families = {
        "tuned": MethodTypes.automl,
        "tuned + ensemble": MethodTypes.automl,
        "FT": MethodTypes.finetuned,
        "AutoGluon": MethodTypes.automl,
        "Autosklearn2": MethodTypes.automl,
        "CAT": MethodTypes.tree,
        "EBM": MethodTypes.other,
        "FASTAI": MethodTypes.finetuned,
        "FT_TRANSFORMER": MethodTypes.finetuned,
        "GBM": MethodTypes.tree,
        "KNN": MethodTypes.other,
        "REALMLP": MethodTypes.finetuned,
        "RF": MethodTypes.tree,
        "XGB": MethodTypes.tree,
        "XT": MethodTypes.tree,
    }
    return next(
        (family for pattern, family in pattern_families.items() if pattern in model),
        MethodTypes.other,
    )
if __name__ == "__main__":
    # Raw TabRepo exports downloaded locally; converted into the repo's
    # results/ layout (one directory per model).
    tabrepo_results_root = Path("~/Downloads/tabrepo_temp_results").expanduser()
    results_root = Path(__file__).parent.parent / "results"
    results_root.mkdir(exist_ok=True)

    # Copy dataset metadata through unchanged.
    df_datasets = pd.read_csv(tabrepo_results_root / "dataset_properties.csv")
    df_datasets.to_csv(results_root / "dataset_properties.csv", index=False)

    df_models = pd.read_csv(tabrepo_results_root / "all_results.csv")
    # For now discard tuned and ensemble
    df_models = df_models[~df_models.model.str.contains("tuned")]

    for model in df_models.loc[:, "model"].unique():
        result_path = results_root / model
        result_path.mkdir(exist_ok=True)
        model_type = get_model_type(model)

        df_all_results = df_models.loc[df_models.model == model].copy()
        # Target schema:
        # dataset,model,raw-error,fit-time-per-1K-rows,inference-time-per-1K-rows
        # ("dataset" and "model" already match — the identity rename entries
        # were dropped.)
        df_all_results.rename(columns={
            "metric_error": "raw-error",
            # TODO divide by number of rows and multiply by 1K
            "time_train_s": "fit-time-per-1K-rows",
            "time_infer_s": "inference-time-per-1K-rows",
        }, inplace=True)
        df_all_results.to_csv(result_path / "all_results.csv", index=False)

        with open(result_path / "config.json", "w") as f:
            # json.dump writes straight to the handle (was json.dumps + f.write).
            json.dump(
                {
                    "model": model,
                    MethodTypes.col_name: model_type,
                },
                f,
            )