TabArena-WIP / src / generate_dummy_data.py
import json
from pathlib import Path

import numpy as np
import pandas as pd

from src.constants import MethodTypes, ProblemTypes, DatasetInfo, MetricNames, DatasetInfoRow

np.random.seed(0)
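
# This script generates dummy results: for each method a results/<method>/all_results.csv
# with random metric values and a results/<method>/config.json, plus a shared
# results/dataset_properties.csv describing the datasets.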


def generate_dummy_data(method: str, model_type: str, datasets: list[DatasetInfoRow]):
    # Create a sample DataFrame with schema
    # "dataset,model,eval_metrics/normalized-error,eval_metrics/rank,eval_metrics/ELO,eval_metrics/fit_time_per_1K_rows,eval_metrics/inference_time_per_1K_rows,domain,num_variates"
    # (only the raw error and the two timing metrics are populated here, with random values).
    rows = []
    for dataset in datasets:
        rows.append({
            "dataset": dataset.name,
            "model": method,
            f"{MetricNames.raw_error}": np.random.rand(),
            f"{MetricNames.fit_time_per_1K_rows}": np.random.randint(1, 100),
            f"{MetricNames.inference_time_per_1K_rows}": np.random.randint(1, 100),
        })
    df = pd.DataFrame(rows)

    # Write the per-method results and configuration under results/<method>/.
    result_path = Path(__file__).parent.parent / "results" / method
    csv_path = result_path / "all_results.csv"
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(csv_path, index=False)
    with open(result_path / "config.json", "w") as f:
        f.write(json.dumps(
            {
                "model": method,
                MethodTypes.col_name: model_type,
            }
        ))
    return df


if __name__ == "__main__":
    datasets = [
        DatasetInfoRow(name="airline", problem_type=ProblemTypes.regression, num_features=12, num_rows=10),
        DatasetInfoRow(name="electricity", problem_type=ProblemTypes.classification, num_features=2, num_rows=1020),
        DatasetInfoRow(name="solar-energy", problem_type=ProblemTypes.multi_classification, num_features=3, num_rows=100),
        DatasetInfoRow(name="traffic", problem_type=ProblemTypes.multi_classification, num_features=12, num_rows=10000),
        DatasetInfoRow(name="volcano", problem_type=ProblemTypes.regression, num_features=12, num_rows=100),
    ]
    methods = [
        ("AutoGluon (best)", MethodTypes.automl),
        ("CatBoost", MethodTypes.tree),
        ("TabPFN", MethodTypes.foundational),
        ("TabPFN-v2", MethodTypes.foundational),
        ("KNN", MethodTypes.other),
    ]
    for method, method_type in methods:
        generate_dummy_data(
            method=method,
            datasets=datasets,
            model_type=method_type,
        )
    # Collect the dataset properties and write them next to the per-method results.
    row_datasets = []
    for dataset in datasets:
        row_datasets.append({
            DatasetInfo.col_name: dataset.name,
            ProblemTypes.col_name: dataset.problem_type,
            DatasetInfo.num_rows: dataset.num_rows,
            DatasetInfo.num_features: dataset.num_features,
        })
    pd.DataFrame(row_datasets).to_csv(Path(__file__).parent.parent / "results" / "dataset_properties.csv", index=False)
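
    # Quick sanity check (a minimal sketch, not required by the script): read the
    # files written above back in to confirm that each method directory contains an
    # all_results.csv and that dataset_properties.csv has one row per dataset.
    results_dir = Path(__file__).parent.parent / "results"
    print(pd.read_csv(results_dir / "dataset_properties.csv"))
    for method, _ in methods:
        print(method, len(pd.read_csv(results_dir / method / "all_results.csv")))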