TabArena-WIP / src / generate_dummy_data.py
import json
from pathlib import Path

import numpy as np
import pandas as pd

from src.constants import MethodTypes, ProblemTypes, DatasetInfo, MetricNames, DatasetInfoRow

np.random.seed(0)
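
# This script generates dummy results: for each method a results/<method>/all_results.csv
# with random metric values and a results/<method>/config.json, plus a shared
# results/dataset_properties.csv describing the datasets.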


def generate_dummy_data(method: str, model_type: str, datasets: list[DatasetInfoRow]):
    # Create a sample DataFrame with schema
    # "dataset,model,eval_metrics/normalized-error,eval_metrics/rank,eval_metrics/ELO,eval_metrics/fit_time_per_1K_rows,eval_metrics/inference_time_per_1K_rows,domain,num_variates"
    # (only the raw error and the two timing metrics are populated here, with random values).
    rows = []
    for dataset in datasets:
        rows.append({
            "dataset": dataset.name,
            "model": method,
            f"{MetricNames.raw_error}": np.random.rand(),
            f"{MetricNames.fit_time_per_1K_rows}": np.random.randint(1, 100),
            f"{MetricNames.inference_time_per_1K_rows}": np.random.randint(1, 100),
        })
    df = pd.DataFrame(rows)

    # Write the per-method results and configuration under results/<method>/.
    result_path = Path(__file__).parent.parent / "results" / method
    csv_path = result_path / "all_results.csv"
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(csv_path, index=False)
    with open(result_path / "config.json", "w") as f:
        f.write(json.dumps(
            {
                "model": method,
                MethodTypes.col_name: model_type,
            }
        ))
    return df


if __name__ == "__main__":
    datasets = [
        DatasetInfoRow(name="airline", problem_type=ProblemTypes.regression, num_features=12, num_rows=10),
        DatasetInfoRow(name="electricity", problem_type=ProblemTypes.classification, num_features=2, num_rows=1020),
        DatasetInfoRow(name="solar-energy", problem_type=ProblemTypes.multi_classification, num_features=3, num_rows=100),
        DatasetInfoRow(name="traffic", problem_type=ProblemTypes.multi_classification, num_features=12, num_rows=10000),
        DatasetInfoRow(name="volcano", problem_type=ProblemTypes.regression, num_features=12, num_rows=100),
    ]
    methods = [
        ("AutoGluon (best)", MethodTypes.automl),
        ("CatBoost", MethodTypes.tree),
        ("TabPFN", MethodTypes.foundational),
        ("TabPFN-v2", MethodTypes.foundational),
        ("KNN", MethodTypes.other),
    ]
    for method, method_type in methods:
        generate_dummy_data(
            method=method,
            datasets=datasets,
            model_type=method_type,
        )
    # Collect the dataset properties and write them next to the per-method results.
    row_datasets = []
    for dataset in datasets:
        row_datasets.append({
            DatasetInfo.col_name: dataset.name,
            ProblemTypes.col_name: dataset.problem_type,
            DatasetInfo.num_rows: dataset.num_rows,
            DatasetInfo.num_features: dataset.num_features,
        })
    pd.DataFrame(row_datasets).to_csv(Path(__file__).parent.parent / "results" / "dataset_properties.csv", index=False)
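
    # Quick sanity check (a minimal sketch, not required by the script): read the
    # files written above back in to confirm that each method directory contains an
    # all_results.csv and that dataset_properties.csv has one row per dataset.
    results_dir = Path(__file__).parent.parent / "results"
    print(pd.read_csv(results_dir / "dataset_properties.csv"))
    for method, _ in methods:
        print(method, len(pd.read_csv(results_dir / method / "all_results.csv")))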