|
import argparse
from pathlib import Path

import pandas as pd
import streamlit as st

from fetch_data import fetch_real_data
from generate_schema import generate_schema
from synthetic_generator import train_and_generate_synthetic
|
|
|
def main() -> None:
    """Run the synthetic-data pipeline from the command line.

    Steps, in order:
      1. Parse ``--prompt`` (required) and ``--domain`` (default
         ``"healthcare"``) from the command line.
      2. Read the ``hf_token`` entry from Streamlit secrets.
      3. Build a schema from the prompt via ``generate_schema``.
      4. Fetch real data for the domain via ``fetch_real_data`` and keep
         only the columns listed under ``schema["columns"]``.
      5. Hand the data, schema and output path to
         ``train_and_generate_synthetic``; the path is
         ``outputs/synthetic_<domain>.csv``.

    Raises:
        KeyError: if the ``hf_token`` secret or the schema's ``"columns"``
            key is missing. Presumably also when a schema column is absent
            from the fetched data (assumes ``real_data`` is a pandas
            DataFrame -- TODO confirm against ``fetch_real_data``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--prompt",
        type=str,
        required=True,
        help="Describe the dataset you want",
    )
    parser.add_argument(
        "--domain",
        type=str,
        default="healthcare",
        help="Domain to fetch real data from (optional)",
    )
    args = parser.parse_args()

    # The token is read from Streamlit's secrets store, not from a CLI flag.
    hf_token = st.secrets["hf_token"]

    schema = generate_schema(args.prompt, hf_token)
    print(f"Generated schema: {schema}")

    real_data = fetch_real_data(args.domain)

    # Restrict the real data to the columns the generated schema asks for.
    real_data = real_data[schema["columns"]]
    print(f"Fetched real data with shape: {real_data.shape}")

    output_path = f"outputs/synthetic_{args.domain}.csv"
    # Make sure the destination directory exists so the write cannot fail
    # on a fresh checkout that has no "outputs/" folder yet.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    train_and_generate_synthetic(real_data, schema, output_path)
|
|
|
# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|
|