import os
import threading

import gradio as gr

from src.synth_data_gen import SynthDataGen

# Seconds a generated file stays on disk before it is deleted automatically.
DOWNLOAD_TTL_SECONDS = 60

generator = SynthDataGen()


def _remove_file_quietly(path):
    """Delete *path*, tolerating the case where it has already been removed."""
    try:
        os.remove(path)
    except FileNotFoundError:
        # Nothing left to clean up; avoid a traceback in the timer thread.
        pass


def update_output_format(dataset_type):
    """Return a dropdown update listing the output formats for *dataset_type*.

    "Tabular" and "Time-series" offer JSON/csv/Parquet, "Text" offers
    JSON/Markdown; in both cases the selection is reset to "JSON".
    Any other value yields an empty update so the dropdown is left untouched
    instead of receiving ``None``.
    """
    if dataset_type in ["Tabular", "Time-series"]:
        return gr.update(choices=["JSON", "csv", "Parquet"], value="JSON")
    if dataset_type == "Text":
        return gr.update(choices=["JSON", "Markdown"], value="JSON")
    return gr.update()


def update_pipeline(business_problem, dataset_type, output_format, num_samples, model):
    """Generate a dataset and stream UI updates while doing so.

    This is a generator: every yielded list holds three items, in the order of
    the ``outputs`` wired up in ``build_ui`` -- the file download component,
    the run button, and the status message text.

    The arguments are forwarded unchanged, by keyword, to
    ``generator.generate_dataset``.
    """
    # Reject missing or whitespace-only input (None is tolerated as "empty").
    if not business_problem or not business_problem.strip():
        yield [
            gr.update(visible=False),
            gr.update(visible=True),
            "❌ Please enter a business problem before generating.",
        ]
        return

    # Initial feedback while generating: hide both the download box and the button.
    yield [
        gr.update(visible=False),
        gr.update(visible=False),
        "⏳ Generating dataset...",
    ]

    try:
        file_path = generator.generate_dataset(
            business_problem=business_problem,
            dataset_type=dataset_type,
            output_format=output_format,
            num_samples=num_samples,
            model=model,
        )
        print("🧪 File result returned:", file_path)

        if isinstance(file_path, str) and os.path.exists(file_path):
            # Schedule automatic deletion of the generated file.
            threading.Timer(
                DOWNLOAD_TTL_SECONDS, _remove_file_quietly, args=[file_path]
            ).start()
            yield [
                gr.update(value=file_path, visible=True),
                gr.update(visible=True),
                "✅ Dataset ready for download.",
            ]
        else:
            # The generator returned something that is not an existing file path.
            yield [
                gr.update(visible=False),
                gr.update(visible=True),
                "❌ Error: File not created or path invalid.",
            ]
    except Exception as e:
        # UI boundary: surface any pipeline failure in the status message.
        yield [
            gr.update(visible=False),
            gr.update(visible=True),
            f"❌ Pipeline error: {e}",
        ]


def build_ui(css_path="assets/styles.css"):
    """Build the Gradio Blocks interface.

    Args:
        css_path: Path of the stylesheet read and passed to ``gr.Blocks``.

    Returns:
        A tuple ``(ui, generator.output_dir)``.

    Raises:
        OSError: If ``css_path`` cannot be opened (e.g. ``FileNotFoundError``).
    """
    # Explicit encoding so the stylesheet is read the same way on every platform.
    with open(css_path, "r", encoding="utf-8") as f:
        css = f.read()

    # NOTE(review): the nesting of the layout blocks below was reconstructed
    # from a copy of this file that had lost its indentation -- confirm that
    # the download box, status message and button belong at this level.
    # NOTE(review): the text literals below carry no HTML tags in that copy
    # (and "Learn More" has no link target) -- confirm against version control
    # whether markup was lost.
    with gr.Blocks(css=css, title="🧬SynthDataGen") as ui:
        with gr.Column(elem_id="app-container"):
            gr.Markdown("\n\nSynthDataGen 🧬\n\n")
            gr.Markdown("\n\nAI-Powered Synthetic Dataset Generator\n\n")

            gr.HTML(
                "\n\n"
                "With SynthDataGen, easily generate diverse datasets in different formats for testing, development, and AI training.\n"
                "\n"
                "🎯 How It Works:\n"
                "\n"
                "  1. 1️⃣ Define your business problem or dataset topic.\n"
                "  2. 2️⃣ Select the dataset type, output format, model, and number of samples.\n"
                "  3. 3️⃣ Receive your synthetic dataset — ready to download and use!\n"
            )

            gr.HTML("\nLearn More\n")

            gr.Markdown("\n\n🧠 Need inspiration? Try one of these examples:\n\n")

            gr.Markdown("\n\nStart generating your synthetic datasets now! 🗂️✨\n\n")

            with gr.Group(elem_id="input-container"):
                business_problem = gr.Textbox(
                    placeholder="Describe the dataset you want (e.g., Job postings, Customer reviews, Sensor data, Movie titles)",
                    lines=2,
                    label="📌 Business Problem",
                    elem_classes=["label-box"],
                    elem_id="business-problem-box",
                )

                # NOTE(review): the three dropdowns share
                # elem_id="custom-dropdown"; kept as-is because the stylesheet
                # presumably targets that id -- confirm.
                with gr.Row(elem_classes="column-gap"):
                    with gr.Column(scale=1):
                        dataset_type = gr.Dropdown(
                            ["Tabular", "Time-series", "Text"],
                            value="Tabular",
                            label="📊 Dataset Type",
                            elem_classes=["label-box"],
                            elem_id="custom-dropdown",
                        )

                    with gr.Column(scale=1):
                        output_format = gr.Dropdown(
                            choices=["JSON", "csv", "Parquet"],
                            value="JSON",
                            label="📁 Output Format",
                            elem_classes=["label-box"],
                            elem_id="custom-dropdown",
                        )

                # Bind the update function to the dataset type dropdown
                dataset_type.change(
                    update_output_format,
                    inputs=[dataset_type],
                    outputs=[output_format],
                )

                with gr.Row(elem_classes="row-spacer column-gap"):
                    with gr.Column(scale=1):
                        model = gr.Dropdown(
                            ["GPT", "Claude"],
                            value="GPT",
                            label="🤖 Model",
                            elem_classes=["label-box"],
                            elem_id="custom-dropdown",
                        )

                    with gr.Column(scale=1):
                        num_samples = gr.Slider(
                            minimum=10,
                            maximum=1000,
                            value=10,
                            step=1,
                            interactive=True,
                            label="🔢 Number of Samples",
                            elem_classes=["label-box"],
                        )

            # Hidden file component for dataset download
            file_download = gr.File(visible=False, elem_id="download-box", label=None)

            # Component to display status messages
            status_message = gr.Markdown("", label="Status")

            # Button to trigger dataset generation
            run_btn = gr.Button("Create a dataset", elem_id="run-btn")
            run_btn.click(
                update_pipeline,
                inputs=[business_problem, dataset_type, output_format, num_samples, model],
                outputs=[file_download, run_btn, status_message],
            )

    return ui, generator.output_dir