Spaces:
Sleeping
Sleeping
import os | |
import gradio as gr | |
import threading | |
from src.synth_data_gen import SynthDataGen | |
generator = SynthDataGen() | |
# Update the output format choices based on the selected dataset type
def update_output_format(dataset_type):
    """Return a Gradio update for the output-format dropdown.

    Args:
        dataset_type: The currently selected dataset type
            ("Tabular", "Time-series" or "Text").

    Returns:
        A ``gr.update`` payload. For a recognised dataset type it replaces
        the dropdown choices and resets the selection to "JSON"; for any
        other value it is an empty update that leaves the dropdown as is.
    """
    if dataset_type in ("Tabular", "Time-series"):
        return gr.update(choices=["JSON", "csv", "Parquet"], value="JSON")
    if dataset_type == "Text":
        return gr.update(choices=["JSON", "Markdown"], value="JSON")
    # Unknown/empty selection: falling off the end would return None, which
    # Gradio treats as "set the dropdown value to None". Return an explicit
    # no-op update instead so the current choices and value are preserved.
    return gr.update()
# Seconds a generated file is kept on disk before it is removed.
_AUTO_DELETE_DELAY_S = 60


def _remove_file_quietly(path):
    """Delete *path*, ignoring the case where it has already been removed."""
    try:
        os.remove(path)
    except FileNotFoundError:
        # Runs in a timer thread: a file that is already gone is not an error
        # and should not produce an unhandled-exception traceback.
        pass


def update_pipeline(business_problem, dataset_type, output_format, num_samples, model):
    """Generate a dataset and stream UI updates while doing so.

    This is a generator used as a Gradio event handler. Every yielded value
    is a list of three items, in the order of the outputs wired to the
    button click: the file-download component update, the run-button update
    and the status message text.

    Args:
        business_problem: Free-text description of the dataset to generate.
        dataset_type: Selected dataset type.
        output_format: Selected output format.
        num_samples: Requested number of samples.
        model: Selected model name.

    Yields:
        * A validation error (button visible) when the business problem is
          empty or missing, after which the generator stops.
        * A "generating" status with both the file and the button hidden.
        * Finally either the generated file with a success message, or an
          error message when no file was produced or the pipeline raised.
    """
    # Check if business problem is empty (None is treated like an empty string
    # so the user gets the validation message instead of an AttributeError).
    if not (business_problem or "").strip():
        yield [
            gr.update(visible=False),
            gr.update(visible=True),
            "❌ Please enter a business problem before generating.",
        ]
        return

    # Initial feedback while generating
    yield [gr.update(visible=False), gr.update(visible=False), "⏳ Generating dataset..."]

    try:
        # Generate dataset file
        file_path = generator.generate_dataset(
            business_problem=business_problem,
            dataset_type=dataset_type,
            output_format=output_format,
            num_samples=num_samples,
            model=model,
        )
        print("🧪 File result returned:", file_path)

        # Check if file exists and return success message + file path
        if isinstance(file_path, str) and os.path.exists(file_path):
            # Auto-delete after the delay; the helper tolerates the file
            # having been removed in the meantime.
            threading.Timer(
                _AUTO_DELETE_DELAY_S, _remove_file_quietly, args=[file_path]
            ).start()
            yield [
                gr.update(value=file_path, visible=True),
                gr.update(visible=True),
                "✅ Dataset ready for download.",
            ]
        else:
            # Handle invalid or missing file
            yield [
                gr.update(visible=False),
                gr.update(visible=True),
                "❌ Error: File not created or path invalid.",
            ]
    except Exception as e:
        # Top-level UI boundary: catch and display any errors in the pipeline
        # so the button is shown again instead of the event failing silently.
        yield [gr.update(visible=False), gr.update(visible=True), f"❌ Pipeline error: {e}"]
# Static page fragments rendered at the top of the app.
_INTRO_HTML = """
<div id="intro-text">
<p>With SynthDataGen, easily generate <strong>diverse datasets in different formats</strong> for testing, development, and AI training.</p>
<h4>🎯 How It Works:</h4>
<ol>
<li>1️⃣ Define your business problem or dataset topic.</li>
<li>2️⃣ Select the dataset type, output format, model, and number of samples.</li>
<li>3️⃣ Receive your synthetic dataset — ready to download and use!</li>
</ol>
</div>
"""

_LEARN_MORE_HTML = """
<div id="learn-more-button">
<a href="https://github.com/lisek75/synthdatagen_app/blob/main/README.md" class="button-link" target="_blank">Learn More</a>
</div>
"""

_EXAMPLES_MD = """
<p><strong>🧠 Need inspiration?</strong> Try one of these examples:</p>
<ul>
<li>Movie summaries for genre classification.</li>
<li>Generate customer chats with realistic dialogue, chat_id, timestamp, names, sentiment label, and aligned transcript.</li>
<li>Create daily stock prices for 2 companies with typical fields like date, ticker, open, close, high, low, and volume.</li>
</ul>
"""


def build_ui(css_path="assets/styles.css"):
    """Build the Gradio Blocks interface for the dataset generator.

    Args:
        css_path: Path of the stylesheet whose contents are passed to
            ``gr.Blocks`` as custom CSS.

    Returns:
        A tuple ``(ui, output_dir)`` where ``ui`` is the constructed
        ``gr.Blocks`` app and ``output_dir`` is ``generator.output_dir``.

    Raises:
        OSError: If the stylesheet cannot be opened.
    """
    # Read the stylesheet with an explicit encoding so decoding does not
    # depend on the platform's locale default.
    with open(css_path, "r", encoding="utf-8") as f:
        css = f.read()

    with gr.Blocks(css=css, title="🧬SynthDataGen") as ui:
        with gr.Column(elem_id="app-container"):
            # Header and introductory content
            gr.Markdown("<h1 id='app-title'>SynthDataGen 🧬 </h1>")
            gr.Markdown("<h2 id='app-subtitle'>AI-Powered Synthetic Dataset Generator</h2>")
            gr.HTML(_INTRO_HTML)
            gr.HTML(_LEARN_MORE_HTML)
            gr.Markdown(_EXAMPLES_MD)
            gr.Markdown("<p><strong>Start generating your synthetic datasets now!</strong> 🗂️✨</p>")

            with gr.Group(elem_id="input-container"):
                business_problem = gr.Textbox(
                    placeholder="Describe the dataset you want (e.g., Job postings, Customer reviews, Sensor data, Movie titles)",
                    lines=2,
                    label="📌 Business Problem",
                    elem_classes=["label-box"],
                    elem_id="business-problem-box",
                )

                with gr.Row(elem_classes="column-gap"):
                    with gr.Column(scale=1):
                        dataset_type = gr.Dropdown(
                            ["Tabular", "Time-series", "Text"],
                            value="Tabular",
                            label="📊 Dataset Type",
                            elem_classes=["label-box"],
                            elem_id="custom-dropdown",
                        )
                    with gr.Column(scale=1):
                        output_format = gr.Dropdown(
                            choices=["JSON", "csv", "Parquet"],
                            value="JSON",
                            label="📁 Output Format",
                            elem_classes=["label-box"],
                            elem_id="custom-dropdown",
                        )

                # Bind the update function to the dataset type dropdown
                dataset_type.change(
                    update_output_format,
                    inputs=[dataset_type],
                    outputs=[output_format],
                )

                with gr.Row(elem_classes="row-spacer column-gap"):
                    with gr.Column(scale=1):
                        model = gr.Dropdown(
                            ["GPT", "Claude"],
                            value="GPT",
                            label="🤖 Model",
                            elem_classes=["label-box"],
                            elem_id="custom-dropdown",
                        )
                    with gr.Column(scale=1):
                        num_samples = gr.Slider(
                            minimum=10,
                            maximum=1000,
                            value=10,
                            step=1,
                            interactive=True,
                            label="🔢 Number of Samples",
                            elem_classes=["label-box"],
                        )

                # NOTE(review): the three components below are placed inside
                # the input group; confirm this matches the intended layout.

                # Hidden file component for dataset download
                file_download = gr.File(visible=False, elem_id="download-box", label=None)

                # Component to display status messages
                status_message = gr.Markdown("", label="Status")

                # Button to trigger dataset generation; the handler's yielded
                # lists are applied to these three outputs in this order.
                run_btn = gr.Button("Create a dataset", elem_id="run-btn")
                run_btn.click(
                    update_pipeline,
                    inputs=[business_problem, dataset_type, output_format, num_samples, model],
                    outputs=[file_download, run_btn, status_message],
                )

    return ui, generator.output_dir