import gradio as gr
import pandas as pd
import json
import os
from utils.logger import create_log_entry, log_experiment_results
from utils.file_utils import load_csv, preview_dataframe, get_column_names
from utils.training import train_models
from utils.preprocessing import preprocess_data
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from utils.training import get_model_instance
try:
from skopt import BayesSearchCV
bayes_available = True
except ImportError:
bayes_available = False
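# Module-level session store shared by all callbacks (assumes a single-user app).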
session = {
"raw_df": None,
"df": None,
"features": [],
"target": None,
"columns": [],
"missing_strategy": "drop",
"transformation_text": ""
}
# ---------------------------
# Step 1: File Upload Handler
# ---------------------------
def handle_upload(file):
if file is None:
return "No file uploaded", None, gr.update(choices=[]), gr.update(choices=[])
try:
        df, err = load_csv(file.name)
        if err:
            return f"Error: {err}", None, gr.update(choices=[]), gr.update(choices=[])
        session["uploaded_filename"] = file.name
session["raw_df"] = df.copy()
session["df"] = df.copy() # Initialize processed df as raw df
columns = get_column_names(df)
session["columns"] = columns
return (
"File uploaded successfully!",
preview_dataframe(df),
gr.update(choices=columns, value=[]),
gr.update(choices=columns, value=None)
)
except Exception as e:
return f"Error: {e}", None, gr.update(choices=[]), gr.update(choices=[])
# ---------------------------
# Step 2: Global Missing Value Strategy
# ---------------------------
def save_missing_strategy(missing_strategy):
raw_df = session.get("raw_df")
target_col = session.get("target", "")
if raw_df is None:
return "No data available", None
processed_df = preprocess_data(raw_df.copy(), target_col=target_col, missing_strategy=missing_strategy, transformation_map={})
session["df"] = processed_df
session["missing_strategy"] = missing_strategy # Store in session
return f"Missing value strategy '{missing_strategy}' applied", preview_dataframe(processed_df)
# ---------------------------
# Step 3: Save Features and Target Selection (Filter DataFrame)
# ---------------------------
def save_feature_target_selection(features, target):
if session.get("df") is None:
return "No data available", "", None
session["features"] = features
session["target"] = target
selected_cols = features.copy()
if target and target not in selected_cols:
selected_cols.append(target)
filtered_df = session["df"][selected_cols]
session["df"] = filtered_df
default_trans = ", ".join(["No Transformation"] * len(features)) if features else ""
return f"Selected {len(features)} features and target: {target}", default_trans, preview_dataframe(filtered_df)
# ---------------------------
# Step 4: Save Transformation Options
# ---------------------------
def save_transformation_options(transformation_text):
if session.get("df") is None or not session.get("features"):
return "No data or features available", None
trans_list = [t.strip() for t in transformation_text.split(",")] if transformation_text.strip() != "" else []
if len(trans_list) < len(session["features"]):
trans_list += ["No Transformation"] * (len(session["features"]) - len(trans_list))
    transformation_mapping = dict(zip(session["features"], trans_list))
df = session.get("df").copy()
    def apply_transformations(df, transformation_map):
        # Apply the per-column transformation chosen in Step 4.
        for col, transform in transformation_map.items():
            if transform == "Label Encode":
                # Categorical columns encode directly; anything else is cast to str first.
                if df[col].dtype == "object" or str(df[col].dtype).startswith("category"):
                    df[col] = LabelEncoder().fit_transform(df[col])
                else:
                    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
            elif transform == "Normalize":
                # "Normalize" here means z-score standardization via StandardScaler.
                df[[col]] = StandardScaler().fit_transform(df[[col]])
        return df
processed_df = apply_transformations(df, transformation_mapping)
session["df"] = processed_df
session["transformation_text"] = transformation_text # Store in session
return "Transformation options applied", preview_dataframe(processed_df)
# ---------------------------
# Model Training Function
# ---------------------------
def train_selected_models(experiment_title, selected_models, lr_c, lr_max_iter, dt_max_depth, dt_min_samples_split,
rf_n_estimators, rf_max_depth, svm_c, svm_kernel, nb_var_smoothing,
train_size):
df = session.get("df")
features = session.get("features")
target = session.get("target")
missing_strategy = session.get("missing_strategy", "drop")
transformation_text = session.get("transformation_text", "")
if df is None or not features or target is None or not selected_models:
return "Please ensure data is uploaded, features/target selected, and models chosen."
trans_list = [t.strip() for t in transformation_text.split(",")] if transformation_text.strip() != "" else []
if len(trans_list) < len(features):
trans_list += ["No Transformation"] * (len(features) - len(trans_list))
    transformation_mapping = dict(zip(features, trans_list))
preprocessing_steps = [f"Missing Value: {missing_strategy}"] + [f"{k}: {v}" for k, v in transformation_mapping.items()]
test_size = 1 - train_size
if not set(features).issubset(df.columns):
return "Selected features not found in the processed data."
X = df[features]
y = df[target]
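    # Note: this split is unseeded, so repeated runs produce different folds.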
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
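    # Gather the per-model hyperparameters captured by the UI controls.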
model_params = {
"Logistic Regression": {"C": lr_c, "max_iter": lr_max_iter},
"Decision Tree": {"max_depth": dt_max_depth, "min_samples_split": dt_min_samples_split},
"Random Forest": {"n_estimators": rf_n_estimators, "max_depth": rf_max_depth},
"SVM": {"C": svm_c, "kernel": svm_kernel},
"Naive Bayes": {"var_smoothing": nb_var_smoothing}
}
training_logs = train_models(X_train, X_test, y_train, y_test, selected_models, model_params, preprocessing_steps)
session["trained_models"] = {model: training_logs[model]["model"] for model in selected_models}
session["X_test"] = X_test
session["y_test"] = y_test
experiment_logs = []
for model_name in selected_models:
entry = create_log_entry(
experiment_title,
model_name,
model_params[model_name],
"",
preprocessing_steps,
training_logs[model_name]["metrics"],
training_logs[model_name].get("training_time", 0),
training_logs[model_name]["model"]
)
experiment_logs.append(entry)
log_experiment_results(experiment_logs)
formatted_results = "\n".join([f"{model}: {training_logs[model]['metrics']}" for model in selected_models])
return formatted_results
# ---------------------------
# Hyperparameter Tuning Function (Grid, Random, and Bayesian Search)
# ---------------------------
def run_hyperparameter_tuning(experiment_title, selected_models):
df = session.get("df")
features = session.get("features")
target = session.get("target")
if df is None or not features or target is None or not selected_models:
return "Please ensure data is uploaded, features/target selected, and models chosen.", None
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
strategy_map = {
"Grid Search": GridSearchCV,
"Random Search": RandomizedSearchCV
}
    if bayes_available:
        strategy_map["Bayesian Optimization"] = BayesSearchCV
param_grids = {
"Logistic Regression": {"C": [0.01, 0.1, 1, 10], "max_iter": [100, 200, 300]},
"Decision Tree": {"max_depth": [3, 5, 10, None], "min_samples_split": [2, 5, 10]},
"Random Forest": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]},
"SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
"Naive Bayes": {"var_smoothing": np.logspace(-10, -8, 5)}
}
all_logs = []
status_texts = []
for model_name in selected_models:
best_overall_score = -1
best_overall_summary = None
for strategy_name, strategy_cls in strategy_map.items():
try:
model = get_model_instance(model_name, {})
if strategy_name == "Grid Search":
searcher = strategy_cls(
model,
param_grid=param_grids[model_name],
scoring="accuracy",
cv=5
)
elif strategy_name == "Random Search":
searcher = strategy_cls(
model,
param_distributions=param_grids[model_name],
scoring="accuracy",
cv=5,
n_iter=min(10, len(list(ParameterGrid(param_grids[model_name]))))
)
elif strategy_name == "Bayesian Optimization":
searcher = strategy_cls(
model,
search_spaces=param_grids[model_name],
scoring="accuracy",
cv=5,
n_iter=10
)
else:
continue
searcher.fit(X_train, y_train)
best_estimator = searcher.best_estimator_
best_params = searcher.best_params_
y_train_pred = best_estimator.predict(X_train)
y_test_pred = best_estimator.predict(X_test)
metrics = {
"accuracy_train": accuracy_score(y_train, y_train_pred),
"accuracy_test": accuracy_score(y_test, y_test_pred),
"precision_train": precision_score(y_train, y_train_pred, average='weighted', zero_division=0),
"precision_test": precision_score(y_test, y_test_pred, average='weighted', zero_division=0),
"recall_train": recall_score(y_train, y_train_pred, average='weighted', zero_division=0),
"recall_test": recall_score(y_test, y_test_pred, average='weighted', zero_division=0),
"f1_score_train": f1_score(y_train, y_train_pred, average='weighted', zero_division=0),
"f1_score_test": f1_score(y_test, y_test_pred, average='weighted', zero_division=0)
}
log_entry = create_log_entry(
experiment_title,
f"Hyperparameter Tuned {model_name} ({strategy_name})",
best_params,
"",
[f"Strategy: {strategy_name}"],
metrics,
0,
best_estimator
)
all_logs.append(log_entry)
if searcher.best_score_ > best_overall_score:
best_overall_score = searcher.best_score_
best_overall_summary = f"{model_name} ({strategy_name}):\n" + "\n".join(
[f"{k}: {v:.4f}" for k, v in metrics.items()]
)
            except Exception:
                # A failed strategy (e.g. an unsupported search space) is skipped;
                # the remaining strategies for this model still run.
                continue
if best_overall_summary:
status_texts.append(best_overall_summary)
else:
status_texts.append(f"{model_name}: All tuning strategies failed.")
log_experiment_results(all_logs)
return "\n\n".join(status_texts), "Tuning complete!"
# ---------------------------
# Gradio Interface Layout
# ---------------------------
with gr.Blocks() as demo:
gr.Markdown("## ML Model Builder")
with gr.Tab("Data Upload & Preprocessing"):
# Step 1: File Upload
gr.Markdown("### Step 1: Upload File")
with gr.Row():
file_input = gr.File(label="Upload CSV File", file_types=[".csv"])
upload_status = gr.Textbox(label="Upload Status", interactive=False)
df_preview = gr.Dataframe(label="Raw Data Preview", interactive=False)
# Step 2: Global Missing Value Strategy
gr.Markdown("### Step 2: Global Missing Value Strategy")
missing_strategy_dropdown = gr.Dropdown(
label="Missing Value Strategy",
choices=["drop", "mean", "median", "mode"],
value="drop",
info="Select how to handle missing values for all columns."
)
save_missing_btn = gr.Button("Save Missing Value Strategy")
missing_status = gr.Textbox(label="Missing Strategy Status", interactive=False)
missing_preview = gr.Dataframe(label="Data Preview after Missing Strategy", interactive=False)
# Step 3: Select Features and Target
gr.Markdown("### Step 3: Select Features and Target")
feature_selector = gr.CheckboxGroup(label="Select Input Features", choices=[], interactive=True)
target_selector = gr.Dropdown(label="Select Target Column", choices=[], interactive=True)
save_features_btn = gr.Button("Save Features and Target")
features_status = gr.Textbox(label="Features/Target Status", interactive=False)
features_preview = gr.Dataframe(label="Data Preview after Feature Selection", interactive=False)
# Step 4: Transformation Options
gr.Markdown("### Step 4: Transformation Options")
gr.Markdown(
"For each selected feature (in order), specify a transformation. Allowed options: **No Transformation**, **Label Encode**, **Normalize**. "
"Enter your choices as a comma-separated list. E.g.: No Transformation, Label Encode, Normalize"
)
transformation_text = gr.Textbox(label="Transformation Options", placeholder="E.g. No Transformation, Label Encode, Normalize", lines=1)
save_transformation_btn = gr.Button("Save Transformation Options")
transformation_status = gr.Textbox(label="Transformation Status", interactive=False)
transformation_preview = gr.Dataframe(label="Data Preview after Transformation", interactive=False)
with gr.Tab("Model Training"):
gr.Markdown("### Model Training and Experiment Logging")
# Global Experiment Title Input
experiment_title_input = gr.Textbox(label="Experiment Title", placeholder="Enter a title for this experiment", lines=1)
gr.Markdown("### Model Selection and Hyperparameter Tuning")
model_selector = gr.CheckboxGroup(
label="Select Models to Train",
choices=["Logistic Regression", "Decision Tree", "Random Forest", "SVM", "Naive Bayes"],
value=[], interactive=True
)
with gr.Column(visible=False) as lr_col:
gr.Markdown("**Logistic Regression**")
lr_c = gr.Slider(0.01, 10.0, step=0.01, value=1.0, label="C", interactive=True)
lr_max_iter = gr.Slider(50, 500, step=10, value=100, label="Max Iterations", interactive=True)
with gr.Column(visible=False) as dt_col:
gr.Markdown("**Decision Tree**")
dt_max_depth = gr.Slider(1, 50, step=1, value=10, label="Max Depth", interactive=True)
dt_min_samples_split = gr.Slider(2, 10, step=1, value=2, label="Min Samples Split", interactive=True)
with gr.Column(visible=False) as rf_col:
gr.Markdown("**Random Forest**")
rf_n_estimators = gr.Slider(10, 200, step=10, value=100, label="N Estimators", interactive=True)
rf_max_depth = gr.Slider(1, 50, step=1, value=10, label="Max Depth", interactive=True)
with gr.Column(visible=False) as svm_col:
gr.Markdown("**SVM**")
svm_c = gr.Slider(0.01, 10.0, step=0.01, value=1.0, label="C", interactive=True)
svm_kernel = gr.Radio(["linear", "poly", "rbf", "sigmoid"], value="rbf", label="Kernel", interactive=True)
with gr.Column(visible=False) as nb_col:
gr.Markdown("**Naive Bayes**")
nb_var_smoothing = gr.Slider(1e-10, 1e-5, step=1e-10, value=1e-9, label="Var Smoothing", interactive=True)
model_columns = {
"Logistic Regression": lr_col,
"Decision Tree": dt_col,
"Random Forest": rf_col,
"SVM": svm_col,
"Naive Bayes": nb_col,
}
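        # Show a model's hyperparameter panel only while that model is selected.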
def toggle_model_ui(selected_models):
updates = []
for model_name, panel in model_columns.items():
updates.append(gr.update(visible=(model_name in selected_models)))
return updates
model_selector.change(
fn=toggle_model_ui,
inputs=model_selector,
outputs=[lr_col, dt_col, rf_col, svm_col, nb_col]
)
gr.Markdown("### Training Parameters")
train_slider = gr.Slider(minimum=0.5, maximum=0.9, step=0.05, value=0.8, label="Training Set Size (proportion)", interactive=True)
train_btn = gr.Button("Train Selected Models")
training_output = gr.Textbox(label="Training Output", lines=8, interactive=False)
# ---------------------------
# Define Component Interactions
# ---------------------------
file_input.change(
fn=handle_upload,
inputs=file_input,
outputs=[upload_status, df_preview, feature_selector, target_selector]
)
save_missing_btn.click(
fn=save_missing_strategy,
inputs=missing_strategy_dropdown,
outputs=[missing_status, missing_preview]
)
save_features_btn.click(
fn=save_feature_target_selection,
inputs=[feature_selector, target_selector],
outputs=[features_status, transformation_text, features_preview]
)
save_transformation_btn.click(
fn=save_transformation_options,
inputs=transformation_text,
outputs=[transformation_status, transformation_preview]
)
train_btn.click(
fn=train_selected_models,
inputs=[
experiment_title_input,
model_selector,
lr_c, lr_max_iter,
dt_max_depth, dt_min_samples_split,
rf_n_estimators, rf_max_depth,
svm_c, svm_kernel,
nb_var_smoothing,
train_slider
],
outputs=training_output
)
with gr.Tab("Hyperparameter Tuning"):
gr.Markdown("### Fully Automatic Hyperparameter Tuning")
gr.Markdown(
"This step will automatically tune the selected models using **three search strategies**:\n"
"- **Grid Search**\n"
"- **Random Search**\n"
"- **Bayesian Optimization** (if available)\n\n"
"The best-performing result from each strategy will be logged, and the top strategy will be shown below."
)
experiment_title_hp = gr.Textbox(label="Experiment Title", placeholder="Enter experiment title")
model_selector_hp = gr.CheckboxGroup(
label="Select Models for Auto-Tuning",
choices=["Logistic Regression", "Decision Tree", "Random Forest", "SVM", "Naive Bayes"],
value=[], interactive=True
)
run_tune_btn = gr.Button("Run Hyperparameter Tuning")
        tuning_output = gr.Textbox(label="Tuning Output", lines=10, interactive=False)
        # Hidden component to receive the second return value ("Tuning complete!").
        tuning_status = gr.Textbox(visible=False)
        run_tune_btn.click(
            fn=run_hyperparameter_tuning,
            inputs=[experiment_title_hp, model_selector_hp],
            outputs=[tuning_output, tuning_status]
        )
with gr.Tab("Dashboard"):
        def load_log_dataframe_dynamic():
            log_path = "experiments/logs/experiment_log.jsonl"
if not os.path.exists(log_path):
return pd.DataFrame([{"Message": "No logs found. Train or tune a model."}])
with open(log_path, "r", encoding="utf-8") as f:
lines = f.readlines()
rows = []
for line in lines:
try:
row = json.loads(line)
metrics = row.get("metrics", {})
entry = {
"Experiment": row.get("experiment_title", ""),
"Timestamp": row.get("timestamp", ""),
"Model": row.get("model", ""),
"Training Time (s)": round(row.get("training_time_sec", 0), 4),
"Inference Time (ms)": round(metrics.get("inference_time", 0) * 1000, 4),
"Model Size (bytes)": row.get("model_size_bytes", ""),
"CPU (%)": row.get("system_info", {}).get("cpu_utilization", ""),
"Memory (MB)": row.get("system_info", {}).get("memory_used_mb", ""),
"CPU Name": row.get("system_info", {}).get("cpu", ""),
"Hyperparameters": json.dumps(row.get("hyperparameters", {})),
}
for k, v in metrics.items():
if k != "inference_time":
entry[k] = round(v, 4) if isinstance(v, (float, int)) else v
rows.append(entry)
                except Exception:
                    # Skip lines that are not valid JSON instead of failing the whole table.
                    continue
return pd.DataFrame(rows)
        refresh_button = gr.Button("🔄 Refresh Dashboard")
dashboard_table = gr.Dataframe(
value=load_log_dataframe_dynamic(),
interactive=True,
wrap=False,
)
refresh_button.click(
fn=load_log_dataframe_dynamic,
outputs=dashboard_table,
)
with gr.Tab("Summary"):
gr.Markdown("### πŸ” Best Models by Metric")
        gr.Markdown(
            "- ✅ Automatically finds the **best model** for each evaluation metric from all logged experiments.\n"
            "- 🔄 Use the **Refresh** button to update this view after new training or tuning."
        )
summary_df = gr.Dataframe(label="Top Models by Metric", interactive=False)
        def refresh_summary():
            log_path = "experiments/logs/experiment_log.jsonl"
if not os.path.exists(log_path):
return pd.DataFrame([{"Message": "No logs found. Train or tune a model first."}])
df = pd.read_json(log_path, lines=True)
metric_keys = [
"accuracy_test", "precision_test", "recall_test", "f1_score_test"
]
best_rows = []
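            # Scan every logged run and keep the single best entry per test metric.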
for metric in metric_keys:
best = None
best_score = -float("inf")
for _, row in df.iterrows():
score = row.get("metrics", {}).get(metric)
if isinstance(score, (int, float)) and score > best_score:
best = row
best_score = score
if best is not None:
best_rows.append({
"Metric": metric,
"Best Score": round(best_score, 4),
"Model": best.get("model"),
"Experiment": best.get("experiment_title"),
"Timestamp": best.get("timestamp"),
"Hyperparameters": json.dumps(best.get("hyperparameters", {})),
})
summary_df_result = pd.DataFrame(best_rows)
if not summary_df_result.empty:
return summary_df_result
else:
return pd.DataFrame([{"Message": "No valid metrics found in logs."}])
        refresh_btn = gr.Button("🔄 Refresh")
refresh_btn.click(fn=refresh_summary, outputs=summary_df)
        # Populate the summary table when the app first loads.
        demo.load(fn=refresh_summary, outputs=summary_df)
demo.launch(ssr_mode=False)