Spaces:

Dhananjaykhengare
/

ml_model_builder

Running

File size: 25,468 Bytes

b9a43be

import gradio as gr
import pandas as pd
import json
import os
from utils.logger import create_log_entry, log_experiment_results
from utils.file_utils import load_csv, preview_dataframe, get_column_names
from utils.training import train_models
from utils.preprocessing import preprocess_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from utils.training import get_model_instance
try:
    from skopt import BayesSearchCV
    bayes_available = True
except ImportError:
    bayes_available = False


# Module-level state shared by every Gradio callback in this file.
session = dict(
    raw_df=None,  # DataFrame as loaded from the uploaded file
    df=None,  # working DataFrame that the later steps overwrite
    features=[],
    target=None,
    columns=[],
    missing_strategy="drop",
    transformation_text="",
)
# ---------------------------
# Dashboard
# ---------------------------

# ---------------------------
# Step 1: File Upload Handler
# ---------------------------
def handle_upload(file):
    """Load an uploaded CSV into the session and reset the column selectors.

    Args:
        file: Value delivered by the ``gr.File`` input, or ``None`` when the
            input was cleared. Its ``name`` attribute is used as the path; a
            plain path string is accepted as well.

    Returns:
        tuple: ``(status, preview, feature_update, target_update)`` in the
        order of the outputs wired to ``file_input.change``.
    """
    def _failure(message):
        # Fresh update objects per call so the two selectors never share one.
        return message, None, gr.update(choices=[]), gr.update(choices=[])

    if file is None:
        return _failure("No file uploaded")
    try:
        path = getattr(file, "name", file)
        df, err = load_csv(path)
        if err:
            return _failure(f"Error: {err}")
        columns = get_column_names(df)
        preview = preview_dataframe(df)
        raw_copy = df.copy()
        working_copy = df.copy()  # Initialize processed df as raw df
    except Exception as e:
        return _failure(f"Error: {e}")

    # Mutate the session only once the load is known to have succeeded, and
    # drop selections made for a previous dataset: the selectors are reset in
    # the UI below, so the session must not keep stale column names.
    session.update(
        uploaded_filename=path,
        raw_df=raw_copy,
        df=working_copy,
        columns=columns,
        features=[],
        target=None,
        transformation_text="",
    )
    return (
        "File uploaded successfully!",
        preview,
        gr.update(choices=columns, value=[]),
        gr.update(choices=columns, value=None),
    )

# ---------------------------
# Step 2: Global Missing Value Strategy
# ---------------------------


def save_missing_strategy(missing_strategy):
    """Rebuild the working DataFrame from the raw upload with the given strategy.

    Args:
        missing_strategy: Strategy name forwarded unchanged to ``preprocess_data``.

    Returns:
        tuple: ``(status message, preview)``; the preview is ``None`` when no
        file has been loaded yet.
    """
    source_df = session.get("raw_df")
    # NOTE: "target" always exists in ``session``, so the "" default is not
    # used; an unset target reaches preprocess_data as None.
    target_name = session.get("target", "")
    if source_df is None:
        return "No data available", None

    cleaned = preprocess_data(
        source_df.copy(),
        target_col=target_name,
        missing_strategy=missing_strategy,
        transformation_map={},
    )
    session["missing_strategy"] = missing_strategy  # remembered for experiment logs
    session["df"] = cleaned
    status = f"Missing value strategy '{missing_strategy}' applied"
    return status, preview_dataframe(cleaned)


# ---------------------------
# Step 3: Save Features and Target Selection (Filter DataFrame)
# ---------------------------
def save_feature_target_selection(features, target):
    """Store the chosen features/target and narrow the working DataFrame to them.

    Args:
        features: Column names ticked in the feature selector (may be empty
            or ``None``).
        target: Column name chosen as the target, or ``None``.

    Returns:
        tuple: ``(status, default transformation text, preview)``. When a
        requested column is not present in the working DataFrame the session
        is left untouched, the transformation textbox is left as it is and the
        preview is ``None``.
    """
    current_df = session.get("df")
    if current_df is None:
        return "No data available", "", None

    features = list(features or [])
    selected_cols = list(features)
    if target and target not in selected_cols:
        selected_cols.append(target)

    # This step replaces session["df"] with the narrowed frame, so a later
    # selection can name columns that are no longer there. Report that
    # instead of failing with a KeyError.
    unavailable = [col for col in selected_cols if col not in current_df.columns]
    if unavailable:
        return (
            "Columns not available in the current data: "
            + ", ".join(map(str, unavailable))
            + ". Re-upload the file or re-apply the missing value strategy to start from the full dataset.",
            gr.update(),
            None,
        )

    session["features"] = features
    session["target"] = target
    # Copy so later in-place edits never write into a slice of the old frame.
    filtered_df = current_df[selected_cols].copy()
    session["df"] = filtered_df
    default_trans = ", ".join(["No Transformation"] * len(features)) if features else ""
    return f"Selected {len(features)} features and target: {target}", default_trans, preview_dataframe(filtered_df)

# ---------------------------
# Step 4: Save Transformation Options
# ---------------------------
def save_transformation_options(transformation_text):
    """Apply per-feature transformations to the working DataFrame.

    Args:
        transformation_text: Comma-separated options, one per selected feature
            in order. ``"Label Encode"`` and ``"Normalize"`` are acted on;
            missing trailing entries default to ``"No Transformation"``.

    Returns:
        tuple: ``(status, preview)``. If a transformation fails the session is
        left unchanged and the preview is ``None``.
    """
    features = session.get("features")
    if session.get("df") is None or not features:
        return "No data or features available", None

    text = transformation_text or ""
    requested = [t.strip() for t in text.split(",")] if text.strip() else []
    # A negative repeat count yields [], so surplus entries are simply ignored.
    requested += ["No Transformation"] * (len(features) - len(requested))
    transformation_mapping = dict(zip(features, requested))

    df = session["df"].copy()  # work on a copy so a failure leaves the session intact
    ignored = []
    col = None
    try:
        for col, transform in transformation_mapping.items():
            if transform == "Label Encode":
                values = df[col]
                is_text = values.dtype == "object" or str(values.dtype).startswith("category")
                # Non-text columns are stringified first, as before.
                df[col] = LabelEncoder().fit_transform(values if is_text else values.astype(str))
            elif transform == "Normalize":
                df[[col]] = StandardScaler().fit_transform(df[[col]])
            elif transform not in ("", "No Transformation"):
                ignored.append(f"{col}: {transform}")
    except (KeyError, TypeError, ValueError) as err:
        return f"Error applying transformation to '{col}': {err}", None

    session["df"] = df
    session["transformation_text"] = text  # Store in session
    status = "Transformation options applied"
    if ignored:
        status += " (ignored unrecognized options: " + "; ".join(ignored) + ")"
    return status, preview_dataframe(df)

# ---------------------------
# Model Training Function
# ---------------------------
def train_selected_models(experiment_title, selected_models, lr_c, lr_max_iter, dt_max_depth, dt_min_samples_split,
                          rf_n_estimators, rf_max_depth, svm_c, svm_kernel, nb_var_smoothing,
                          train_size):
    """Train the selected models on the session data and log one entry per model.

    Args:
        experiment_title: Title stored with every log entry.
        selected_models: Model names ticked in the UI.
        lr_c, lr_max_iter: Logistic Regression hyperparameters.
        dt_max_depth, dt_min_samples_split: Decision Tree hyperparameters.
        rf_n_estimators, rf_max_depth: Random Forest hyperparameters.
        svm_c, svm_kernel: SVM hyperparameters.
        nb_var_smoothing: Naive Bayes hyperparameter.
        train_size: Proportion of rows used for training; the rest is the test set.

    Returns:
        str: One ``"<model>: <metrics>"`` line per trained model, or a message
        describing what is missing.
    """
    df = session.get("df")
    features = session.get("features")
    target = session.get("target")
    if df is None or not features or target is None or not selected_models:
        return "Please ensure data is uploaded, features/target selected, and models chosen."
    if not set(features).issubset(df.columns):
        return "Selected features not found in the processed data."
    # The target needs the same guard as the features, otherwise df[target]
    # raises KeyError when the working frame no longer has that column.
    if target not in df.columns:
        return "Selected target not found in the processed data."

    missing_strategy = session.get("missing_strategy", "drop")
    transformation_text = session.get("transformation_text", "") or ""
    trans_list = [t.strip() for t in transformation_text.split(",")] if transformation_text.strip() else []
    trans_list += ["No Transformation"] * (len(features) - len(trans_list))
    transformation_mapping = dict(zip(features, trans_list))
    preprocessing_steps = [f"Missing Value: {missing_strategy}"] + [
        f"{name}: {choice}" for name, choice in transformation_mapping.items()
    ]

    def _as_int(value):
        # Defensive: whole-number slider values may arrive as floats (100.0).
        return value if value is None else int(value)

    X = df[features]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 - train_size)

    model_params = {
        "Logistic Regression": {"C": lr_c, "max_iter": _as_int(lr_max_iter)},
        "Decision Tree": {
            "max_depth": _as_int(dt_max_depth),
            "min_samples_split": _as_int(dt_min_samples_split),
        },
        "Random Forest": {
            "n_estimators": _as_int(rf_n_estimators),
            "max_depth": _as_int(rf_max_depth),
        },
        "SVM": {"C": svm_c, "kernel": svm_kernel},
        "Naive Bayes": {"var_smoothing": nb_var_smoothing},
    }
    training_logs = train_models(X_train, X_test, y_train, y_test, selected_models, model_params, preprocessing_steps)

    # Keep the fitted models and the held-out split in the session.
    session["trained_models"] = {model: training_logs[model]["model"] for model in selected_models}
    session["X_test"] = X_test
    session["y_test"] = y_test

    experiment_logs = [
        create_log_entry(
            experiment_title,
            model_name,
            model_params[model_name],
            "",
            preprocessing_steps,
            training_logs[model_name]["metrics"],
            training_logs[model_name].get("training_time", 0),
            training_logs[model_name]["model"],
        )
        for model_name in selected_models
    ]
    log_experiment_results(experiment_logs)
    return "\n".join(f"{model}: {training_logs[model]['metrics']}" for model in selected_models)

# ---------------------------
# Hyperparameter Tuning Function (Grid, Random and optional Bayesian Search)
# ---------------------------
def run_hyperparameter_tuning(experiment_title, selected_models):
    """Tune each selected model with every available search strategy.

    Grid Search and Random Search are always tried; Bayesian Optimization is
    added when ``bayes_available`` is true. Every successful search is logged
    through ``create_log_entry`` / ``log_experiment_results``; for each model
    the strategy with the highest cross-validated score is summarised.

    Args:
        experiment_title: Title stored with every log entry.
        selected_models: Model names to tune.

    Returns:
        tuple: ``(summary text, completion message)``. The second item is
        ``None`` when the inputs are incomplete.
    """
    df = session.get("df")
    features = session.get("features")
    target = session.get("target")

    if df is None or not features or target is None or not selected_models:
        return "Please ensure data is uploaded, features/target selected, and models chosen.", None

    absent = [col for col in [*features, target] if col not in df.columns]
    if absent:
        return "Selected columns not found in the processed data: " + ", ".join(map(str, absent)), None

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    strategy_map = {
        "Grid Search": GridSearchCV,
        "Random Search": RandomizedSearchCV
    }
    if bayes_available:
        # BayesSearchCV is bound at module import time when skopt is installed.
        strategy_map["Bayesian Optimization"] = BayesSearchCV

    param_grids = {
        "Logistic Regression": {"C": [0.01, 0.1, 1, 10], "max_iter": [100, 200, 300]},
        "Decision Tree": {"max_depth": [3, 5, 10, None], "min_samples_split": [2, 5, 10]},
        "Random Forest": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]},
        "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
        "Naive Bayes": {"var_smoothing": np.logspace(-10, -8, 5)}
    }

    all_logs = []
    status_texts = []

    for model_name in selected_models:
        best_overall_score = float("-inf")
        best_overall_summary = None
        failures = []  # "<strategy>: <reason>" for every strategy that raised

        for strategy_name, strategy_cls in strategy_map.items():
            try:
                grid = param_grids[model_name]
                model = get_model_instance(model_name, {})

                if strategy_name == "Grid Search":
                    searcher = strategy_cls(
                        model,
                        param_grid=grid,
                        scoring="accuracy",
                        cv=5
                    )
                elif strategy_name == "Random Search":
                    searcher = strategy_cls(
                        model,
                        param_distributions=grid,
                        scoring="accuracy",
                        cv=5,
                        # Never ask for more samples than the grid has points.
                        n_iter=min(10, len(ParameterGrid(grid)))
                    )
                elif strategy_name == "Bayesian Optimization":
                    searcher = strategy_cls(
                        model,
                        search_spaces=grid,
                        scoring="accuracy",
                        cv=5,
                        n_iter=10
                    )
                else:
                    continue

                searcher.fit(X_train, y_train)
                best_estimator = searcher.best_estimator_
                best_params = searcher.best_params_

                y_train_pred = best_estimator.predict(X_train)
                y_test_pred = best_estimator.predict(X_test)

                metrics = {
                    "accuracy_train": accuracy_score(y_train, y_train_pred),
                    "accuracy_test": accuracy_score(y_test, y_test_pred),
                    "precision_train": precision_score(y_train, y_train_pred, average='weighted', zero_division=0),
                    "precision_test": precision_score(y_test, y_test_pred, average='weighted', zero_division=0),
                    "recall_train": recall_score(y_train, y_train_pred, average='weighted', zero_division=0),
                    "recall_test": recall_score(y_test, y_test_pred, average='weighted', zero_division=0),
                    "f1_score_train": f1_score(y_train, y_train_pred, average='weighted', zero_division=0),
                    "f1_score_test": f1_score(y_test, y_test_pred, average='weighted', zero_division=0)
                }

                log_entry = create_log_entry(
                    experiment_title,
                    f"Hyperparameter Tuned {model_name} ({strategy_name})",
                    best_params,
                    "",
                    [f"Strategy: {strategy_name}"],
                    metrics,
                    0,
                    best_estimator
                )
                all_logs.append(log_entry)

                if searcher.best_score_ > best_overall_score:
                    best_overall_score = searcher.best_score_
                    best_overall_summary = f"{model_name} ({strategy_name}):\n" + "\n".join(
                        [f"{k}: {v:.4f}" for k, v in metrics.items()]
                    )

            except Exception as e:
                # Best effort: one failing strategy must not stop the others,
                # but keep the reason so it can be shown to the user.
                failures.append(f"{strategy_name}: {e}")
                continue

        failure_note = "\n".join(f"  - {reason}" for reason in failures)
        if best_overall_summary:
            if failures:
                best_overall_summary += "\nFailed strategies:\n" + failure_note
            status_texts.append(best_overall_summary)
        else:
            message = f"{model_name}: All tuning strategies failed."
            if failures:
                message += "\n" + failure_note
            status_texts.append(message)

    log_experiment_results(all_logs)
    return "\n\n".join(status_texts), "Tuning complete!"




###--------------------dashboard


###--------------------dashboard




# ---------------------------
# Gradio Interface Layout
# ---------------------------
# Components created inside this context belong to the `demo` Blocks app.
with gr.Blocks() as demo:
    gr.Markdown("## ML Model Builder")

    with gr.Tab("Data Upload & Preprocessing"):
        # Step 1: File Upload
        gr.Markdown("### Step 1: Upload File")
        with gr.Row():
            file_input = gr.File(label="Upload CSV File", file_types=[".csv"])
            upload_status = gr.Textbox(label="Upload Status", interactive=False)
        df_preview = gr.Dataframe(label="Raw Data Preview", interactive=False)

        # Step 2: Global Missing Value Strategy
        gr.Markdown("### Step 2: Global Missing Value Strategy")
        missing_strategy_dropdown = gr.Dropdown(
            label="Missing Value Strategy",
            choices=["drop", "mean", "median", "mode"],
            value="drop",
            info="Select how to handle missing values for all columns."
        )
        save_missing_btn = gr.Button("Save Missing Value Strategy")
        missing_status = gr.Textbox(label="Missing Strategy Status", interactive=False)
        missing_preview = gr.Dataframe(label="Data Preview after Missing Strategy", interactive=False)

        # Step 3: Select Features and Target
        # Choices start empty; handle_upload fills them with the file's columns.
        gr.Markdown("### Step 3: Select Features and Target")
        feature_selector = gr.CheckboxGroup(label="Select Input Features", choices=[], interactive=True)
        target_selector = gr.Dropdown(label="Select Target Column", choices=[], interactive=True)
        save_features_btn = gr.Button("Save Features and Target")
        features_status = gr.Textbox(label="Features/Target Status", interactive=False)
        features_preview = gr.Dataframe(label="Data Preview after Feature Selection", interactive=False)

        # Step 4: Transformation Options
        gr.Markdown("### Step 4: Transformation Options")
        gr.Markdown(
            "For each selected feature (in order), specify a transformation. Allowed options: **No Transformation**, **Label Encode**, **Normalize**. "
            "Enter your choices as a comma-separated list. E.g.: No Transformation, Label Encode, Normalize"
        )
        transformation_text = gr.Textbox(label="Transformation Options", placeholder="E.g. No Transformation, Label Encode, Normalize", lines=1)
        save_transformation_btn = gr.Button("Save Transformation Options")
        transformation_status = gr.Textbox(label="Transformation Status", interactive=False)
        transformation_preview = gr.Dataframe(label="Data Preview after Transformation", interactive=False)

    with gr.Tab("Model Training"):
        gr.Markdown("### Model Training and Experiment Logging")
        # Global Experiment Title Input
        experiment_title_input = gr.Textbox(label="Experiment Title", placeholder="Enter a title for this experiment", lines=1)

        gr.Markdown("### Model Selection and Hyperparameter Tuning")
        model_selector = gr.CheckboxGroup(
            label="Select Models to Train",
            choices=["Logistic Regression", "Decision Tree", "Random Forest", "SVM", "Naive Bayes"],
            value=[], interactive=True
        )
        # One hyperparameter panel per model; all start hidden and are shown
        # by toggle_model_ui when the matching model is ticked.
        with gr.Column(visible=False) as lr_col:
            gr.Markdown("**Logistic Regression**")
            lr_c = gr.Slider(0.01, 10.0, step=0.01, value=1.0, label="C", interactive=True)
            lr_max_iter = gr.Slider(50, 500, step=10, value=100, label="Max Iterations", interactive=True)
        with gr.Column(visible=False) as dt_col:
            gr.Markdown("**Decision Tree**")
            dt_max_depth = gr.Slider(1, 50, step=1, value=10, label="Max Depth", interactive=True)
            dt_min_samples_split = gr.Slider(2, 10, step=1, value=2, label="Min Samples Split", interactive=True)
        with gr.Column(visible=False) as rf_col:
            gr.Markdown("**Random Forest**")
            rf_n_estimators = gr.Slider(10, 200, step=10, value=100, label="N Estimators", interactive=True)
            rf_max_depth = gr.Slider(1, 50, step=1, value=10, label="Max Depth", interactive=True)
        with gr.Column(visible=False) as svm_col:
            gr.Markdown("**SVM**")
            svm_c = gr.Slider(0.01, 10.0, step=0.01, value=1.0, label="C", interactive=True)
            svm_kernel = gr.Radio(["linear", "poly", "rbf", "sigmoid"], value="rbf", label="Kernel", interactive=True)
        with gr.Column(visible=False) as nb_col:
            gr.Markdown("**Naive Bayes**")
            nb_var_smoothing = gr.Slider(1e-10, 1e-5, step=1e-10, value=1e-9, label="Var Smoothing", interactive=True)

        # Insertion order matters: toggle_model_ui returns one update per entry
        # in this order, and the outputs list of model_selector.change mirrors it.
        model_columns = {
            "Logistic Regression": lr_col,
            "Decision Tree": dt_col,
            "Random Forest": rf_col,
            "SVM": svm_col,
            "Naive Bayes": nb_col,
        }
    
        def toggle_model_ui(selected_models):
            """Return one visibility update per hyperparameter panel.

            The updates follow the iteration order of ``model_columns``; a
            panel is visible only while its model name is in ``selected_models``.
            """
            return [
                gr.update(visible=(name in selected_models))
                for name in model_columns
            ]
    
        # Show/hide the hyperparameter panels whenever the model selection
        # changes. The outputs are listed in the same order as model_columns,
        # which is the order toggle_model_ui returns its updates in.
        model_selector.change(
            fn=toggle_model_ui,
            inputs=model_selector,
            outputs=[lr_col, dt_col, rf_col, svm_col, nb_col]
        )

        gr.Markdown("### Training Parameters")
        # The slider gives the training proportion; train_selected_models uses
        # 1 - value as the test size.
        train_slider = gr.Slider(minimum=0.5, maximum=0.9, step=0.05, value=0.8, label="Training Set Size (proportion)", interactive=True)
        train_btn = gr.Button("Train Selected Models")
        training_output = gr.Textbox(label="Training Output", lines=8, interactive=False)
    

# ---------------------------
# Define Component Interactions
# ---------------------------
    # Uploading (or clearing) the file refreshes the status, the raw preview
    # and the choices of both column selectors.
    file_input.change(
        fn=handle_upload,
        inputs=file_input,
        outputs=[upload_status, df_preview, feature_selector, target_selector]
    )

    save_missing_btn.click(
        fn=save_missing_strategy,
        inputs=missing_strategy_dropdown,
        outputs=[missing_status, missing_preview]
    )

    # The second output pre-fills the transformation textbox with one
    # "No Transformation" entry per selected feature.
    save_features_btn.click(
        fn=save_feature_target_selection,
        inputs=[feature_selector, target_selector],
        outputs=[features_status, transformation_text, features_preview]
    )

    save_transformation_btn.click(
        fn=save_transformation_options,
        inputs=transformation_text,
        outputs=[transformation_status, transformation_preview]
    )

    # The inputs are passed positionally, so their order must match the
    # parameter order of train_selected_models.
    train_btn.click(
        fn=train_selected_models,
        inputs=[
            experiment_title_input,
            model_selector,
            lr_c, lr_max_iter,
            dt_max_depth, dt_min_samples_split,
            rf_n_estimators, rf_max_depth,
            svm_c, svm_kernel,
            nb_var_smoothing,
            train_slider
        ],
        outputs=training_output
    )
    # Tab that runs every available search strategy for the ticked models.
    with gr.Tab("Hyperparameter Tuning"):
        gr.Markdown("### Fully Automatic Hyperparameter Tuning")
        gr.Markdown(
            "This step will automatically tune the selected models using **three search strategies**:\n"
            "- **Grid Search**\n"
            "- **Random Search**\n"
            "- **Bayesian Optimization** (if available)\n\n"
            "The best-performing result from each strategy will be logged, and the top strategy will be shown below."
        )
        experiment_title_hp = gr.Textbox(label="Experiment Title", placeholder="Enter experiment title")
        model_selector_hp = gr.CheckboxGroup(
            label="Select Models for Auto-Tuning",
            choices=["Logistic Regression", "Decision Tree", "Random Forest", "SVM", "Naive Bayes"],
            value=[], interactive=True
        )
        run_tune_btn = gr.Button("Run Hyperparameter Tuning")
        tuning_output = gr.Textbox(label="Tuning Output", lines=10, interactive=False)

        # run_hyperparameter_tuning returns two values; the second one goes to
        # a hidden textbox created inline here.
        run_tune_btn.click(
            fn=run_hyperparameter_tuning,
            inputs=[experiment_title_hp, model_selector_hp],
            outputs=[tuning_output, gr.Textbox(visible=False)]
        )
    # Tab listing every logged experiment as a table.
    with gr.Tab("Dashboard"):
        # NOTE(review): log_df is not referenced anywhere else in the visible code.
        log_df = gr.State(pd.DataFrame())

        def load_log_dataframe_dynamic():
            """Read the JSONL experiment log into a flat table for the dashboard.

            Every readable line becomes one row holding the fixed columns built
            below plus one column per metric (``inference_time`` is reported
            in milliseconds instead). Blank or malformed lines are skipped.

            Returns:
                pandas.DataFrame: One row per readable log entry, or a
                single-row frame with a ``Message`` column when the log file
                is missing or contains no readable entry.
            """
            no_logs = [{"Message": "No logs found. Train or tune a model."}]
            log_path = "experiments/logs/experiment_log.jsonl"
            if not os.path.exists(log_path):
                return pd.DataFrame(no_logs)

            rows = []
            with open(log_path, "r", encoding="utf-8") as f:
                for line in f:
                    if not line.strip():
                        continue
                    try:
                        row = json.loads(line)
                        # ``or`` also covers explicit nulls, which would
                        # otherwise make round()/.get() fail and drop the row.
                        metrics = row.get("metrics") or {}
                        system_info = row.get("system_info") or {}
                        entry = {
                            "Experiment": row.get("experiment_title", ""),
                            "Timestamp": row.get("timestamp", ""),
                            "Model": row.get("model", ""),
                            "Training Time (s)": round(row.get("training_time_sec") or 0, 4),
                            "Inference Time (ms)": round((metrics.get("inference_time") or 0) * 1000, 4),
                            "Model Size (bytes)": row.get("model_size_bytes", ""),
                            "CPU (%)": system_info.get("cpu_utilization", ""),
                            "Memory (MB)": system_info.get("memory_used_mb", ""),
                            "CPU Name": system_info.get("cpu", ""),
                            "Hyperparameters": json.dumps(row.get("hyperparameters", {})),
                        }
                        for k, v in metrics.items():
                            if k != "inference_time":
                                entry[k] = round(v, 4) if isinstance(v, (float, int)) else v
                    except (ValueError, TypeError, AttributeError):
                        # Malformed JSON or an unexpected shape: skip this line only.
                        continue
                    rows.append(entry)

            if not rows:
                return pd.DataFrame(no_logs)
            return pd.DataFrame(rows)

        refresh_button = gr.Button("🔄 Refresh Dashboard")
        # Pass the function itself rather than its result: Gradio then calls
        # it on every page load, so the table is not frozen at the contents
        # the log had when the app was built.
        dashboard_table = gr.Dataframe(
            value=load_log_dataframe_dynamic,
            interactive=True,
            wrap=False,
        )

        # Manual reload for entries logged while the page is open.
        refresh_button.click(
            fn=load_log_dataframe_dynamic,
            outputs=dashboard_table,
        )

    # Tab showing, per test metric, the logged run with the highest score.
    with gr.Tab("Summary"):

        gr.Markdown("### 🔍 Best Models by Metric")
        gr.Markdown(
            "- ✅ Automatically finds the **best model** for each evaluation metric from all logged experiments.\n"
            "- 🔁 Use the **Refresh** button to update this view after new training or tuning."
        )

        # Filled by refresh_summary, which is defined right below.
        summary_df = gr.Dataframe(label="Top Models by Metric", interactive=False)

        def refresh_summary():
            """Find the best logged run for each test metric.

            Reads the JSONL experiment log and, for each of ``accuracy_test``,
            ``precision_test``, ``recall_test`` and ``f1_score_test``, picks
            the entry with the highest numeric score.

            Returns:
                pandas.DataFrame: One row per metric that has at least one
                numeric score, or a single-row frame with a ``Message`` column
                when the log is missing, unreadable or has no usable metrics.
            """
            log_path = "experiments/logs/experiment_log.jsonl"
            if not os.path.exists(log_path):
                return pd.DataFrame([{"Message": "No logs found. Train or tune a model first."}])

            try:
                df = pd.read_json(log_path, lines=True)
            except ValueError as err:
                # An empty or corrupt log must not take the whole app down.
                return pd.DataFrame([{"Message": f"Could not read experiment log: {err}"}])

            metric_keys = [
                "accuracy_test", "precision_test", "recall_test", "f1_score_test"
            ]

            best_rows = []

            for metric in metric_keys:
                best = None
                best_score = -float("inf")

                for _, row in df.iterrows():
                    metrics = row.get("metrics")
                    # Entries logged without metrics show up as NaN, not a dict.
                    if not isinstance(metrics, dict):
                        continue
                    score = metrics.get(metric)
                    if isinstance(score, (int, float)) and score > best_score:
                        best = row
                        best_score = score

                if best is not None:
                    hyperparameters = best.get("hyperparameters", {})
                    if not isinstance(hyperparameters, dict):
                        hyperparameters = {}
                    best_rows.append({
                        "Metric": metric,
                        "Best Score": round(best_score, 4),
                        "Model": best.get("model"),
                        "Experiment": best.get("experiment_title"),
                        "Timestamp": best.get("timestamp"),
                        "Hyperparameters": json.dumps(hyperparameters),
                    })

            if best_rows:
                return pd.DataFrame(best_rows)
            return pd.DataFrame([{"Message": "No valid metrics found in logs."}])

        refresh_btn = gr.Button("🔁 Refresh")
        refresh_btn.click(fn=refresh_summary, outputs=summary_df)

        # Load initial data through a load event instead of assigning a raw
        # DataFrame to ``summary_df.value``: the event output goes through the
        # component's normal output handling and is refreshed on every page load.
        demo.load(fn=refresh_summary, outputs=summary_df)


# Start the app when this module runs; it is not wrapped in a __main__ guard,
# so importing the module launches it as well.
demo.launch(ssr_mode=False)