Spaces:

mohitkumarrajbadi
/

Finetune_Gemma_Model

Build error

File size: 13,370 Bytes

import io
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st

from utils import (
    load_dataset,
    save_dataset,
    clean_dataset,
    compute_dataset_score,
    detect_outliers,
    apply_transformation,
    list_datasets,
    detect_inconsistent_types
)

# -------------------------------
# Constants & Setup
# -------------------------------
# Folder (relative to the working directory) that datasets are listed from and saved into.
DATASET_DIR = "datasets"
# Default dataset file name; not referenced anywhere else in the visible part of this script.
DEFAULT_DATASET = "train_data.csv"
os.makedirs(DATASET_DIR, exist_ok=True)  # Ensure directory exists

# -------------------------------
# Sidebar: Dataset Selection
# -------------------------------
st.sidebar.header("📊 Dataset Selection")

# List available datasets from the datasets folder
available_datasets = list_datasets(DATASET_DIR)
dataset_choice = st.sidebar.radio(
    "Choose Dataset Source:",
    ["Select Existing Dataset", "Upload New Dataset"],
)

# Path of the dataset the rest of the script works on; stays None until the
# user selects an existing dataset or uploads a new one.
dataset_path = None

if dataset_choice == "Select Existing Dataset":
    if available_datasets:
        selected_dataset = st.sidebar.selectbox("Select Dataset:", available_datasets)
        dataset_path = os.path.join(DATASET_DIR, selected_dataset)
        st.sidebar.success(f"Using `{selected_dataset}` dataset.")
    else:
        st.sidebar.warning("No datasets found. Please upload a new dataset.")
elif dataset_choice == "Upload New Dataset":
    uploaded_file = st.sidebar.file_uploader(
        "Upload Dataset (CSV, JSON, or Excel)", type=["csv", "json", "xlsx"]
    )
    if uploaded_file:
        # The file name is supplied by the client: keep only its last path
        # component so the saved copy cannot land outside DATASET_DIR.
        upload_name = os.path.basename(uploaded_file.name)
        file_ext = upload_name.split('.')[-1].lower()

        # Stays None when the extension matches none of the supported formats.
        new_df = None
        try:
            if file_ext == "csv":
                new_df = pd.read_csv(uploaded_file)
            elif file_ext == "json":
                # Needs the module-level `import json`.
                new_df = pd.json_normalize(json.load(uploaded_file))
            elif file_ext == "xlsx":
                new_df = pd.read_excel(uploaded_file)
        except Exception as e:
            # Any parser failure is reported to the user and ends this run.
            st.error(f"Error reading file: {e}")
            st.stop()

        if new_df is None:
            st.error("Unsupported file format.")
            st.stop()

        # Save the new dataset with its filename
        dataset_path = os.path.join(DATASET_DIR, upload_name)
        save_dataset(new_df, dataset_path)
        st.sidebar.success(f"Dataset `{upload_name}` uploaded successfully!")
        available_datasets = list_datasets(DATASET_DIR)  # Refresh list
    else:
        st.sidebar.warning("Please upload a dataset.")

# -------------------------------
# Load the Selected Dataset
# -------------------------------
# Bind `df` for every tab below: the loaded dataset when a path was chosen in
# the sidebar, otherwise an empty placeholder frame.
if not dataset_path:
    df = pd.DataFrame()
    st.warning("No dataset selected. Please choose or upload a dataset.")
else:
    df = load_dataset(dataset_path)
    if df.empty:
        st.warning("Dataset is empty or failed to load.")

# -------------------------------
# Main App Title & Description
# -------------------------------
st.title("📊 The Data Hub")

# -------------------------------
# Tabs for Operations
# -------------------------------
# Labels in display order; the sections below address each tab by its
# position in this list.
tab_labels = [
    "View & Summary",
    "Clean Data",
    "Visualize Data",
    "Data Profiling",
    "Outlier Detection",
    "Custom Transformations",
    "Export",
]
tabs = st.tabs(tab_labels)

# -------------------------------
# Tab 1: View & Summary
# -------------------------------
with tabs[0]:
    st.subheader("📋 Current Dataset Preview")
    if df.empty:
        st.warning("No dataset available. Please choose or upload a dataset.")
    else:
        # Full table first, then descriptive statistics over all column types.
        st.dataframe(df)
        st.markdown("#### 🔎 Basic Statistics")
        summary_stats = df.describe(include="all")
        st.write(summary_stats)

# -------------------------------
# Tab 2: Clean Data
# -------------------------------
with tabs[1]:
    st.subheader("🧼 Clean Your Dataset")
    if not df.empty:
        # Cleaning options forwarded unchanged to clean_dataset().
        remove_duplicates = st.checkbox("Remove Duplicate Rows", value=True)
        fill_missing = st.checkbox("Fill Missing Values", value=False)
        fill_value = st.text_input("Fill missing values with:", value="0")

        st.markdown("#### Optional: Rename Columns")
        # Maps each current column name to the name entered by the user.
        new_names = {}
        for col in df.columns:
            typed_name = st.text_input(f"Rename column '{col}'", value=col)
            # A blank entry keeps the current name instead of creating an
            # unnamed column.
            new_names[col] = typed_name if typed_name.strip() else col

        if st.button("Clean Dataset"):
            target_names = list(new_names.values())
            if len(set(target_names)) != len(target_names):
                # Two columns renamed to the same label would produce a frame
                # with duplicate column names, which the per-column code in
                # the other tabs cannot handle; refuse before saving.
                st.error("Column names must be unique. Please fix the renamed columns.")
            else:
                cleaned_df = clean_dataset(df, remove_duplicates, fill_missing, fill_value)
                cleaned_df = cleaned_df.rename(columns=new_names)
                # Persist the result over the currently selected dataset file.
                save_dataset(cleaned_df, dataset_path)
                st.success("✅ Dataset cleaned successfully!")
                st.dataframe(cleaned_df.head())
                # Let the tabs rendered after this one use the cleaned data.
                df = cleaned_df
    else:
        st.warning("No dataset available for cleaning.")

# -------------------------------
# Tab 3: Visualize Data (Fixed KeyError Issue)
# -------------------------------
with tabs[2]:
    st.subheader("📊 Visualize Your Data")

    if not df.empty:
        viz_type = st.selectbox("Select Visualization Type", ["Histogram", "Scatter", "Box Plot", "Heatmap", "Line Chart"])
        # Only numeric columns are offered for plotting.
        numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()

        if numeric_cols:
            col = st.selectbox("Select Column", numeric_cols)

            # Compare against None rather than testing truthiness: a valid
            # column label can be falsy (for example the integer label 0).
            if col is not None:
                fig, ax = plt.subplots()

                if viz_type == "Histogram":
                    ax.hist(df[col].dropna(), bins=20, color="skyblue", edgecolor="black")
                elif viz_type == "Box Plot":
                    sns.boxplot(x=df[col].dropna(), ax=ax)
                elif viz_type == "Scatter":
                    # Scatter uses its own axis pickers instead of `col`.
                    x_col = st.selectbox("X-axis", numeric_cols)
                    y_col = st.selectbox("Y-axis", numeric_cols)
                    if x_col is not None and y_col is not None:
                        ax.scatter(df[x_col], df[y_col], color="green")
                elif viz_type == "Heatmap":
                    # Correlation of all numeric columns; `col` is not used here.
                    corr = df[numeric_cols].corr()
                    sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
                elif viz_type == "Line Chart":
                    ax.plot(df.index, df[col], marker="o")

                st.pyplot(fig)
                # pyplot keeps every figure it creates registered until it is
                # closed; release it so figures do not pile up across reruns.
                plt.close(fig)
            else:
                st.warning("Please select a valid column.")
        else:
            st.warning("No numeric columns available for visualization.")
    else:
        st.warning("No dataset available for visualization.")

# -------------------------------
# Tab 4: Data Profiling
# -------------------------------
with tabs[3]:
    if not df.empty:

        # -------------------------------
        # 1. General Dataset Info
        # -------------------------------
        st.markdown("### 🛠️ General Information")
        st.write(f"✅ **Total Rows:** `{df.shape[0]}`")
        st.write(f"✅ **Total Columns:** `{df.shape[1]}`")
        st.write(f"✅ **Memory Usage:** `{df.memory_usage(deep=True).sum() / (1024 ** 2):.2f} MB`")
        st.write(f"✅ **Dataset Shape:** `{df.shape}`")

        # -------------------------------
        # 2. Dataset Quality Score
        # -------------------------------
        st.markdown("### 📊 Dataset Quality Score")
        score = compute_dataset_score(df)
        st.success(f"💯 Dataset Quality Score: `{score} / 100`")

        # -------------------------------
        # 3. Column Overview with Stats
        # -------------------------------
        st.markdown("### 🔥 Column Overview")

        # Numeric and categorical columns
        numeric_cols = df.select_dtypes(include=["number"]).columns
        categorical_cols = df.select_dtypes(include=["object"]).columns

        # Per-column aggregates, computed once and reused by the sections below.
        missing_per_column = df.isnull().sum()
        unique_per_column = df.nunique()

        profile = pd.DataFrame({
            "Column": df.columns,
            "Data Type": df.dtypes.values,
            "Missing Values": missing_per_column.values,
            "Missing %": (missing_per_column / len(df) * 100).values,
            "Unique Values": unique_per_column.values
        })

        # Add numeric statistics
        if len(numeric_cols) > 0:
            numeric_frame = df[numeric_cols]
            numeric_stats = pd.DataFrame({
                "Column": numeric_cols,
                "Min": numeric_frame.min().values,
                "Max": numeric_frame.max().values,
                "Mean": numeric_frame.mean().values,
                "Std Dev": numeric_frame.std().values,
                "Skewness": numeric_frame.skew().values,
                "Kurtosis": numeric_frame.kurt().values
            })

            # Left merge keeps every column; non-numeric rows get NaN stats.
            profile = profile.merge(numeric_stats, on="Column", how="left")

        st.dataframe(profile)

        # -------------------------------
        # 4. Missing Values Visualization
        # -------------------------------
        st.markdown("### 🔎 Missing Values Distribution")
        # Only columns that actually have missing entries are plotted.
        missing_values = missing_per_column[missing_per_column > 0]

        if not missing_values.empty:
            fig, ax = plt.subplots(figsize=(12, 5))
            sns.barplot(x=missing_values.index, y=missing_values.values, ax=ax, color="skyblue")
            ax.set_title("Missing Values per Column")
            ax.set_ylabel("Missing Count")
            # Rotate the existing tick labels without re-assigning them.
            ax.tick_params(axis="x", labelrotation=45)
            st.pyplot(fig)
            # Release the figure; pyplot keeps it registered until closed.
            plt.close(fig)
        else:
            st.success("No missing values found!")

        # -------------------------------
        # 5. Duplicates Detection
        # -------------------------------
        st.markdown("### 🔥 Duplicates & Constant Columns Detection")

        # Duplicates
        duplicate_count = df.duplicated().sum()
        st.write(f"🔁 **Duplicate Rows:** `{duplicate_count}`")

        # Constant Columns: exactly one distinct non-null value.
        constant_cols = [
            col for col, n_unique in zip(df.columns, unique_per_column.values)
            if n_unique == 1
        ]
        if constant_cols:
            st.write(f"🚩 **Constant Columns:** `{constant_cols}`")
        else:
            st.success("No constant columns detected!")

        # -------------------------------
        # 6. Cardinality Analysis
        # -------------------------------
        st.markdown("### 🧬 Cardinality Analysis")

        # A column is high-cardinality when more than 80% of the rows hold
        # distinct values.
        cardinality_threshold = len(df) * 0.8
        high_cardinality = [
            col for col, n_unique in zip(df.columns, unique_per_column.values)
            if n_unique > cardinality_threshold
        ]
        if high_cardinality:
            st.write(f"🔢 **High-Cardinality Columns:** `{high_cardinality}`")
        else:
            st.success("No high-cardinality columns detected!")

        # -------------------------------
        # 7. Top Frequent & Rare Values
        # -------------------------------
        st.markdown("### 🎯 Frequent & Rare Values")

        for col in categorical_cols:
            st.write(f"✅ **{col}**")

            # Count once, then take both ends of the distribution.
            counts = df[col].value_counts()
            top_values = counts.nlargest(5)
            rare_values = counts.nsmallest(5)

            st.write("📊 **Top Frequent Values:**")
            st.dataframe(top_values)

            st.write("🧪 **Rare Values:**")
            st.dataframe(rare_values)

        # -------------------------------
        # 8. Correlation Matrix
        # -------------------------------
        st.markdown("### 📊 Correlation Matrix")

        if len(numeric_cols) > 1:
            corr = df[numeric_cols].corr()

            fig, ax = plt.subplots(figsize=(12, 8))
            sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True, ax=ax)
            st.pyplot(fig)
            plt.close(fig)
        else:
            st.info("Not enough numeric columns for correlation analysis.")

        # -------------------------------
        # 9. Pair Plot (Numerical Relationships)
        # -------------------------------
        st.markdown("### 🔥 Pair Plot (Numerical Relationships)")

        if len(numeric_cols) >= 2:
            pairplot = sns.pairplot(df[numeric_cols], diag_kind='kde')
            # Prefer the `figure` attribute; fall back to the older `fig`
            # accessor when the installed seaborn does not provide it.
            pair_fig = getattr(pairplot, "figure", None)
            if pair_fig is None:
                pair_fig = pairplot.fig
            st.pyplot(pair_fig)
            plt.close(pair_fig)
        else:
            st.info("Not enough numeric columns for pair plot visualization.")

        # -------------------------------
        # 10. Outlier Detection
        # -------------------------------
        st.markdown("### 🚩 Outlier Detection")

        # NOTE(review): the code below treats the result as a mapping of
        # column -> count (it calls .items()); confirm against utils.detect_outliers.
        outliers = detect_outliers(df)
        if outliers:
            st.write("✅ **Outliers Detected:**")
            st.dataframe(pd.DataFrame(outliers.items(), columns=["Column", "Outlier Count"]))
        else:
            st.success("No significant outliers detected!")

        # -------------------------------
        # 11. Inconsistent Data Types
        # -------------------------------
        st.markdown("### 🚫 Inconsistent Data Types")

        inconsistent_types = detect_inconsistent_types(df)
        if inconsistent_types:
            st.write("⚠️ **Inconsistent Data Types Detected:**")
            st.write(inconsistent_types)
        else:
            st.success("No inconsistent data types detected!")

    else:
        st.warning("No dataset available for profiling.")

# -------------------------------
# Tab 5: Outlier Detection
# -------------------------------
with tabs[4]:
    st.subheader("🚀 Outlier Detection")
    if df.empty:
        st.warning("No dataset available for outlier detection.")
    else:
        # Display whatever detect_outliers reports for the current dataset.
        outliers = detect_outliers(df)
        st.write(outliers)

# -------------------------------
# Tab 7: Export
# -------------------------------
# "Export" is the last entry of the tab list (index 6). Index 5 is
# "Custom Transformations", which has no content in this script.
with tabs[6]:
    st.subheader("📤 Export Dataset")
    export_format = st.selectbox("Export Format", ["CSV", "Excel", "JSON"])
    if not df.empty:
        # Serialize in the format the user picked, with a matching file
        # extension and MIME type for the download.
        if export_format == "CSV":
            export_data = df.to_csv(index=False)
            export_name = "dataset.csv"
            export_mime = "text/csv"
        elif export_format == "JSON":
            # A list of row records is the shape the upload path feeds to
            # pd.json_normalize, so exported files can be re-uploaded.
            export_data = df.to_json(orient="records")
            export_name = "dataset.json"
            export_mime = "application/json"
        else:
            # Excel is a binary format: write the workbook into memory and
            # hand the raw bytes to the download button.
            excel_buffer = io.BytesIO()
            df.to_excel(excel_buffer, index=False)
            export_data = excel_buffer.getvalue()
            export_name = "dataset.xlsx"
            export_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

        st.download_button("Download", export_data, file_name=export_name, mime=export_mime)
    else:
        st.warning("No dataset available for export.")