Spaces:
Build error
Build error
File size: 13,370 Bytes
2bdd84f e37cfd0 2bdd84f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 |
import io
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st

from utils import (
    load_dataset,
    save_dataset,
    clean_dataset,
    compute_dataset_score,
    detect_outliers,
    apply_transformation,
    list_datasets,
    detect_inconsistent_types,
)
# -------------------------------
# Constants & Setup
# -------------------------------
# Folder where every dataset (pre-existing or uploaded) is stored and loaded from.
DATASET_DIR = "datasets"
# NOTE(review): DEFAULT_DATASET is not referenced anywhere in this file — confirm before removing.
DEFAULT_DATASET = "train_data.csv"
os.makedirs(DATASET_DIR, exist_ok=True)  # Ensure directory exists
# -------------------------------
# Sidebar: Dataset Selection
# -------------------------------
# Lets the user either pick a dataset already present in DATASET_DIR or
# upload a new one (which is persisted so it appears in the list next run).
st.sidebar.header("π Dataset Selection")

# List available datasets from the datasets folder
available_datasets = list_datasets(DATASET_DIR)
dataset_choice = st.sidebar.radio("Choose Dataset Source:", ["Select Existing Dataset", "Upload New Dataset"])

# Path of the dataset to load below; stays None until one is chosen/uploaded.
dataset_path = None

if dataset_choice == "Select Existing Dataset":
    if available_datasets:
        selected_dataset = st.sidebar.selectbox("Select Dataset:", available_datasets)
        dataset_path = os.path.join(DATASET_DIR, selected_dataset)
        st.sidebar.success(f"Using `{selected_dataset}` dataset.")
    else:
        st.sidebar.warning("No datasets found. Please upload a new dataset.")
elif dataset_choice == "Upload New Dataset":
    uploaded_file = st.sidebar.file_uploader("Upload Dataset (CSV, JSON, or Excel)", type=["csv", "json", "xlsx"])
    if uploaded_file:
        # Dispatch the parser on the extension; the uploader widget already
        # restricts selectable files to csv/json/xlsx.
        file_ext = uploaded_file.name.split('.')[-1].lower()
        try:
            if file_ext == "csv":
                new_df = pd.read_csv(uploaded_file)
            elif file_ext == "json":
                # Fix: `json` was used without being imported (NameError on
                # any JSON upload); the import now lives at the top of file.
                new_df = pd.json_normalize(json.load(uploaded_file))
            elif file_ext == "xlsx":
                new_df = pd.read_excel(uploaded_file)
            else:
                st.error("Unsupported file format.")
                st.stop()
        except Exception as e:
            st.error(f"Error reading file: {e}")
            st.stop()
        # Save the new dataset with its filename so it becomes selectable
        # as an "existing dataset" on subsequent runs.
        dataset_path = os.path.join(DATASET_DIR, uploaded_file.name)
        save_dataset(new_df, dataset_path)
        st.sidebar.success(f"Dataset `{uploaded_file.name}` uploaded successfully!")
        available_datasets = list_datasets(DATASET_DIR)  # Refresh list
    else:
        st.sidebar.warning("Please upload a dataset.")
# -------------------------------
# Load the Selected Dataset
# -------------------------------
# Default to an empty frame so every tab below can rely on `df` existing.
df = pd.DataFrame()
if dataset_path:
    df = load_dataset(dataset_path)
    if df.empty:
        st.warning("Dataset is empty or failed to load.")
else:
    st.warning("No dataset selected. Please choose or upload a dataset.")
# -------------------------------
# Main App Title & Description
# -------------------------------
st.title("π The Data Hub")

# -------------------------------
# Tabs for Operations
# -------------------------------
# One tab per operation; indices below (tabs[0]..tabs[6]) follow this order.
_tab_labels = (
    "View & Summary",
    "Clean Data",
    "Visualize Data",
    "Data Profiling",
    "Outlier Detection",
    "Custom Transformations",
    "Export",
)
tabs = st.tabs(list(_tab_labels))
# -------------------------------
# Tab 1: View & Summary
# -------------------------------
# Shows the raw table plus describe() statistics for every column.
with tabs[0]:
    st.subheader("π Current Dataset Preview")
    if df.empty:
        st.warning("No dataset available. Please choose or upload a dataset.")
    else:
        st.dataframe(df)
        st.markdown("#### π Basic Statistics")
        st.write(df.describe(include="all"))
# -------------------------------
# Tab 2: Clean Data
# -------------------------------
# Dedup / missing-value fill / column renames; the cleaned frame is written
# back to dataset_path so the change persists across reruns.
with tabs[1]:
    st.subheader("π§Ό Clean Your Dataset")
    if not df.empty:
        remove_duplicates = st.checkbox("Remove Duplicate Rows", value=True)
        fill_missing = st.checkbox("Fill Missing Values", value=False)
        fill_value = st.text_input("Fill missing values with:", value="0")
        st.markdown("#### Optional: Rename Columns")
        # One text input per column; unchanged entries map a column to itself.
        new_names = {}
        for col in df.columns:
            new_names[col] = st.text_input(f"Rename column '{col}'", value=col)
        if st.button("Clean Dataset"):
            cleaned_df = clean_dataset(df, remove_duplicates, fill_missing, fill_value)
            cleaned_df = cleaned_df.rename(columns=new_names)
            save_dataset(cleaned_df, dataset_path)
            # Fix: this message was a string literal broken across two
            # physical lines (SyntaxError); rejoined into one literal.
            st.success("Dataset cleaned successfully!")
            st.dataframe(cleaned_df.head())
            df = cleaned_df
    else:
        st.warning("No dataset available for cleaning.")
# -------------------------------
# Tab 3: Visualize Data
# -------------------------------
# Renders one matplotlib/seaborn figure chosen by the user; scatter adds
# its own X/Y pickers, heatmap ignores the single-column picker.
with tabs[2]:
    st.subheader("π Visualize Your Data")
    if df.empty:
        st.warning("No dataset available for visualization.")
    else:
        viz_type = st.selectbox("Select Visualization Type", ["Histogram", "Scatter", "Box Plot", "Heatmap", "Line Chart"])
        numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
        if not numeric_cols:
            st.warning("No numeric columns available for visualization.")
        else:
            col = st.selectbox("Select Column", numeric_cols)
            if not col:
                st.warning("Please select a valid column.")
            else:
                figure, axis = plt.subplots()
                if viz_type == "Histogram":
                    axis.hist(df[col].dropna(), bins=20, color="skyblue", edgecolor="black")
                elif viz_type == "Box Plot":
                    sns.boxplot(x=df[col].dropna(), ax=axis)
                elif viz_type == "Scatter":
                    x_col = st.selectbox("X-axis", numeric_cols)
                    y_col = st.selectbox("Y-axis", numeric_cols)
                    if x_col and y_col:
                        axis.scatter(df[x_col], df[y_col], color="green")
                elif viz_type == "Heatmap":
                    corr = df[numeric_cols].corr()
                    sns.heatmap(corr, annot=True, cmap="coolwarm", ax=axis)
                elif viz_type == "Line Chart":
                    axis.plot(df.index, df[col], marker="o")
                st.pyplot(figure)
# -------------------------------
# Tab 4: Data Profiling
# -------------------------------
# Eleven read-only profiling sections: size/memory, quality score, per-column
# stats, missing values, duplicates, cardinality, value frequencies,
# correlations, pair plot, outliers, and type consistency.
with tabs[3]:
    if not df.empty:
        # -------------------------------
        # 1. General Dataset Info
        # -------------------------------
        st.markdown("### π οΈ General Information")
        # Fix: these four f-strings were each broken across two physical
        # lines (a SyntaxError); rejoined into single literals.
        st.write(f"**Total Rows:** `{df.shape[0]}`")
        st.write(f"**Total Columns:** `{df.shape[1]}`")
        st.write(f"**Memory Usage:** `{df.memory_usage(deep=True).sum() / (1024 ** 2):.2f} MB`")
        st.write(f"**Dataset Shape:** `{df.shape}`")

        # -------------------------------
        # 2. Dataset Quality Score
        # -------------------------------
        st.markdown("### π Dataset Quality Score")
        score = compute_dataset_score(df)
        st.success(f"π― Dataset Quality Score: `{score} / 100`")

        # -------------------------------
        # 3. Column Overview with Stats
        # -------------------------------
        st.markdown("### π₯ Column Overview")
        numeric_cols = df.select_dtypes(include=["number"]).columns
        categorical_cols = df.select_dtypes(include=["object"]).columns
        profile = pd.DataFrame({
            "Column": df.columns,
            "Data Type": df.dtypes.values,
            "Missing Values": df.isnull().sum().values,
            "Missing %": (df.isnull().sum() / len(df) * 100).values,
            "Unique Values": df.nunique().values
        })
        # Attach per-column numeric statistics when numeric columns exist;
        # non-numeric rows get NaN via the left merge.
        if len(numeric_cols) > 0:
            numeric_stats = pd.DataFrame({
                "Column": numeric_cols,
                "Min": df[numeric_cols].min().values,
                "Max": df[numeric_cols].max().values,
                "Mean": df[numeric_cols].mean().values,
                "Std Dev": df[numeric_cols].std().values,
                "Skewness": df[numeric_cols].skew().values,
                "Kurtosis": df[numeric_cols].kurt().values
            })
            profile = profile.merge(numeric_stats, on="Column", how="left")
        st.dataframe(profile)

        # -------------------------------
        # 4. Missing Values Visualization
        # -------------------------------
        st.markdown("### π Missing Values Distribution")
        missing_values = df.isnull().sum()
        missing_values = missing_values[missing_values > 0]
        if not missing_values.empty:
            fig, ax = plt.subplots(figsize=(12, 5))
            sns.barplot(x=missing_values.index, y=missing_values.values, ax=ax, color="skyblue")
            ax.set_title("Missing Values per Column")
            ax.set_ylabel("Missing Count")
            # Fix: set_xticklabels(get_xticklabels()) warns on recent
            # matplotlib; tick_params rotates labels without the warning.
            ax.tick_params(axis="x", rotation=45)
            st.pyplot(fig)
        else:
            st.success("No missing values found!")

        # -------------------------------
        # 5. Duplicates Detection
        # -------------------------------
        st.markdown("### π₯ Duplicates & Constant Columns Detection")
        duplicate_count = df.duplicated().sum()
        st.write(f"π **Duplicate Rows:** `{duplicate_count}`")
        # Constant columns carry no information (a single distinct value).
        constant_cols = [col for col in df.columns if df[col].nunique() == 1]
        if constant_cols:
            st.write(f"π© **Constant Columns:** `{constant_cols}`")
        else:
            st.success("No constant columns detected!")

        # -------------------------------
        # 6. Cardinality Analysis
        # -------------------------------
        st.markdown("### 𧬠Cardinality Analysis")
        # "High cardinality" = more than 80% of rows hold a distinct value.
        high_cardinality = [col for col in df.columns if df[col].nunique() > len(df) * 0.8]
        if high_cardinality:
            st.write(f"π’ **High-Cardinality Columns:** `{high_cardinality}`")
        else:
            st.success("No high-cardinality columns detected!")

        # -------------------------------
        # 7. Top Frequent & Rare Values
        # -------------------------------
        st.markdown("### π― Frequent & Rare Values")
        for col in categorical_cols:
            # Fix: broken two-line f-string rejoined.
            st.write(f"**{col}**")
            top_values = df[col].value_counts().nlargest(5)
            rare_values = df[col].value_counts().nsmallest(5)
            st.write("π **Top Frequent Values:**")
            st.dataframe(top_values)
            st.write("π§ͺ **Rare Values:**")
            st.dataframe(rare_values)

        # -------------------------------
        # 8. Correlation Matrix
        # -------------------------------
        st.markdown("### π Correlation Matrix")
        if len(numeric_cols) > 1:
            corr = df[numeric_cols].corr()
            fig, ax = plt.subplots(figsize=(12, 8))
            sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True, ax=ax)
            st.pyplot(fig)
        else:
            st.info("Not enough numeric columns for correlation analysis.")

        # -------------------------------
        # 9. Pair Plot (Numerical Relationships)
        # -------------------------------
        st.markdown("### π₯ Pair Plot (Numerical Relationships)")
        if len(numeric_cols) >= 2:
            pairplot = sns.pairplot(df[numeric_cols], diag_kind='kde')
            st.pyplot(pairplot.fig)
        else:
            st.info("Not enough numeric columns for pair plot visualization.")

        # -------------------------------
        # 10. Outlier Detection
        # -------------------------------
        st.markdown("### π© Outlier Detection")
        # detect_outliers presumably returns a {column: count} mapping —
        # .items() below relies on that; confirm against utils.
        outliers = detect_outliers(df)
        if outliers:
            # Fix: broken two-line string literal rejoined.
            st.write("**Outliers Detected:**")
            st.dataframe(pd.DataFrame(outliers.items(), columns=["Column", "Outlier Count"]))
        else:
            st.success("No significant outliers detected!")

        # -------------------------------
        # 11. Inconsistent Data Types
        # -------------------------------
        st.markdown("### π« Inconsistent Data Types")
        inconsistent_types = detect_inconsistent_types(df)
        if inconsistent_types:
            st.write("β οΈ **Inconsistent Data Types Detected:**")
            st.write(inconsistent_types)
        else:
            st.success("No inconsistent data types detected!")
    else:
        st.warning("No dataset available for profiling.")
# -------------------------------
# Tab 5: Outlier Detection
# -------------------------------
# Raw dump of whatever detect_outliers() reports for the current frame.
with tabs[4]:
    st.subheader("π Outlier Detection")
    if df.empty:
        st.warning("No dataset available for outlier detection.")
    else:
        st.write(detect_outliers(df))
# -------------------------------
# Tab 7: Export
# -------------------------------
# Fix 1: this section used tabs[5], which is the "Custom Transformations"
# tab; the "Export" label is index 6 in the st.tabs() list above.
# Fix 2: the selected format was ignored — the button always served CSV
# bytes under a .csv/.xlsx/.json name; now each format is serialized
# properly before download.
with tabs[6]:
    st.subheader("π€ Export Dataset")
    export_format = st.selectbox("Export Format", ["CSV", "Excel", "JSON"])
    if not df.empty:
        if export_format == "CSV":
            payload = df.to_csv(index=False)
            file_name = "dataset.csv"
        elif export_format == "JSON":
            payload = df.to_json(orient="records")
            file_name = "dataset.json"
        else:  # Excel needs a binary buffer
            buffer = io.BytesIO()
            df.to_excel(buffer, index=False)
            payload = buffer.getvalue()
            file_name = "dataset.xlsx"
        st.download_button("Download", payload, file_name)
    else:
        st.warning("No dataset available for export.")
|