Maaz
Upload 27 files
7166038 verified
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os
import pickle
class DataPreprocessor:
def __init__(self):
self.scaler = StandardScaler()
def load_data(self, filepath):
"""Load and return the dataset"""
if not os.path.exists(filepath):
raise FileNotFoundError(f"The file at {filepath} does not exist.")
df = pd.read_csv(filepath)
print("Data loaded successfully.")
return df
def preprocess_data(self, df):
"""Preprocess the data by handling missing values"""
# Handle missing values (zeros)
features_to_process = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']
for feature in features_to_process:
mean_value = df[feature].replace(0, np.nan).mean()
df[feature] = df[feature].replace(0, mean_value)
print("Missing values handled.")
return df
def split_data(self, df):
"""Split data into features and target"""
features = df.drop('Outcome', axis=1)
target = df['Outcome']
return features, target
def scale_features(self, features, is_training=False):
"""Scale features using StandardScaler"""
if is_training:
scaled_features = self.scaler.fit_transform(features)
# Save the scaler for future use
model_dir = "src/models"
os.makedirs(model_dir, exist_ok=True)
# Save the scaler as a pickle file
with open(f"{model_dir}/scaler.pkl", 'wb') as f:
pickle.dump(self.scaler, f)
# Save the scaled data as a CSV file
scaled_df = pd.DataFrame(scaled_features, columns=features.columns)
scaled_df['Outcome'] = df['Outcome'] # Add the Outcome column back
scaled_csv_path = "data/scaled_data.csv"
scaled_df.to_csv(scaled_csv_path, index=False)
print("Scaled data saved as csv file.")
else:
scaled_features = self.scaler.transform(features)
return scaled_features
if __name__ == "__main__":
preprocessor = DataPreprocessor()
# Load and preprocess data
df = preprocessor.load_data("data/preprocessed_data.csv")
df = preprocessor.preprocess_data(df)
# Split data into features and target
features, target = preprocessor.split_data(df)
# Scale features (Training phase)
scaled_features = preprocessor.scale_features(features, is_training=True)
print("Data preprocessing completed.")