1. separate attributes and data
2. remove the datatypes from the attributes
C:\Users\Niall Dcunha\DatasetCreator\house price prediction\21754539_dataset
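A minimal sketch of steps 1-2 above, assuming (this format is a guess, not confirmed by the files) a raw dump whose first lines declare attributes as "name: dtype" and whose remaining lines are data rows:

def split_attributes_and_data(lines):
    # Attribute declarations are assumed to look like "price: float64";
    # once the first non-declaration line appears, everything after it
    # is treated as data.
    attributes, data = [], []
    for line in lines:
        if not data and ":" in line:
            name, _dtype = line.split(":", 1)  # step 2: drop the datatype
            attributes.append(name.strip())
        else:
            data.append(line.rstrip("\n"))
    return attributes, data
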
# ---- Earlier version (kept for reference; superseded by the script below).
# ---- It merges CSVs using only an OpenAI-generated column mapping, with no
# ---- fuzzy-matching fallback.
# import os
# import glob
# import pandas as pd
# import openai
# from openai import OpenAI
# from dotenv import load_dotenv
# import ast
# import re
#
# def extract_dict_from_response(response: str) -> dict:
#     # Try extracting code block content containing the dictionary
#     match = re.search(r"```(?:python)?\s*(\{.*?\})\s*```", response, re.DOTALL)
#     if match:
#         mapping_str = match.group(1)
#     else:
#         # Try extracting the dictionary directly if it's not in a code block
#         match = re.search(r"(\{.*\})", response, re.DOTALL)
#         if not match:
#             raise ValueError("❌ Could not find a Python dictionary in the response.")
#         mapping_str = match.group(1)
#     try:
#         return ast.literal_eval(mapping_str)
#     except Exception as e:
#         print("⚠️ Failed to evaluate extracted dictionary string.")
#         print("String:", mapping_str)
#         raise e
#
# # Load environment variables
# load_dotenv()
# client = OpenAI(
#     api_key=os.getenv("OPENAI_API_KEY"),
#     base_url=os.getenv("OPENAI_API_BASE")  # Optional: for Azure or self-hosted
# )
#
# def load_csv_files(folder_path):
#     csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
#     dataframes = []
#     column_sets = []
#     valid_paths = []
#     print("📥 Reading CSV files...")
#     for file in csv_files:
#         try:
#             df = pd.read_csv(file)
#             dataframes.append(df)
#             column_sets.append(list(df.columns))
#             valid_paths.append(file)
#             print(f"✅ Loaded: {os.path.basename(file)}")
#         except pd.errors.ParserError as e:
#             print(f"❌ Skipping file due to parsing error: {os.path.basename(file)}")
#             print(f"   ↳ {e}")
#         except Exception as e:
#             print(f"⚠️ Unexpected error with file {os.path.basename(file)}: {e}")
#     return dataframes, column_sets, valid_paths
#
# def generate_mapping_prompt(column_sets):
#     prompt = (
#         "You are a data scientist helping to merge multiple ML prediction datasets. "
#         "Each CSV may have different or similar column names. I need a unified mapping to standardize these datasets. "
#         "Also, please identify likely prediction label columns (e.g., price, quality, outcome).\n\n"
#         "Here are the column headers from each CSV:\n"
#     )
#     for i, columns in enumerate(column_sets):
#         prompt += f"CSV {i+1}: {columns}\n"
#     prompt += (
#         "\nPlease provide:\n"
#         "1. A Python dictionary mapping similar columns across these CSVs.\n"
#         "2. A list of columns most likely to represent prediction labels.\n\n"
#         "Format your response as:\n"
#         "```python\n"
#         "column_mapping = { ... }\n"
#         "label_columns = [ ... ]\n"
#         "```"
#     )
#     return prompt
#
# def get_column_mapping_from_openai(column_sets):
#     prompt = generate_mapping_prompt(column_sets)
#     response = client.chat.completions.create(
#         model="gpt-4",
#         messages=[
#             {"role": "system", "content": "You are a helpful data scientist."},
#             {"role": "user", "content": prompt}
#         ],
#         temperature=0.3
#     )
#     content = response.choices[0].message.content
#     print("\n📩 Received response from OpenAI.")
#     try:
#         # Try parsing both the dictionary and the label list from the response
#         column_mapping_match = re.search(r"column_mapping\s*=\s*(\{.*?\})", content, re.DOTALL)
#         label_columns_match = re.search(r"label_columns\s*=\s*(\[.*?\])", content, re.DOTALL)
#         if column_mapping_match:
#             mapping = ast.literal_eval(column_mapping_match.group(1))
#         else:
#             raise ValueError("❌ Could not find `column_mapping` in the response.")
#         if label_columns_match:
#             label_columns = ast.literal_eval(label_columns_match.group(1))
#         else:
#             label_columns = []
#     except Exception as e:
#         print("⚠️ Error parsing OpenAI response:")
#         print(content)
#         raise e
#     return mapping, label_columns
#
# def standardize_columns(df, mapping):
#     new_columns = {col: mapping.get(col, col) for col in df.columns}
#     return df.rename(columns=new_columns)
#
# def merge_csvs(folder_path, output_file="merged_dataset.csv"):
#     dfs, column_sets, csv_paths = load_csv_files(folder_path)
#     if not dfs:
#         print("❌ No valid CSVs found to merge.")
#         return
#     print("\n🧠 Requesting column mapping from OpenAI...")
#     mapping, label_columns = get_column_mapping_from_openai(column_sets)
#     print("\n📌 Column Mapping:")
#     for k, v in mapping.items():
#         print(f"  '{k}' -> '{v}'")
#     print("\n🏷️ Suggested Label Columns:")
#     for label in label_columns:
#         print(f"  - {label}")
#     standardized_dfs = [standardize_columns(df, mapping) for df in dfs]
#     merged_df = pd.concat(standardized_dfs, ignore_index=True, sort=False)
#     merged_df.to_csv(output_file, index=False)
#     print(f"\n✅ Merged dataset saved as '{output_file}'")
#
# if __name__ == "__main__":
#     folder_path = "house"
import os
import glob
import pandas as pd
import ast
import re
from itertools import combinations
from rapidfuzz import fuzz, process
from dotenv import load_dotenv
from openai import OpenAI
# Manual rename map to standardize some known column-name variations
manual_rename_map = {
    "review": "text",
    "text": "text",
    "NumBedrooms": "bedrooms",
    "HousePrice": "price",
    "TARGET(PRICE_IN_LACS)": "price",
    "SquareFootage": "area",
    "SQUARE_FT": "area",
    "sentiment": "label",
    "target": "label",
    "type": "label",
    "variety": "label",
    "class": "label",
    "HeartDisease": "label",
    "Heart Attack Risk (Binary)": "label",
    "Heart Attack Risk": "label"
}

def normalize(col):
    # Lowercase and strip non-alphanumeric characters so column names
    # like "House Price" and "house_price" compare equal.
    return re.sub(r'[^a-z0-9]', '', col.lower())
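# For example (illustrative):
#     normalize("Heart Attack Risk (Binary)")  ->  "heartattackriskbinary"
#     normalize("SQUARE_FT")                   ->  "squareft"
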
def apply_manual_renaming(df, rename_map):
    # Rename only the columns that appear in the manual map; leave the rest untouched.
    renamed = {}
    for col in df.columns:
        if col in rename_map:
            renamed[col] = rename_map[col]
    return df.rename(columns=renamed)
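# Example with a hypothetical frame:
#     df = pd.DataFrame({"NumBedrooms": [3], "HousePrice": [450000]})
#     list(apply_manual_renaming(df, manual_rename_map).columns)
#     ->  ['bedrooms', 'price']
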
def get_fuzzy_common_columns(cols_list, threshold=75):
    # Treat the first column set as the reference; keep a column only if every
    # other set contains a fuzzy match scoring at least `threshold`.
    base = cols_list[0]
    common = set()
    for col in base:
        match_all = True
        for other in cols_list[1:]:
            match, score, _ = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)
            if score < threshold:
                match_all = False
                break
        if match_all:
            common.add(col)
    return common
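# Rough behaviour of the scorer (rapidfuzz; scores are approximate):
#     fuzz.token_sort_ratio("price house", "house price")  ->  100.0 (word order is ignored)
#     fuzz.token_sort_ratio("bedrooms", "bedroom")         ->  ~93, passes the default threshold of 75
#     fuzz.token_sort_ratio("area", "sentiment")           ->  well below 75, rejected
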
def sortFiles(dfs):
    # Despite the name, this deduplicates: it drops frames that are exact
    # copies (same shape, identical contents) of one already seen. It also
    # returns the kept indices so callers can filter parallel lists (e.g. file paths).
    unique_dfs = []
    seen = []
    for i, df1 in enumerate(dfs):
        duplicate = False
        for j in seen:
            df2 = dfs[j]
            if df1.shape != df2.shape:
                continue
            if df1.reset_index(drop=True).equals(df2.reset_index(drop=True)):
                duplicate = True
                break
        if not duplicate:
            unique_dfs.append(df1)
            seen.append(i)
    return unique_dfs, seen
def load_csv_files(folder_path):
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    dfs = []
    column_sets = []
    paths = []
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            dfs.append(df)
            column_sets.append(list(df.columns))
            paths.append(file)
            print(f"✅ Loaded: {os.path.basename(file)}")
        except Exception as e:
            print(f"❌ Failed to load {file}: {e}")
    return dfs, column_sets, paths
def generate_mapping_prompt(column_sets):
    prompt = (
        "You are a data scientist helping to merge multiple machine learning prediction datasets. "
        "Each CSV file may have different column names, even if they represent similar types of data. "
        "Your task is to identify and map these similar columns across datasets to a common, unified name. "
        "Columns with clearly similar features (e.g., 'Bedrooms' and 'BedroomsAbvGr') should be merged into one column with a relevant name like 'bedrooms'.\n\n"
        "Avoid keeping redundant or unique columns that do not have any logical counterpart in other datasets unless they are essential. "
        "The goal is not to maximize the number of columns or rows, but to create a clean, consistent dataset for training ML models.\n\n"
        "Examples:\n"
        "- Dataset1: 'Locality' -> Mumbai, Delhi\n"
        "- Dataset2: 'Places' -> Goa, Singapore\n"
        "→ Merge both into a common column like 'location'.\n\n"
        "Please also identify likely label or target columns that are typically used for prediction (e.g., price, sentiment, outcome, quality).\n\n"
    )
    for i, cols in enumerate(column_sets):
        prompt += f"CSV {i+1}: {cols}\n"
    prompt += "\nPlease return:\n```python\ncolumn_mapping = { ... }\nlabel_columns = [ ... ]\n```"
    return prompt
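# The reply is expected to carry a fenced block of this shape (values here are hypothetical):
#     ```python
#     column_mapping = {"HousePrice": "price", "TARGET(PRICE_IN_LACS)": "price", "SQUARE_FT": "area"}
#     label_columns = ["price"]
#     ```
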
def get_column_mapping_from_openai(column_sets):
    load_dotenv()
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        base_url=os.getenv("OPENAI_API_BASE") or None  # fall back to the default endpoint when unset
    )
    prompt = generate_mapping_prompt(column_sets)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful data scientist."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3
    )
    content = response.choices[0].message.content
    try:
        # Pull the dictionary and the label list out of the model's reply.
        column_mapping_match = re.search(r"column_mapping\s*=\s*(\{.*?\})", content, re.DOTALL)
        label_columns_match = re.search(r"label_columns\s*=\s*(\[.*?\])", content, re.DOTALL)
        column_mapping = ast.literal_eval(column_mapping_match.group(1)) if column_mapping_match else {}
        label_columns = ast.literal_eval(label_columns_match.group(1)) if label_columns_match else []
    except Exception as e:
        print("⚠️ Error parsing OpenAI response:")
        print(content)
        raise e
    return column_mapping, label_columns
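# Credentials come from a local .env file read by load_dotenv(); expected entries:
#     OPENAI_API_KEY=...
#     OPENAI_API_BASE=...   # optional, e.g. for Azure or a self-hosted gateway
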
def clean_and_merge(folder, query=None, use_ai=True):
    os.makedirs("./final", exist_ok=True)
    dfs, column_sets, csv_paths = load_csv_files(folder)
    if not dfs:
        print("No valid CSVs found.")
        return
    dfs, kept = sortFiles(dfs)
    csv_paths = [csv_paths[i] for i in kept]  # keep paths aligned with the de-duplicated frames
    dfs = [apply_manual_renaming(df, manual_rename_map) for df in dfs]
    if use_ai:
        try:
            column_mapping, label_columns = get_column_mapping_from_openai(column_sets)
            dfs = [df.rename(columns={col: column_mapping.get(col, col) for col in df.columns}) for df in dfs]
        except Exception as e:
            print("Falling back to fuzzy matching due to OpenAI error:", e)
            use_ai = False
    # Label assignment fallback: any frame still missing a 'label' column gets one
    # derived from its file name, with the query words stripped out. This runs
    # before the merge so the labels survive into the merged output (the fuzzy
    # path below still keeps only the columns common to the chosen frames).
    for i, df in enumerate(dfs):
        if 'label' not in df.columns:
            name = os.path.basename(csv_paths[i]).split(".")[0].lower()
            name_cleaned = name
            if query:
                words = set(re.sub(r'[^a-z]', ' ', query.lower()).split())
                for word in words:
                    name_cleaned = name_cleaned.replace(word, "")
            df['label'] = name_cleaned
    if not use_ai:
        # Normalize column names for the fuzzy-match fallback
        normalized_cols = []
        for df in dfs:
            normalized_cols.append({normalize(col) for col in df.columns})
        # Search every combination of files for the largest fuzzy-common column set
        max_common = set()
        best_combo = []
        for i in range(2, len(dfs) + 1):
            for combo in combinations(range(len(dfs)), i):
                selected = [normalized_cols[j] for j in combo]
                fuzzy_common = get_fuzzy_common_columns(selected)
                if len(fuzzy_common) >= len(max_common):
                    max_common = fuzzy_common
                    best_combo = combo
        # Harmonize and align the chosen frames on the common columns
        aligned_dfs = []
        for idx in best_combo:
            df = dfs[idx]
            col_map = {}
            for std_col in max_common:
                match, _, _ = process.extractOne(std_col, [normalize(col) for col in df.columns])
                for col in df.columns:
                    if normalize(col) == match:
                        col_map[col] = std_col
                        break
            df_subset = df[list(col_map.keys())].rename(columns=col_map)
            aligned_dfs.append(df_subset)
        # Guard against the degenerate case of fewer than two frames to merge
        combined_df = pd.concat(aligned_dfs, ignore_index=True) if aligned_dfs else dfs[0]
    else:
        combined_df = pd.concat(dfs, ignore_index=True)
    # Decide which result to keep: prefer the merged file unless the merge looks
    # poor, i.e. a single source file has more rows (and is not trivially narrow),
    # or the merge lost several columns from an already narrow largest file.
    largest_df = max(dfs, key=lambda df: len(df))
    flag = False
    if len(largest_df) > len(combined_df) and len(largest_df.columns) > 2:
        flag = True
    elif len(combined_df) > len(largest_df) and (len(largest_df.columns) - len(combined_df.columns)) > 3 and len(largest_df.columns) < 7:
        flag = True
    output_file = f"./final/{query or os.path.basename(folder)}.csv"
    if flag:
        largest_df.to_csv(output_file, index=False)
        print(f"⚠️ Saved fallback single file due to poor merge: {output_file}")
    else:
        combined_df.to_csv(output_file, index=False)
        print(f"✅ Saved merged file: {output_file}")

# Example usage:
clean_and_merge("house", query="house", use_ai=True)
# merge_csvs(folder_path)  # entry point of the earlier, commented-out version above