1. Separate attributes and data
2. Remove the data types from the attributes

C:\Users\Niall Dcunha\DatasetCreator\house price prediction\21754539_dataset
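
One way to read these two notes, as a minimal pandas sketch. The file name, the "price" label column, and the "name:type" header convention are illustrative assumptions, not details taken from the dataset above:

import pandas as pd

df = pd.read_csv("21754539_dataset.csv")  # hypothetical file name
# 2. strip a trailing ":type" annotation from each attribute name, if present
df.columns = [c.split(":")[0].strip() for c in df.columns]
# 1. separate the attributes (features) from the value to predict
y = df["price"]                 # assumed label column
X = df.drop(columns=["price"])  # attribute matrix
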
# import os
# import glob
# import pandas as pd
# import openai
# from openai import OpenAI
# from dotenv import load_dotenv
# import ast
# import re


# def extract_dict_from_response(response: str) -> dict:
#     # Try extracting code block content containing the dictionary
#     match = re.search(r"```(?:python)?\s*(\{.*?\})\s*```", response, re.DOTALL)
#     if match:
#         mapping_str = match.group(1)
#     else:
#         # Try extracting the dictionary directly if it's not in a code block
#         match = re.search(r"(\{.*\})", response, re.DOTALL)
#         if not match:
#             raise ValueError("❌ Could not find a Python dictionary in the response.")
#         mapping_str = match.group(1)

#     try:
#         return ast.literal_eval(mapping_str)
#     except Exception as e:
#         print("⚠️ Failed to evaluate extracted dictionary string.")
#         print("String:", mapping_str)
#         raise e


# # Load environment variables
# load_dotenv()
# client = OpenAI(
#     api_key=os.getenv("OPENAI_API_KEY"),
#     base_url=os.getenv("OPENAI_API_BASE")  # Optional: for Azure or self-hosted
# )


# def load_csv_files(folder_path):
#     csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
#     dataframes = []
#     column_sets = []
#     valid_paths = []

#     print("📥 Reading CSV files...")

#     for file in csv_files:
#         try:
#             df = pd.read_csv(file)
#             dataframes.append(df)
#             column_sets.append(list(df.columns))
#             valid_paths.append(file)
#             print(f"✅ Loaded: {os.path.basename(file)}")
#         except pd.errors.ParserError as e:
#             print(f"❌ Skipping file due to parsing error: {os.path.basename(file)}")
#             print(f"   ↳ {e}")
#         except Exception as e:
#             print(f"⚠️ Unexpected error with file {os.path.basename(file)}: {e}")

#     return dataframes, column_sets, valid_paths


# def generate_mapping_prompt(column_sets):
#     prompt = (
#         "You are a data scientist helping to merge multiple ML prediction datasets. "
#         "Each CSV may have different or similar column names. I need a unified mapping to standardize these datasets. "
#         "Also, please identify likely prediction label columns (e.g., price, quality, outcome).\n\n"
#         "Here are the column headers from each CSV:\n"
#     )
#     for i, columns in enumerate(column_sets):
#         prompt += f"CSV {i+1}: {columns}\n"
#     prompt += (
#         "\nPlease provide:\n"
#         "1. A Python dictionary mapping similar columns across these CSVs.\n"
#         "2. A list of columns most likely to represent prediction labels.\n\n"
#         "Format your response as:\n"
#         "```python\n"
#         "column_mapping = { ... }\n"
#         "label_columns = [ ... ]\n"
#         "```"
#     )
#     return prompt


# def get_column_mapping_from_openai(column_sets):
#     prompt = generate_mapping_prompt(column_sets)

#     response = client.chat.completions.create(
#         model="gpt-4",
#         messages=[
#             {"role": "system", "content": "You are a helpful data scientist."},
#             {"role": "user", "content": prompt}
#         ],
#         temperature=0.3
#     )

#     content = response.choices[0].message.content
#     print("\n📩 Received response from OpenAI.")

#     try:
#         # Try parsing both the dictionary and the label list from the response
#         column_mapping_match = re.search(r"column_mapping\s*=\s*(\{.*?\})", content, re.DOTALL)
#         label_columns_match = re.search(r"label_columns\s*=\s*(\[.*?\])", content, re.DOTALL)

#         if column_mapping_match:
#             mapping = ast.literal_eval(column_mapping_match.group(1))
#         else:
#             raise ValueError("❌ Could not find `column_mapping` in the response.")

#         if label_columns_match:
#             label_columns = ast.literal_eval(label_columns_match.group(1))
#         else:
#             label_columns = []

#     except Exception as e:
#         print("⚠️ Error parsing OpenAI response:")
#         print(content)
#         raise e

#     return mapping, label_columns


# def standardize_columns(df, mapping):
#     new_columns = {col: mapping.get(col, col) for col in df.columns}
#     return df.rename(columns=new_columns)


# def merge_csvs(folder_path, output_file="merged_dataset.csv"):
#     dfs, column_sets, csv_paths = load_csv_files(folder_path)

#     if not dfs:
#         print("❌ No valid CSVs found to merge.")
#         return

#     print("\n🧠 Requesting column mapping from OpenAI...")
#     mapping, label_columns = get_column_mapping_from_openai(column_sets)

#     print("\n📊 Column Mapping:")
#     for k, v in mapping.items():
#         print(f"  '{k}' -> '{v}'")

#     print("\n🏷️ Suggested Label Columns:")
#     for label in label_columns:
#         print(f"  - {label}")

#     standardized_dfs = [standardize_columns(df, mapping) for df in dfs]
#     merged_df = pd.concat(standardized_dfs, ignore_index=True, sort=False)

#     merged_df.to_csv(output_file, index=False)
#     print(f"\n✅ Merged dataset saved as '{output_file}'")


# if __name__ == "__main__":
#     folder_path = "house"

import os
import glob
import pandas as pd
import ast
import re
from itertools import combinations

from rapidfuzz import fuzz, process
from dotenv import load_dotenv
from openai import OpenAI

# Manual rename map to standardize some known column-name variations
manual_rename_map = {
    "review": "text",
    "text": "text",
    "NumBedrooms": "bedrooms",
    "HousePrice": "price",
    "TARGET(PRICE_IN_LACS)": "price",
    "SquareFootage": "area",
    "SQUARE_FT": "area",
    "sentiment": "label",
    "target": "label",
    "type": "label",
    "variety": "label",
    "class": "label",
    "HeartDisease": "label",
    "Heart Attack Risk (Binary)": "label",
    "Heart Attack Risk": "label",
}


def normalize(col):
    """Lower-case a column name and drop every non-alphanumeric character."""
    return re.sub(r'[^a-z0-9]', '', col.lower())


def apply_manual_renaming(df, rename_map):
    """Rename any columns of df that appear in the manual rename map."""
    renamed = {}
    for col in df.columns:
        if col in rename_map:
            renamed[col] = rename_map[col]
    return df.rename(columns=renamed)


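# Quick illustration of the manual map on a toy frame (toy column names only):
#
#   >>> toy = pd.DataFrame({"NumBedrooms": [3], "HousePrice": [250000]})
#   >>> list(apply_manual_renaming(toy, manual_rename_map).columns)
#   ['bedrooms', 'price']

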
def get_fuzzy_common_columns(cols_list, threshold=75):
    """Return the columns of the first set that fuzzy-match something in every other set."""
    base = cols_list[0]
    common = set()
    for col in base:
        match_all = True
        for other in cols_list[1:]:
            # extractOne returns (best_match, score, index); only the score matters here
            _, score, _ = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)
            if score < threshold:
                match_all = False
                break
        if match_all:
            common.add(col)
    return common


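# Sketch of what the fuzzy intersection buys over an exact one (invented,
# already-normalized column names; scores are approximate). "bedrooms" vs.
# "numbedrooms" scores around 84 with token_sort_ratio, so it clears the
# default threshold of 75 even though the strings differ:
#
#   >>> sorted(get_fuzzy_common_columns([{"houseprice", "bedrooms"},
#   ...                                   {"houseprice", "numbedrooms"}]))
#   ['bedrooms', 'houseprice']

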
def sortFiles(dfs):
    """Drop exact-duplicate frames; return the unique frames plus their original indices."""
    unique_dfs = []
    seen = []
    for i, df1 in enumerate(dfs):
        duplicate = False
        for j in seen:
            df2 = dfs[j]
            if df1.shape != df2.shape:
                continue
            if df1.reset_index(drop=True).equals(df2.reset_index(drop=True)):
                duplicate = True
                break
        if not duplicate:
            unique_dfs.append(df1)
            seen.append(i)
    # The kept indices let the caller keep parallel lists (e.g. file paths) aligned
    return unique_dfs, seen


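# The second return value exists so parallel metadata stays aligned after the
# dedup. A toy run (invented frames):
#
#   >>> a = pd.DataFrame({"x": [1, 2]})
#   >>> frames, kept = sortFiles([a, a.copy(), pd.DataFrame({"x": [9]})])
#   >>> kept          # the copy of `a` was dropped
#   [0, 2]

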
def load_csv_files(folder_path):
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    dfs = []
    column_sets = []
    paths = []

    for file in csv_files:
        try:
            df = pd.read_csv(file)
            dfs.append(df)
            column_sets.append(list(df.columns))
            paths.append(file)
            print(f"✅ Loaded: {os.path.basename(file)}")
        except Exception as e:
            print(f"❌ Failed to load {file}: {e}")
    return dfs, column_sets, paths


def generate_mapping_prompt(column_sets):
    prompt = (
        "You are a data scientist helping to merge multiple machine learning prediction datasets. "
        "Each CSV file may have different column names, even if they represent similar types of data. "
        "Your task is to identify and map these similar columns across datasets to a common, unified name. "
        "Columns with clearly similar features (e.g., 'Bedrooms' and 'BedroomsAbvGr') should be merged into one column with a relevant name like 'bedrooms'.\n\n"
        "Avoid keeping redundant or unique columns that do not have any logical counterpart in other datasets unless they are essential. "
        "The goal is not to maximize the number of columns or rows, but to create a clean, consistent dataset for training ML models.\n\n"
        "Examples:\n"
        "- Dataset1: 'Locality' -> Mumbai, Delhi\n"
        "- Dataset2: 'Places' -> Goa, Singapore\n"
        "→ Merge both into a common column like 'location'.\n\n"
        "Please also identify likely label or target columns that are typically used for prediction (e.g., price, sentiment, outcome, quality).\n\n"
    )

    for i, cols in enumerate(column_sets):
        prompt += f"CSV {i+1}: {cols}\n"
    prompt += "\nPlease return:\n```python\ncolumn_mapping = { ... }\nlabel_columns = [ ... ]\n```"
    return prompt


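# For two toy header sets (invented for illustration), the prompt body built
# above would end with lines like:
#
#   CSV 1: ['HousePrice', 'SQUARE_FT']
#   CSV 2: ['price', 'area']
#
# followed by the request to answer inside a ```python block defining
# column_mapping and label_columns, which is exactly what the parser in the
# next function expects.

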
def get_column_mapping_from_openai(column_sets):
    load_dotenv()
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        # An empty base_url would break the client; fall back to the default endpoint
        base_url=os.getenv("OPENAI_API_BASE") or None,
    )

    prompt = generate_mapping_prompt(column_sets)

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful data scientist."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
    )

    content = response.choices[0].message.content

    try:
        column_mapping_match = re.search(r"column_mapping\s*=\s*(\{.*?\})", content, re.DOTALL)
        label_columns_match = re.search(r"label_columns\s*=\s*(\[.*?\])", content, re.DOTALL)
        column_mapping = ast.literal_eval(column_mapping_match.group(1)) if column_mapping_match else {}
        label_columns = ast.literal_eval(label_columns_match.group(1)) if label_columns_match else []
    except Exception as e:
        print("⚠️ Error parsing OpenAI response:")
        print(content)
        raise e

    return column_mapping, label_columns


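# The parsing above relies only on the model echoing the requested format, so
# it can be sanity-checked without an API call (the reply text is invented):
#
#   >>> sample = "column_mapping = {'HousePrice': 'price'}\nlabel_columns = ['price']"
#   >>> m = re.search(r"column_mapping\s*=\s*(\{.*?\})", sample, re.DOTALL)
#   >>> ast.literal_eval(m.group(1))
#   {'HousePrice': 'price'}

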
def clean_and_merge(folder, query=None, use_ai=True):
    os.makedirs("./final", exist_ok=True)
    dfs, column_sets, csv_paths = load_csv_files(folder)

    if not dfs:
        print("No valid CSVs found.")
        return

    # Drop duplicate frames, keeping csv_paths aligned with the survivors
    dfs, kept_indices = sortFiles(dfs)
    csv_paths = [csv_paths[i] for i in kept_indices]
    dfs = [apply_manual_renaming(df, manual_rename_map) for df in dfs]

    if use_ai:
        try:
            column_mapping, label_columns = get_column_mapping_from_openai(column_sets)
            dfs = [df.rename(columns={col: column_mapping.get(col, col) for col in df.columns}) for df in dfs]
        except Exception as e:
            print("Falling back to fuzzy matching due to OpenAI error:", e)
            use_ai = False

    if not use_ai:
        # Normalize column names for the fuzzy-match fallback
        normalized_cols = []
        for df in dfs:
            normalized_cols.append({normalize(col) for col in df.columns})

        # Search every combination of files for the largest fuzzy-common column set
        max_common = set()
        best_combo = []
        for i in range(2, len(dfs) + 1):
            for combo in combinations(range(len(dfs)), i):
                selected = [normalized_cols[j] for j in combo]
                fuzzy_common = get_fuzzy_common_columns(selected)
                if len(fuzzy_common) >= len(max_common):
                    max_common = fuzzy_common
                    best_combo = combo

        # Harmonize and align each selected frame to the shared column names
        aligned_dfs = []
        for idx in best_combo:
            df = dfs[idx]
            col_map = {}
            for std_col in max_common:
                match, _, _ = process.extractOne(std_col, [normalize(col) for col in df.columns])
                for col in df.columns:
                    if normalize(col) == match:
                        col_map[col] = std_col
                        break
            df_subset = df[list(col_map.keys())].rename(columns=col_map)
            aligned_dfs.append(df_subset)

        # With fewer than two frames there is nothing to combine, so keep the single frame
        combined_df = pd.concat(aligned_dfs, ignore_index=True) if aligned_dfs else dfs[0]
    else:
        combined_df = pd.concat(dfs, ignore_index=True)

    # Label assignment fallback: frames without a 'label' column get one derived
    # from their file name, minus any words that appear in the search query
    for i, df in enumerate(dfs):
        if 'label' not in df.columns:
            name = os.path.basename(csv_paths[i]).split(".")[0].lower()
            name_cleaned = name
            if query:
                words = set(re.sub(r'[^a-z]', ' ', query.lower()).split())
                for word in words:
                    name_cleaned = name_cleaned.replace(word, "")
            df['label'] = name_cleaned

    # Decide whether the merge is good enough to keep, or whether the single
    # largest input file is the better artifact
    largest_df = max(dfs, key=lambda df: len(df))
    flag = False

    if len(largest_df) > len(combined_df) and len(largest_df.columns) > 2:
        # The merge lost rows and the largest file is not trivially narrow
        flag = True
    elif len(combined_df) > len(largest_df) and (len(largest_df.columns) - len(combined_df.columns)) > 3 and len(largest_df.columns) < 7:
        # The merge gained rows but dropped most of a narrow file's columns
        flag = True

    output_file = f"./final/{query or os.path.basename(folder)}.csv"
    if flag:
        largest_df.to_csv(output_file, index=False)
        print(f"⚠️ Saved fallback single file due to poor merge: {output_file}")
    else:
        combined_df.to_csv(output_file, index=False)
        print(f"✅ Saved merged file: {output_file}")


# Example usage:
if __name__ == "__main__":
    clean_and_merge("house", query="house", use_ai=True)
    # merge_csvs(folder_path)
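    # An offline run that skips the OpenAI call and goes straight to the
    # fuzzy-matching fallback:
    # clean_and_merge("house", query="house", use_ai=False)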