1. Separate attributes and data
2. Remove the data types from the attributes

C:\Users\Niall Dcunha\DatasetCreator\house price prediction\21754539_dataset
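
One way to read these two notes, as a minimal pandas sketch. The file name, the "price" label column, and the "name:type" header convention are illustrative assumptions, not details taken from the dataset above:

import pandas as pd

df = pd.read_csv("21754539_dataset.csv")  # hypothetical file name
# 2. strip a trailing ":type" annotation from each attribute name, if present
df.columns = [c.split(":")[0].strip() for c in df.columns]
# 1. separate the attributes (features) from the value to predict
y = df["price"]                 # assumed label column
X = df.drop(columns=["price"])  # attribute matrix
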
# import os
# import glob
# import pandas as pd
# import openai
# from openai import OpenAI
# from dotenv import load_dotenv
# import ast
# import re


# def extract_dict_from_response(response: str) -> dict:
#     # Try extracting code block content containing the dictionary
#     match = re.search(r"```(?:python)?\s*(\{.*?\})\s*```", response, re.DOTALL)
#     if match:
#         mapping_str = match.group(1)
#     else:
#         # Try extracting the dictionary directly if it's not in a code block
#         match = re.search(r"(\{.*\})", response, re.DOTALL)
#         if not match:
#             raise ValueError("❌ Could not find a Python dictionary in the response.")
#         mapping_str = match.group(1)

#     try:
#         return ast.literal_eval(mapping_str)
#     except Exception as e:
#         print("⚠️ Failed to evaluate extracted dictionary string.")
#         print("String:", mapping_str)
#         raise e


# # Load environment variables
# load_dotenv()
# client = OpenAI(
#     api_key=os.getenv("OPENAI_API_KEY"),
#     base_url=os.getenv("OPENAI_API_BASE")  # Optional: for Azure or self-hosted
# )


# def load_csv_files(folder_path):
#     csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
#     dataframes = []
#     column_sets = []
#     valid_paths = []

#     print("📥 Reading CSV files...")

#     for file in csv_files:
#         try:
#             df = pd.read_csv(file)
#             dataframes.append(df)
#             column_sets.append(list(df.columns))
#             valid_paths.append(file)
#             print(f"✅ Loaded: {os.path.basename(file)}")
#         except pd.errors.ParserError as e:
#             print(f"❌ Skipping file due to parsing error: {os.path.basename(file)}")
#             print(f"   ↳ {e}")
#         except Exception as e:
#             print(f"⚠️ Unexpected error with file {os.path.basename(file)}: {e}")

#     return dataframes, column_sets, valid_paths


# def generate_mapping_prompt(column_sets):
#     prompt = (
#         "You are a data scientist helping to merge multiple ML prediction datasets. "
#         "Each CSV may have different or similar column names. I need a unified mapping to standardize these datasets. "
#         "Also, please identify likely prediction label columns (e.g., price, quality, outcome).\n\n"
#         "Here are the column headers from each CSV:\n"
#     )
#     for i, columns in enumerate(column_sets):
#         prompt += f"CSV {i+1}: {columns}\n"
#     prompt += (
#         "\nPlease provide:\n"
#         "1. A Python dictionary mapping similar columns across these CSVs.\n"
#         "2. A list of columns most likely to represent prediction labels.\n\n"
#         "Format your response as:\n"
#         "```python\n"
#         "column_mapping = { ... }\n"
#         "label_columns = [ ... ]\n"
#         "```"
#     )
#     return prompt


# def get_column_mapping_from_openai(column_sets):
#     prompt = generate_mapping_prompt(column_sets)

#     response = client.chat.completions.create(
#         model="gpt-4",
#         messages=[
#             {"role": "system", "content": "You are a helpful data scientist."},
#             {"role": "user", "content": prompt}
#         ],
#         temperature=0.3
#     )

#     content = response.choices[0].message.content
#     print("\n📩 Received response from OpenAI.")

#     try:
#         # Try parsing both the dictionary and the label list from the response
#         column_mapping_match = re.search(r"column_mapping\s*=\s*(\{.*?\})", content, re.DOTALL)
#         label_columns_match = re.search(r"label_columns\s*=\s*(\[.*?\])", content, re.DOTALL)

#         if column_mapping_match:
#             mapping = ast.literal_eval(column_mapping_match.group(1))
#         else:
#             raise ValueError("❌ Could not find `column_mapping` in the response.")

#         if label_columns_match:
#             label_columns = ast.literal_eval(label_columns_match.group(1))
#         else:
#             label_columns = []

#     except Exception as e:
#         print("⚠️ Error parsing OpenAI response:")
#         print(content)
#         raise e

#     return mapping, label_columns


# def standardize_columns(df, mapping):
#     new_columns = {col: mapping.get(col, col) for col in df.columns}
#     return df.rename(columns=new_columns)


# def merge_csvs(folder_path, output_file="merged_dataset.csv"):
#     dfs, column_sets, csv_paths = load_csv_files(folder_path)

#     if not dfs:
#         print("❌ No valid CSVs found to merge.")
#         return

#     print("\n🧠 Requesting column mapping from OpenAI...")
#     mapping, label_columns = get_column_mapping_from_openai(column_sets)

#     print("\n📊 Column Mapping:")
#     for k, v in mapping.items():
#         print(f"  '{k}' -> '{v}'")

#     print("\n🏷️ Suggested Label Columns:")
#     for label in label_columns:
#         print(f"  - {label}")

#     standardized_dfs = [standardize_columns(df, mapping) for df in dfs]
#     merged_df = pd.concat(standardized_dfs, ignore_index=True, sort=False)

#     merged_df.to_csv(output_file, index=False)
#     print(f"\n✅ Merged dataset saved as '{output_file}'")


# if __name__ == "__main__":
#     folder_path = "house"

import os
import glob
import pandas as pd
import ast
import re
from itertools import combinations

from rapidfuzz import fuzz, process
from dotenv import load_dotenv
from openai import OpenAI

# Manual rename map to standardize some known column-name variations
manual_rename_map = {
    "review": "text",
    "text": "text",
    "NumBedrooms": "bedrooms",
    "HousePrice": "price",
    "TARGET(PRICE_IN_LACS)": "price",
    "SquareFootage": "area",
    "SQUARE_FT": "area",
    "sentiment": "label",
    "target": "label",
    "type": "label",
    "variety": "label",
    "class": "label",
    "HeartDisease": "label",
    "Heart Attack Risk (Binary)": "label",
    "Heart Attack Risk": "label",
}


def normalize(col):
    """Lower-case a column name and drop every non-alphanumeric character."""
    return re.sub(r'[^a-z0-9]', '', col.lower())


def apply_manual_renaming(df, rename_map):
    """Rename any columns of df that appear in the manual rename map."""
    renamed = {}
    for col in df.columns:
        if col in rename_map:
            renamed[col] = rename_map[col]
    return df.rename(columns=renamed)


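# Quick illustration of the manual map on a toy frame (toy column names only):
#
#   >>> toy = pd.DataFrame({"NumBedrooms": [3], "HousePrice": [250000]})
#   >>> list(apply_manual_renaming(toy, manual_rename_map).columns)
#   ['bedrooms', 'price']

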
def get_fuzzy_common_columns(cols_list, threshold=75):
    """Return the columns of the first set that fuzzy-match something in every other set."""
    base = cols_list[0]
    common = set()
    for col in base:
        match_all = True
        for other in cols_list[1:]:
            # extractOne returns (best_match, score, index); only the score matters here
            _, score, _ = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)
            if score < threshold:
                match_all = False
                break
        if match_all:
            common.add(col)
    return common


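# Sketch of what the fuzzy intersection buys over an exact one (invented,
# already-normalized column names; scores are approximate). "bedrooms" vs.
# "numbedrooms" scores around 84 with token_sort_ratio, so it clears the
# default threshold of 75 even though the strings differ:
#
#   >>> sorted(get_fuzzy_common_columns([{"houseprice", "bedrooms"},
#   ...                                   {"houseprice", "numbedrooms"}]))
#   ['bedrooms', 'houseprice']

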
def sortFiles(dfs):
    """Drop exact-duplicate frames; return the unique frames plus their original indices."""
    unique_dfs = []
    seen = []
    for i, df1 in enumerate(dfs):
        duplicate = False
        for j in seen:
            df2 = dfs[j]
            if df1.shape != df2.shape:
                continue
            if df1.reset_index(drop=True).equals(df2.reset_index(drop=True)):
                duplicate = True
                break
        if not duplicate:
            unique_dfs.append(df1)
            seen.append(i)
    # The kept indices let the caller keep parallel lists (e.g. file paths) aligned
    return unique_dfs, seen


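# The second return value exists so parallel metadata stays aligned after the
# dedup. A toy run (invented frames):
#
#   >>> a = pd.DataFrame({"x": [1, 2]})
#   >>> frames, kept = sortFiles([a, a.copy(), pd.DataFrame({"x": [9]})])
#   >>> kept          # the copy of `a` was dropped
#   [0, 2]

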
def load_csv_files(folder_path):
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    dfs = []
    column_sets = []
    paths = []

    for file in csv_files:
        try:
            df = pd.read_csv(file)
            dfs.append(df)
            column_sets.append(list(df.columns))
            paths.append(file)
            print(f"✅ Loaded: {os.path.basename(file)}")
        except Exception as e:
            print(f"❌ Failed to load {file}: {e}")
    return dfs, column_sets, paths


def generate_mapping_prompt(column_sets):
    prompt = (
        "You are a data scientist helping to merge multiple machine learning prediction datasets. "
        "Each CSV file may have different column names, even if they represent similar types of data. "
        "Your task is to identify and map these similar columns across datasets to a common, unified name. "
        "Columns with clearly similar features (e.g., 'Bedrooms' and 'BedroomsAbvGr') should be merged into one column with a relevant name like 'bedrooms'.\n\n"
        "Avoid keeping redundant or unique columns that do not have any logical counterpart in other datasets unless they are essential. "
        "The goal is not to maximize the number of columns or rows, but to create a clean, consistent dataset for training ML models.\n\n"
        "Examples:\n"
        "- Dataset1: 'Locality' -> Mumbai, Delhi\n"
        "- Dataset2: 'Places' -> Goa, Singapore\n"
        "→ Merge both into a common column like 'location'.\n\n"
        "Please also identify likely label or target columns that are typically used for prediction (e.g., price, sentiment, outcome, quality).\n\n"
    )

    for i, cols in enumerate(column_sets):
        prompt += f"CSV {i+1}: {cols}\n"
    prompt += "\nPlease return:\n```python\ncolumn_mapping = { ... }\nlabel_columns = [ ... ]\n```"
    return prompt


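# For two toy header sets (invented for illustration), the prompt body built
# above would end with lines like:
#
#   CSV 1: ['HousePrice', 'SQUARE_FT']
#   CSV 2: ['price', 'area']
#
# followed by the request to answer inside a ```python block defining
# column_mapping and label_columns, which is exactly what the parser in the
# next function expects.

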
def get_column_mapping_from_openai(column_sets):
    load_dotenv()
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        # An empty base_url would break the client; fall back to the default endpoint
        base_url=os.getenv("OPENAI_API_BASE") or None,
    )

    prompt = generate_mapping_prompt(column_sets)

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful data scientist."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
    )

    content = response.choices[0].message.content

    try:
        column_mapping_match = re.search(r"column_mapping\s*=\s*(\{.*?\})", content, re.DOTALL)
        label_columns_match = re.search(r"label_columns\s*=\s*(\[.*?\])", content, re.DOTALL)
        column_mapping = ast.literal_eval(column_mapping_match.group(1)) if column_mapping_match else {}
        label_columns = ast.literal_eval(label_columns_match.group(1)) if label_columns_match else []
    except Exception as e:
        print("⚠️ Error parsing OpenAI response:")
        print(content)
        raise e

    return column_mapping, label_columns


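# The parsing above relies only on the model echoing the requested format, so
# it can be sanity-checked without an API call (the reply text is invented):
#
#   >>> sample = "column_mapping = {'HousePrice': 'price'}\nlabel_columns = ['price']"
#   >>> m = re.search(r"column_mapping\s*=\s*(\{.*?\})", sample, re.DOTALL)
#   >>> ast.literal_eval(m.group(1))
#   {'HousePrice': 'price'}

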
def clean_and_merge(folder, query=None, use_ai=True):
    os.makedirs("./final", exist_ok=True)
    dfs, column_sets, csv_paths = load_csv_files(folder)

    if not dfs:
        print("No valid CSVs found.")
        return

    # Drop duplicate frames, keeping csv_paths aligned with the survivors
    dfs, kept_indices = sortFiles(dfs)
    csv_paths = [csv_paths[i] for i in kept_indices]
    dfs = [apply_manual_renaming(df, manual_rename_map) for df in dfs]

    if use_ai:
        try:
            column_mapping, label_columns = get_column_mapping_from_openai(column_sets)
            dfs = [df.rename(columns={col: column_mapping.get(col, col) for col in df.columns}) for df in dfs]
        except Exception as e:
            print("Falling back to fuzzy matching due to OpenAI error:", e)
            use_ai = False

    if not use_ai:
        # Normalize column names for the fuzzy-match fallback
        normalized_cols = []
        for df in dfs:
            normalized_cols.append({normalize(col) for col in df.columns})

        # Search every combination of files for the largest fuzzy-common column set
        max_common = set()
        best_combo = []
        for i in range(2, len(dfs) + 1):
            for combo in combinations(range(len(dfs)), i):
                selected = [normalized_cols[j] for j in combo]
                fuzzy_common = get_fuzzy_common_columns(selected)
                if len(fuzzy_common) >= len(max_common):
                    max_common = fuzzy_common
                    best_combo = combo

        # Harmonize and align each selected frame to the shared column names
        aligned_dfs = []
        for idx in best_combo:
            df = dfs[idx]
            col_map = {}
            for std_col in max_common:
                match, _, _ = process.extractOne(std_col, [normalize(col) for col in df.columns])
                for col in df.columns:
                    if normalize(col) == match:
                        col_map[col] = std_col
                        break
            df_subset = df[list(col_map.keys())].rename(columns=col_map)
            aligned_dfs.append(df_subset)

        # With fewer than two frames there is nothing to combine, so keep the single frame
        combined_df = pd.concat(aligned_dfs, ignore_index=True) if aligned_dfs else dfs[0]
    else:
        combined_df = pd.concat(dfs, ignore_index=True)

    # Label assignment fallback: frames without a 'label' column get one derived
    # from their file name, minus any words that appear in the search query
    for i, df in enumerate(dfs):
        if 'label' not in df.columns:
            name = os.path.basename(csv_paths[i]).split(".")[0].lower()
            name_cleaned = name
            if query:
                words = set(re.sub(r'[^a-z]', ' ', query.lower()).split())
                for word in words:
                    name_cleaned = name_cleaned.replace(word, "")
            df['label'] = name_cleaned

    # Decide whether the merge is good enough to keep, or whether the single
    # largest input file is the better artifact
    largest_df = max(dfs, key=lambda df: len(df))
    flag = False

    if len(largest_df) > len(combined_df) and len(largest_df.columns) > 2:
        # The merge lost rows and the largest file is not trivially narrow
        flag = True
    elif len(combined_df) > len(largest_df) and (len(largest_df.columns) - len(combined_df.columns)) > 3 and len(largest_df.columns) < 7:
        # The merge gained rows but dropped most of a narrow file's columns
        flag = True

    output_file = f"./final/{query or os.path.basename(folder)}.csv"
    if flag:
        largest_df.to_csv(output_file, index=False)
        print(f"⚠️ Saved fallback single file due to poor merge: {output_file}")
    else:
        combined_df.to_csv(output_file, index=False)
        print(f"✅ Saved merged file: {output_file}")


# Example usage:
if __name__ == "__main__":
    clean_and_merge("house", query="house", use_ai=True)
    # merge_csvs(folder_path)
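    # An offline run that skips the OpenAI call and goes straight to the
    # fuzzy-matching fallback:
    # clean_and_merge("house", query="house", use_ai=False)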