|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
import json |
|
import pandas as pd |
|
from openai import OpenAI |
|
from dotenv import load_dotenv |
|
|
|
# Load environment variables (OPENAI_API_KEY, optional OPENAI_API_BASE) from a .env file.
load_dotenv()

# BUG FIX: the original passed base_url="" when OPENAI_API_BASE was unset,
# handing the client an invalid (empty) endpoint. Passing None instead lets
# the SDK fall back to its default API base URL.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_API_BASE") or None,
)
|
|
|
# Directory containing the raw CSV exports to be merged.
folder_path = 'downloads/house'
# Sort for a deterministic processing order (os.listdir order is arbitrary).
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

# Read only the header row of each CSV (nrows=1 keeps this cheap) and record
# each file's column names so GPT can propose a label-merging map.
all_columns = {}
for file in csv_files:
    try:
        df = pd.read_csv(os.path.join(folder_path, file), nrows=1)
        all_columns[file] = df.columns.tolist()
    except Exception as e:
        # Best-effort: skip unreadable files but report them.
        # (Original message prefix was mojibake-garbled; restored to plain ASCII.)
        print(f"ERROR: Could not read {file}: {e}")
|
|
|
# One line per file: "filename: col_a, col_b, ...".
flattened_cols = [f"{file}: {', '.join(cols)}" for file, cols in all_columns.items()]

# BUG FIX: the original interpolated the *list object* (f"{flattened_cols}"),
# sending GPT a Python list repr full of quotes and brackets. Joining with
# newlines gives the model a clean, readable listing instead.
prompt = (
    "The following are column headers from multiple CSV datasets:\n\n"
    + "\n".join(flattened_cols)
    + "\n\nIdentify which labels across different datasets can be considered equivalent and merged. "
    "Return only a valid JSON dictionary where keys are standard labels and values are lists of corresponding labels to rename. No explanation."
)
|
|
|
# Ask GPT to propose a map of equivalent column labels across the datasets.
# A low temperature keeps the JSON output stable and conservative.
chat_messages = [
    {"role": "system", "content": "You are a helpful data analyst."},
    {"role": "user", "content": prompt},
]
response = client.chat.completions.create(
    model="gpt-4",
    temperature=0.3,
    messages=chat_messages,
)
|
|
|
|
|
# Extract the JSON dictionary from the model's reply. The model is asked for
# JSON only, but we defensively slice from the first "{" to the last "}" in
# case it wrapped the JSON in extra text (e.g. markdown fences).
merge_map_text = response.choices[0].message.content.strip()
try:
    start = merge_map_text.find("{")
    end = merge_map_text.rfind("}") + 1
    # BUG FIX: find() returns -1 when no brace exists; the original would then
    # slice garbage and raise a confusing JSONDecodeError. Fail fast instead.
    if start == -1 or end == 0:
        raise ValueError("no JSON object found in model response")
    merge_map = json.loads(merge_map_text[start:end])
    # (Original status prints were mojibake-garbled; restored to plain ASCII.)
    print("\nParsed Merge Map:")
    print(json.dumps(merge_map, indent=2))
except Exception as e:
    print("ERROR: Could not parse merge map from GPT:", e)
    # Fall back to an empty map; the downstream merge then becomes a no-op.
    merge_map = {}
|
|
|
|
|
# Merge all CSVs into one frame, keeping only the standardized columns.
# BUG FIX / PERF: the original called pd.concat inside the loop (quadratic
# copying) and selected columns via list(set(...) & set(...)), which
# randomizes column order between runs. Collect the frames first, concat
# once, and preserve merge_map's key order for deterministic output.
frames = []
for file in csv_files:
    try:
        df = pd.read_csv(os.path.join(folder_path, file), on_bad_lines='skip')

        # Copy each recognized variant column into its standard label.
        # NOTE: if several variants exist in one file, the last variant
        # listed wins — original behavior preserved.
        for standard_label, variants in merge_map.items():
            for variant in variants:
                if variant in df.columns:
                    df[standard_label] = df[variant]

        # Keep only the standardized columns, in merge_map key order.
        keep = [col for col in merge_map if col in df.columns]
        df = df[keep]

        if not df.empty:
            frames.append(df)

    except Exception as e:
        # (Original message prefix was mojibake-garbled; restored to ASCII.)
        print(f"ERROR: Error processing {file}: {e}")

merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
|
|
|
|
|
# Persist the merged result (duplicate rows dropped), or warn when nothing
# was merged so the user knows to inspect the merge map.
if not merged_df.empty:
    merged_df.drop_duplicates(inplace=True)
    merged_df.to_csv("merged_cleaned_dataset.csv", index=False)
    # BUG FIX: the original print's string literal was split across two
    # physical lines (a syntax error, apparently from mojibake mangling).
    print("\nMerged and cleaned dataset saved as 'merged_cleaned_dataset.csv'")
else:
    print("WARNING: No data was merged. Check if the merge map matches the actual columns.")
|
|