# csv_merge_openai.py
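"""
Merge multiple CSV files that describe the same kind of data but use
different column names.

The script reads only the header row of every CSV in a folder, asks an
OpenAI chat model which labels across the files are equivalent, parses the
returned JSON mapping, renames the matching columns to standard labels,
and concatenates everything into a single cleaned CSV.

The commented-out block below is an earlier per-file version kept for
reference; the active script starts at the imports that follow it.
"""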
# import os
# import pandas as pd
# from openai import OpenAI
# from dotenv import load_dotenv
#
# # Load environment variables
# load_dotenv()
#
# # Setup OpenAI client
# client = OpenAI(
#     api_key=os.getenv("OPENAI_API_KEY"),
#     base_url=os.getenv("OPENAI_API_BASE", "")
# )
#
# # Set the folder containing CSV files
# folder_path = 'house'
# csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]
#
# # Loop through each CSV file and process it individually
# for file in csv_files:
#     file_path = os.path.join(folder_path, file)
#     try:
#         df = pd.read_csv(file_path, nrows=1)  # Read just the header row
#         columns = df.columns.tolist()
#         columns_str = ", ".join(columns)
#
#         # Formulate prompt
#         prompt = (
#             f"The following are column labels from a dataset: {columns_str}.\n"
#             "Among all the labels, return a list of labels that can be grouped and merged into one label. "
#             "Note that labels can only be merged if they belong to different datasets; "
#             "do not merge labels from the same dataset."
#         )
#
#         # Ask OpenAI for insight
#         response = client.chat.completions.create(
#             model="gpt-4",
#             messages=[
#                 {"role": "system", "content": "You are a helpful data analyst."},
#                 {"role": "user", "content": prompt}
#             ],
#             temperature=0.3
#         )
#
#         print(f"\n🔍 File: {file}")
#         print("📋 Columns:", columns_str)
#         print("💡 AI Insight:", response.choices[0].message.content)
#     except Exception as e:
#         print(f"❌ Error processing {file}: {e}")
import os
import json
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables from .env (expects OPENAI_API_KEY, optionally OPENAI_API_BASE)
load_dotenv()

# Set up the OpenAI client; fall back to the default API base URL when none is configured
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_API_BASE") or None
)
# Folder containing the CSV files to merge
folder_path = 'downloads/house'
csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

# Collect the column headers of every CSV (read only the header row)
all_columns = {}
for file in csv_files:
    try:
        df = pd.read_csv(os.path.join(folder_path, file), nrows=1)
        all_columns[file] = df.columns.tolist()
    except Exception as e:
        print(f"❌ Could not read {file}: {e}")
# One line per file: "filename: col1, col2, ..."
flattened_cols = [f"{file}: {', '.join(cols)}" for file, cols in all_columns.items()]

prompt = (
    "The following are column headers from multiple CSV datasets:\n\n"
    + "\n".join(flattened_cols)
    + "\n\nIdentify which labels across different datasets can be considered equivalent and merged. "
    "Return only a valid JSON dictionary where keys are standard labels and values are lists of "
    "corresponding labels to rename. No explanation."
)
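
# Illustrative shape of the merge map the model is asked to return; the actual
# keys and values depend entirely on the column names found in your CSVs:
# {
#     "price": ["Price", "SalePrice", "sale_price"],
#     "area": ["Area", "LotArea", "sqft_living"]
# }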
# Ask the model for the merge map
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are a helpful data analyst."},
        {"role": "user", "content": prompt}
    ],
    temperature=0.3
)
# Parse the JSON dictionary from the model's reply (it may be wrapped in prose or code fences)
merge_map_text = response.choices[0].message.content.strip()
try:
    start = merge_map_text.find("{")
    end = merge_map_text.rfind("}") + 1
    json_text = merge_map_text[start:end]
    merge_map = json.loads(json_text)
    print("\n🧠 Parsed Merge Map:")
    print(json.dumps(merge_map, indent=2))
except Exception as e:
    print("❌ Could not parse merge map from GPT:", e)
    merge_map = {}
# Merge the DataFrames, keeping only the standardized columns
merged_df = pd.DataFrame()
for file in csv_files:
    try:
        df = pd.read_csv(os.path.join(folder_path, file), on_bad_lines='skip')
        # Rename columns to standard labels
        for standard_label, variants in merge_map.items():
            for variant in variants:
                if variant in df.columns:
                    df[standard_label] = df[variant]
        # Retain only the standard columns we care about, in a stable order
        df = df[[col for col in merge_map if col in df.columns]]
        if not df.empty:
            merged_df = pd.concat([merged_df, df], ignore_index=True)
    except Exception as e:
        print(f"❌ Error processing {file}: {e}")
# Final clean-up
if not merged_df.empty:
    merged_df.drop_duplicates(inplace=True)
    merged_df.to_csv("merged_cleaned_dataset.csv", index=False)
    print("\n✅ Merged and cleaned dataset saved as 'merged_cleaned_dataset.csv'")
else:
    print("⚠️ No data was merged. Check if the merge map matches the actual columns.")
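
# Usage sketch (assumes a .env file providing OPENAI_API_KEY and CSVs under downloads/house):
#     python csv_merge_openai.py
# The merged output is written to merged_cleaned_dataset.csv in the working directory.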