|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
import json |
|
import pandas as pd |
|
from openai import OpenAI |
|
from dotenv import load_dotenv |
|
|
|
# Load environment variables (OPENAI_API_KEY, optional OPENAI_API_BASE) from a .env file.
load_dotenv()

# BUG FIX: the original passed base_url="" when OPENAI_API_BASE was unset,
# handing the client an invalid (empty) endpoint. Passing None instead lets
# the SDK fall back to its default API base URL.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_API_BASE") or None,
)
|
|
|
# Directory containing the raw CSV exports to be merged.
folder_path = 'downloads/house'
# Sort for a deterministic processing order (os.listdir order is arbitrary).
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

# Read only the header row of each CSV (nrows=1 keeps this cheap) and record
# each file's column names so GPT can propose a label-merging map.
all_columns = {}
for file in csv_files:
    try:
        df = pd.read_csv(os.path.join(folder_path, file), nrows=1)
        all_columns[file] = df.columns.tolist()
    except Exception as e:
        # Best-effort: skip unreadable files but report them.
        # (Original message prefix was mojibake-garbled; restored to plain ASCII.)
        print(f"ERROR: Could not read {file}: {e}")
|
|
|
# One line per file: "filename: col_a, col_b, ...".
flattened_cols = [f"{file}: {', '.join(cols)}" for file, cols in all_columns.items()]

# BUG FIX: the original interpolated the *list object* (f"{flattened_cols}"),
# sending GPT a Python list repr full of quotes and brackets. Joining with
# newlines gives the model a clean, readable listing instead.
prompt = (
    "The following are column headers from multiple CSV datasets:\n\n"
    + "\n".join(flattened_cols)
    + "\n\nIdentify which labels across different datasets can be considered equivalent and merged. "
    "Return only a valid JSON dictionary where keys are standard labels and values are lists of corresponding labels to rename. No explanation."
)
|
|
|
# Ask GPT to propose a map of equivalent column labels across the datasets.
# A low temperature keeps the JSON output stable and conservative.
chat_messages = [
    {"role": "system", "content": "You are a helpful data analyst."},
    {"role": "user", "content": prompt},
]
response = client.chat.completions.create(
    model="gpt-4",
    temperature=0.3,
    messages=chat_messages,
)
|
|
|
|
|
# Extract the JSON dictionary from the model's reply. The model is asked for
# JSON only, but we defensively slice from the first "{" to the last "}" in
# case it wrapped the JSON in extra text (e.g. markdown fences).
merge_map_text = response.choices[0].message.content.strip()
try:
    start = merge_map_text.find("{")
    end = merge_map_text.rfind("}") + 1
    # BUG FIX: find() returns -1 when no brace exists; the original would then
    # slice garbage and raise a confusing JSONDecodeError. Fail fast instead.
    if start == -1 or end == 0:
        raise ValueError("no JSON object found in model response")
    merge_map = json.loads(merge_map_text[start:end])
    # (Original status prints were mojibake-garbled; restored to plain ASCII.)
    print("\nParsed Merge Map:")
    print(json.dumps(merge_map, indent=2))
except Exception as e:
    print("ERROR: Could not parse merge map from GPT:", e)
    # Fall back to an empty map; the downstream merge then becomes a no-op.
    merge_map = {}
|
|
|
|
|
# Merge all CSVs into one frame, keeping only the standardized columns.
# BUG FIX / PERF: the original called pd.concat inside the loop (quadratic
# copying) and selected columns via list(set(...) & set(...)), which
# randomizes column order between runs. Collect the frames first, concat
# once, and preserve merge_map's key order for deterministic output.
frames = []
for file in csv_files:
    try:
        df = pd.read_csv(os.path.join(folder_path, file), on_bad_lines='skip')

        # Copy each recognized variant column into its standard label.
        # NOTE: if several variants exist in one file, the last variant
        # listed wins — original behavior preserved.
        for standard_label, variants in merge_map.items():
            for variant in variants:
                if variant in df.columns:
                    df[standard_label] = df[variant]

        # Keep only the standardized columns, in merge_map key order.
        keep = [col for col in merge_map if col in df.columns]
        df = df[keep]

        if not df.empty:
            frames.append(df)

    except Exception as e:
        # (Original message prefix was mojibake-garbled; restored to ASCII.)
        print(f"ERROR: Error processing {file}: {e}")

merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
|
|
|
|
|
# Persist the merged result (duplicate rows dropped), or warn when nothing
# was merged so the user knows to inspect the merge map.
if not merged_df.empty:
    merged_df.drop_duplicates(inplace=True)
    merged_df.to_csv("merged_cleaned_dataset.csv", index=False)
    # BUG FIX: the original print's string literal was split across two
    # physical lines (a syntax error, apparently from mojibake mangling).
    print("\nMerged and cleaned dataset saved as 'merged_cleaned_dataset.csv'")
else:
    print("WARNING: No data was merged. Check if the merge map matches the actual columns.")
|
|