# File size: 4,384 Bytes
# 825e978
# import os
# import pandas as pd
# from openai import OpenAI
# from dotenv import load_dotenv
# # Load environment variables
# load_dotenv()
# # Setup OpenAI client
# client = OpenAI(
# api_key=os.getenv("OPENAI_API_KEY"),
# base_url=os.getenv("OPENAI_API_BASE", "")
# )
# # Set the folder containing CSV files
# folder_path = 'house'
# csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]
# # Loop through each CSV file and process
# for file in csv_files:
# file_path = os.path.join(folder_path, file)
# try:
# df = pd.read_csv(file_path, nrows=1) # Read just the header row
# columns = df.columns.tolist()
# columns_str = ", ".join(columns)
# # Formulate prompt
# prompt = (
# f"The following are column labels from a dataset: {columns_str}.\n"
# "Among all the labels, return a list of labels which can be clubbed and merged into 1 label. Note that the lables which can be merged must belong to different datasets, do not merge lables of the same dataset"
# )
# # Ask OpenAI for insight
# response = client.chat.completions.create(
# model="gpt-4",
# messages=[
# {"role": "system", "content": "You are a helpful data analyst."},
# {"role": "user", "content": prompt}
# ],
# temperature=0.3
# )
# print(f"\nπ File: {file}")
# print("π Columns:", columns_str)
# print("π‘ AI Insight:", response.choices[0].message.content)
# except Exception as e:
# print(f"β Error processing {file}: {e}")
import os
import json
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables from a local .env file (OPENAI_API_KEY, OPENAI_API_BASE).
load_dotenv()
# OpenAI client configured from the environment.
# NOTE(review): base_url falls back to "" when OPENAI_API_BASE is unset —
# confirm an empty base_url is the intended default for this SDK version.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_API_BASE", "")
)
# Folder that holds the CSV files to analyze and merge.
folder_path = 'downloads/house'
# Sort for a deterministic processing order (os.listdir order is arbitrary and
# varies across runs/filesystems); lower() also matches ".CSV" extensions.
csv_files = sorted(
    f for f in os.listdir(folder_path) if f.lower().endswith(".csv")
)

# Collect the header row of every CSV, keyed by filename.
all_columns = {}
for file in csv_files:
    try:
        # nrows=1 reads only enough of the file to obtain the column labels.
        df = pd.read_csv(os.path.join(folder_path, file), nrows=1)
        all_columns[file] = df.columns.tolist()
    except Exception as e:
        # Best-effort: report unreadable files but keep processing the rest.
        print(f"β Could not read {file}: {e}")
# One "filename: col1, col2, ..." line per dataset.
flattened_cols = [f"{file}: {', '.join(cols)}" for file, cols in all_columns.items()]
prompt = (
    "The following are column headers from multiple CSV datasets:\n\n"
    # Bug fix: join with newlines instead of interpolating the Python list
    # repr, which embedded brackets and quote characters into the prompt.
    + "\n".join(flattened_cols)
    + "\n\nIdentify which labels across different datasets can be considered equivalent and merged. "
    "Return only a valid JSON dictionary where keys are standard labels and values are lists of corresponding labels to rename. No explanation."
)
# Ask the model to propose a label-merge mapping for the collected headers.
chat_messages = [
    {"role": "system", "content": "You are a helpful data analyst."},
    {"role": "user", "content": prompt},
]
response = client.chat.completions.create(
    model="gpt-4",
    messages=chat_messages,
    temperature=0.3,
)
# Extract the JSON dictionary from the model reply. The model is asked for
# bare JSON, but replies are sometimes wrapped in prose or code fences, so we
# slice from the first "{" to the last "}" before parsing.
merge_map_text = response.choices[0].message.content.strip()
try:
    start = merge_map_text.find("{")
    end = merge_map_text.rfind("}")
    if start == -1 or end == -1:
        # Previously a missing brace produced start=-1/end=0, a bogus empty
        # slice, and a confusing JSONDecodeError; fail explicitly instead.
        raise ValueError("no JSON object found in model reply")
    merge_map = json.loads(merge_map_text[start:end + 1])
    print("\nπ§ Parsed Merge Map:")
    print(json.dumps(merge_map, indent=2))
except ValueError as e:
    # json.JSONDecodeError is a ValueError subclass, so malformed JSON and a
    # missing object both land here; fall back to an empty mapping so the
    # rest of the script can still run (and report that nothing merged).
    print("β Could not parse merge map from GPT:", e)
    merge_map = {}
# Re-read every CSV in full, copy variant columns onto the standard labels
# chosen by the model, and stack the per-file frames into one DataFrame.
merged_df = pd.DataFrame()
for file in csv_files:
    try:
        # on_bad_lines='skip' drops malformed rows instead of aborting the file.
        df = pd.read_csv(os.path.join(folder_path, file), on_bad_lines='skip')
        # Copy each matching variant column into its standard label.
        # NOTE(review): if several variants of one label exist in the same
        # file, the last variant wins — confirm that is acceptable.
        for standard_label, variants in merge_map.items():
            for variant in variants:
                if variant in df.columns:
                    df[standard_label] = df[variant]
        # Keep only the standard columns, preserving the merge map's order.
        # Bug fix: the original intersected two sets, which produced a
        # different (hash-randomized) column order on every run.
        keep = [label for label in merge_map if label in df.columns]
        df = df[keep]
        if not df.empty:
            merged_df = pd.concat([merged_df, df], ignore_index=True)
    except Exception as e:
        print(f"β Error processing {file}: {e}")
# Final clean-up: drop exact duplicate rows and persist the merged dataset.
if not merged_df.empty:
    merged_df.drop_duplicates(inplace=True)
    merged_df.to_csv("merged_cleaned_dataset.csv", index=False)
    # Bug fix: this message was a string literal broken across two source
    # lines (a syntax error from a garbled emoji); restored as one print call.
    print("\n✅ Merged and cleaned dataset saved as 'merged_cleaned_dataset.csv'")
else:
    print("β οΈ No data was merged. Check if the merge map matches the actual columns.")