File size: 4,384 Bytes
825e978
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# import os
# import pandas as pd
# from openai import OpenAI
# from dotenv import load_dotenv

# # Load environment variables
# load_dotenv()

# # Setup OpenAI client
# client = OpenAI(
#     api_key=os.getenv("OPENAI_API_KEY"),
#     base_url=os.getenv("OPENAI_API_BASE", "")
# )

# # Set the folder containing CSV files
# folder_path = 'house'
# csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

# # Loop through each CSV file and process
# for file in csv_files:
#     file_path = os.path.join(folder_path, file)
#     try:
#         df = pd.read_csv(file_path, nrows=1)  # Read just the header row
#         columns = df.columns.tolist()
#         columns_str = ", ".join(columns)

#         # Formulate prompt
#         prompt = (
#             f"The following are column labels from a dataset: {columns_str}.\n"
#             "Among all the labels, return a list of labels which can be clubbed and merged into 1 label. Note that the labels which can be merged must belong to different datasets; do not merge labels of the same dataset"
#         )

#         # Ask OpenAI for insight
#         response = client.chat.completions.create(
#             model="gpt-4",
#             messages=[
#                 {"role": "system", "content": "You are a helpful data analyst."},
#                 {"role": "user", "content": prompt}
#             ],
#             temperature=0.3
#         )

#         print(f"\nπŸ” File: {file}")
#         print("πŸ“‹ Columns:", columns_str)
#         print("πŸ’‘ AI Insight:", response.choices[0].message.content)

#     except Exception as e:
#         print(f"❌ Error processing {file}: {e}")

# Environment and client setup.  load_dotenv() must run before the
# os.getenv() calls below so values from a local .env file are visible.
import os
import json
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_API_BASE", ""),
)

# Every *.csv file directly inside the downloads/house folder.
folder_path = 'downloads/house'
csv_files = [name for name in os.listdir(folder_path) if name.endswith(".csv")]

# Gather the header row of every CSV, keyed by filename.
all_columns = {}
for file in csv_files:
    csv_path = os.path.join(folder_path, file)
    try:
        header_df = pd.read_csv(csv_path, nrows=1)
    except Exception as e:
        print(f"❌ Could not read {file}: {e}")
    else:
        all_columns[file] = header_df.columns.tolist()

# One "filename: col1, col2, ..." line per dataset.
flattened_cols = [f"{file}: {', '.join(cols)}" for file, cols in all_columns.items()]

# BUG FIX: interpolating the list directly (f"{flattened_cols}") embedded its
# Python repr -- brackets, quotes, escaped newlines -- in the prompt.  Join
# the per-dataset lines with real newlines instead.
prompt = (
    "The following are column headers from multiple CSV datasets:\n\n"
    + "\n".join(flattened_cols)
    + "\n\nIdentify which labels across different datasets can be considered equivalent and merged. "
      "Return only a valid JSON dictionary where keys are standard labels and values are lists of corresponding labels to rename. No explanation."
)

# Ask GPT-4 to propose a cross-dataset column-merge mapping.
chat_messages = [
    {"role": "system", "content": "You are a helpful data analyst."},
    {"role": "user", "content": prompt},
]
response = client.chat.completions.create(
    model="gpt-4",
    messages=chat_messages,
    temperature=0.3,
)

# Extract the JSON dictionary from the model's reply.  The model sometimes
# wraps the JSON in prose or code fences, so slice from the first "{" to the
# last "}" before parsing.  On any parse failure, fall back to an empty map
# so the merge step below degrades gracefully instead of crashing.
merge_map_text = response.choices[0].message.content.strip()
try:
    start = merge_map_text.find("{")
    end = merge_map_text.rfind("}") + 1
    if start == -1 or end == 0:
        # Previously a missing brace produced a garbage slice and only
        # failed later inside json.loads; fail explicitly instead.
        raise ValueError("no JSON object found in model reply")
    merge_map = json.loads(merge_map_text[start:end])
    if not isinstance(merge_map, dict):
        # A JSON list/str would crash later at merge_map.items().
        raise ValueError("model reply parsed but is not a JSON dictionary")
    print("\n🧠 Parsed Merge Map:")
    print(json.dumps(merge_map, indent=2))
except ValueError as e:
    # json.JSONDecodeError subclasses ValueError, so this single narrow
    # handler covers both the explicit checks above and malformed JSON,
    # without hiding unrelated programming errors like the old
    # `except Exception` did.
    print("❌ Could not parse merge map from GPT:", e)
    merge_map = {}

# Merge DataFrames: normalize each file's columns to the standard labels,
# then combine everything into a single frame.
frames = []

for file in csv_files:
    try:
        df = pd.read_csv(os.path.join(folder_path, file), on_bad_lines='skip')

        # Copy each recognized variant column into its standard label.
        # When several variants of one label exist in the same file, the
        # last variant listed wins (same iteration order as before).
        for standard_label, variants in merge_map.items():
            for variant in variants:
                if variant in df.columns:
                    df[standard_label] = df[variant]

        # Retain only the standard columns, in merge_map's key order.
        # (The previous set intersection made the column order -- and thus
        # the output CSV -- nondeterministic across runs.)
        keep = [label for label in merge_map if label in df.columns]
        df = df[keep]

        if not df.empty:
            frames.append(df)

    except Exception as e:
        print(f"❌ Error processing {file}: {e}")

# Concatenate once at the end: repeated pd.concat inside the loop re-copied
# the accumulated frame on every iteration (quadratic in total rows).
merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Write the deduplicated result, or warn when nothing matched the merge map.
if merged_df.empty:
    print("⚠️ No data was merged. Check if the merge map matches the actual columns.")
else:
    merged_df = merged_df.drop_duplicates()
    merged_df.to_csv("merged_cleaned_dataset.csv", index=False)
    print("\nβœ… Merged and cleaned dataset saved as 'merged_cleaned_dataset.csv'")