"""Merge CSV datasets in a folder by asking GPT-4 which column labels are equivalent.

Workflow:
1. Read the header row of every CSV in ``folder_path``.
2. Ask GPT-4 for a JSON merge map: {standard_label: [variant_labels, ...]}.
3. Copy each variant column into its standard label, keep only standard
   columns, concatenate all rows, drop duplicates, and save the result.

Requires OPENAI_API_KEY (and optionally OPENAI_API_BASE) in the environment
or a .env file.
"""

import os
import json

import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    # Fall back to the SDK's default endpoint when OPENAI_API_BASE is unset or
    # empty — passing base_url="" would break every request.
    base_url=os.getenv("OPENAI_API_BASE") or None,
)

folder_path = 'downloads/house'
csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

# --- 1. Collect the column headers of every CSV (header row only) ------------
all_columns = {}
for file in csv_files:
    try:
        # nrows=1 reads just enough to get the header without loading the data.
        df = pd.read_csv(os.path.join(folder_path, file), nrows=1)
        all_columns[file] = df.columns.tolist()
    except Exception as e:
        print(f"āŒ Could not read {file}: {e}")

# --- 2. Ask GPT-4 for a merge map --------------------------------------------
# One "file: col1, col2, ..." line per dataset, so the model sees clean text
# rather than the Python list repr with brackets and quotes.
flattened_cols = "\n".join(
    f"{file}: {', '.join(cols)}" for file, cols in all_columns.items()
)
prompt = (
    "The following are column headers from multiple CSV datasets:\n\n"
    + flattened_cols
    + "\n\nIdentify which labels across different datasets can be considered equivalent and merged. "
    "Return only a valid JSON dictionary where keys are standard labels and values are lists of corresponding labels to rename. No explanation."
)
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are a helpful data analyst."},
        {"role": "user", "content": prompt},
    ],
    temperature=0.3,
)

# --- 3. Parse the JSON dictionary out of the model reply ----------------------
merge_map_text = response.choices[0].message.content.strip()
try:
    # The model may wrap the JSON in prose or code fences; slice out the
    # outermost {...} span before parsing.
    start = merge_map_text.find("{")
    end = merge_map_text.rfind("}") + 1
    if start == -1 or end == 0:
        raise ValueError("no JSON object found in the model reply")
    merge_map = json.loads(merge_map_text[start:end])
    print("\n🧠 Parsed Merge Map:")
    print(json.dumps(merge_map, indent=2))
except Exception as e:
    print("āŒ Could not parse merge map from GPT:", e)
    merge_map = {}  # empty map → nothing merges; warned about below

# --- 4. Rename columns per the merge map and concatenate all rows ------------
merged_df = pd.DataFrame()
for file in csv_files:
    try:
        df = pd.read_csv(os.path.join(folder_path, file), on_bad_lines='skip')
        # Copy every matching variant column into its standard label.
        for standard_label, variants in merge_map.items():
            for variant in variants:
                if variant in df.columns:
                    df[standard_label] = df[variant]
        # Keep only the standard columns, in merge-map order so the output
        # column order is deterministic across runs (a set intersection here
        # would shuffle columns run to run).
        df = df[[label for label in merge_map if label in df.columns]]
        if not df.empty:
            merged_df = pd.concat([merged_df, df], ignore_index=True)
    except Exception as e:
        print(f"āŒ Error processing {file}: {e}")

# --- 5. Final clean-up --------------------------------------------------------
if not merged_df.empty:
    merged_df.drop_duplicates(inplace=True)
    merged_df.to_csv("merged_cleaned_dataset.csv", index=False)
    print("\nāœ… Merged and cleaned dataset saved as 'merged_cleaned_dataset.csv'")
else:
    print("āš ļø No data was merged. Check if the merge map matches the actual columns.")