File size: 4,384 Bytes
825e978
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# import os
# import pandas as pd
# from openai import OpenAI
# from dotenv import load_dotenv

# # Load environment variables
# load_dotenv()

# # Setup OpenAI client
# client = OpenAI(
#     api_key=os.getenv("OPENAI_API_KEY"),
#     base_url=os.getenv("OPENAI_API_BASE", "")
# )

# # Set the folder containing CSV files
# folder_path = 'house'
# csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

# # Loop through each CSV file and process
# for file in csv_files:
#     file_path = os.path.join(folder_path, file)
#     try:
#         df = pd.read_csv(file_path, nrows=1)  # Read just the header row
#         columns = df.columns.tolist()
#         columns_str = ", ".join(columns)

#         # Formulate prompt
#         prompt = (
#             f"The following are column labels from a dataset: {columns_str}.\n"
#             "Among all the labels, return a list of labels which can be clubbed and merged into 1 label. Note that the labels which can be merged must belong to different datasets; do not merge labels of the same dataset"
#         )

#         # Ask OpenAI for insight
#         response = client.chat.completions.create(
#             model="gpt-4",
#             messages=[
#                 {"role": "system", "content": "You are a helpful data analyst."},
#                 {"role": "user", "content": prompt}
#             ],
#             temperature=0.3
#         )

#         print(f"\nπŸ” File: {file}")
#         print("πŸ“‹ Columns:", columns_str)
#         print("πŸ’‘ AI Insight:", response.choices[0].message.content)

#     except Exception as e:
#         print(f"❌ Error processing {file}: {e}")

# Environment and client setup.  load_dotenv() must run before the
# os.getenv() calls below so values from a local .env file are visible.
import os
import json
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_API_BASE", ""),
)

# Every *.csv file directly inside the downloads/house folder.
folder_path = 'downloads/house'
csv_files = [name for name in os.listdir(folder_path) if name.endswith(".csv")]

# Gather the header row of every CSV, keyed by filename.
all_columns = {}
for file in csv_files:
    csv_path = os.path.join(folder_path, file)
    try:
        header_df = pd.read_csv(csv_path, nrows=1)
    except Exception as e:
        print(f"❌ Could not read {file}: {e}")
    else:
        all_columns[file] = header_df.columns.tolist()

# One "filename: col1, col2, ..." line per dataset.
flattened_cols = [f"{file}: {', '.join(cols)}" for file, cols in all_columns.items()]

# BUG FIX: interpolating the list directly (f"{flattened_cols}") embedded its
# Python repr -- brackets, quotes, escaped newlines -- in the prompt.  Join
# the per-dataset lines with real newlines instead.
prompt = (
    "The following are column headers from multiple CSV datasets:\n\n"
    + "\n".join(flattened_cols)
    + "\n\nIdentify which labels across different datasets can be considered equivalent and merged. "
      "Return only a valid JSON dictionary where keys are standard labels and values are lists of corresponding labels to rename. No explanation."
)

# Ask GPT-4 to propose a cross-dataset column-merge mapping.
chat_messages = [
    {"role": "system", "content": "You are a helpful data analyst."},
    {"role": "user", "content": prompt},
]
response = client.chat.completions.create(
    model="gpt-4",
    messages=chat_messages,
    temperature=0.3,
)

# Extract the JSON dictionary from the model's reply.  The model sometimes
# wraps the JSON in prose or code fences, so slice from the first "{" to the
# last "}" before parsing.  On any parse failure, fall back to an empty map
# so the merge step below degrades gracefully instead of crashing.
merge_map_text = response.choices[0].message.content.strip()
try:
    start = merge_map_text.find("{")
    end = merge_map_text.rfind("}") + 1
    if start == -1 or end == 0:
        # Previously a missing brace produced a garbage slice and only
        # failed later inside json.loads; fail explicitly instead.
        raise ValueError("no JSON object found in model reply")
    merge_map = json.loads(merge_map_text[start:end])
    if not isinstance(merge_map, dict):
        # A JSON list/str would crash later at merge_map.items().
        raise ValueError("model reply parsed but is not a JSON dictionary")
    print("\n🧠 Parsed Merge Map:")
    print(json.dumps(merge_map, indent=2))
except ValueError as e:
    # json.JSONDecodeError subclasses ValueError, so this single narrow
    # handler covers both the explicit checks above and malformed JSON,
    # without hiding unrelated programming errors like the old
    # `except Exception` did.
    print("❌ Could not parse merge map from GPT:", e)
    merge_map = {}

# Merge DataFrames: normalize each file's columns to the standard labels,
# then combine everything into a single frame.
frames = []

for file in csv_files:
    try:
        df = pd.read_csv(os.path.join(folder_path, file), on_bad_lines='skip')

        # Copy each recognized variant column into its standard label.
        # When several variants of one label exist in the same file, the
        # last variant listed wins (same iteration order as before).
        for standard_label, variants in merge_map.items():
            for variant in variants:
                if variant in df.columns:
                    df[standard_label] = df[variant]

        # Retain only the standard columns, in merge_map's key order.
        # (The previous set intersection made the column order -- and thus
        # the output CSV -- nondeterministic across runs.)
        keep = [label for label in merge_map if label in df.columns]
        df = df[keep]

        if not df.empty:
            frames.append(df)

    except Exception as e:
        print(f"❌ Error processing {file}: {e}")

# Concatenate once at the end: repeated pd.concat inside the loop re-copied
# the accumulated frame on every iteration (quadratic in total rows).
merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Write the deduplicated result, or warn when nothing matched the merge map.
if merged_df.empty:
    print("⚠️ No data was merged. Check if the merge map matches the actual columns.")
else:
    merged_df = merged_df.drop_duplicates()
    merged_df.to_csv("merged_cleaned_dataset.csv", index=False)
    print("\nβœ… Merged and cleaned dataset saved as 'merged_cleaned_dataset.csv'")