Spaces:

vansh9878
/

Dataset

Runtime error

File size: 13,482 Bytes

825e978

1. seprate attributes and data
2. remove the datatypes from the attributes
C:\Users\Niall Dcunha\DatasetCreator\house price prediction\21754539_dataset

# import os
# import glob
# import pandas as pd
# import openai
# from openai import OpenAI
# from dotenv import load_dotenv
# import ast
# import re

# def extract_dict_from_response(response: str) -> dict:
#     # Try extracting code block content containing the dictionary
#     match = re.search(r"```(?:python)?\s*(\{.*?\})\s*```", response, re.DOTALL)
#     if match:
#         mapping_str = match.group(1)
#     else:
#         # Try extracting dictionary directly if it's not in code block
#         match = re.search(r"(\{.*\})", response, re.DOTALL)
#         if not match:
#             raise ValueError("❌ Could not find a Python dictionary in the response.")
#         mapping_str = match.group(1)
    
#     try:
#         return ast.literal_eval(mapping_str)
#     except Exception as e:
#         print("⚠️ Failed to evaluate extracted dictionary string.")
#         print("String:", mapping_str)
#         raise e

# # Load environment variables
# load_dotenv()
# client = OpenAI(
#     api_key=os.getenv("OPENAI_API_KEY"),
#     base_url=os.getenv("OPENAI_API_BASE")  # Optional: for Azure or self-hosted
# )

# def load_csv_files(folder_path):
#     csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
#     dataframes = []
#     column_sets = []
#     valid_paths = []

#     print("📥 Reading CSV files...")

#     for file in csv_files:
#         try:
#             df = pd.read_csv(file)
#             dataframes.append(df)
#             column_sets.append(list(df.columns))
#             valid_paths.append(file)
#             print(f"✅ Loaded: {os.path.basename(file)}")
#         except pd.errors.ParserError as e:
#             print(f"❌ Skipping file due to parsing error: {os.path.basename(file)}")
#             print(f"   ↳ {e}")
#         except Exception as e:
#             print(f"⚠️ Unexpected error with file {os.path.basename(file)}: {e}")

#     return dataframes, column_sets, valid_paths

# def generate_mapping_prompt(column_sets):
#     prompt = (
#         "You are a data scientist helping to merge multiple ML prediction datasets. "
#         "Each CSV may have different or similar column names. I need a unified mapping to standardize these datasets. "
#         "Also, please identify likely prediction label columns (e.g., price, quality, outcome).\n\n"
#         "Here are the column headers from each CSV:\n"
#     )
#     for i, columns in enumerate(column_sets):
#         prompt += f"CSV {i+1}: {columns}\n"
#     prompt += (
#         "\nPlease provide:\n"
#         "1. A Python dictionary mapping similar columns across these CSVs.\n"
#         "2. A list of columns most likely to represent prediction labels.\n\n"
#         "Format your response as:\n"
#         "```python\n"
#         "column_mapping = { ... }\n"
#         "label_columns = [ ... ]\n"
#         "```"
#     )
#     return prompt

# def get_column_mapping_from_openai(column_sets):
#     prompt = generate_mapping_prompt(column_sets)

#     response = client.chat.completions.create(
#         model="gpt-4",
#         messages=[
#             {"role": "system", "content": "You are a helpful data scientist."},
#             {"role": "user", "content": prompt}
#         ],
#         temperature=0.3
#     )

#     content = response.choices[0].message.content
#     print("\n📩 Received response from OpenAI.")

#     try:
#         # Try parsing both dictionary and label list from the response
#         column_mapping_match = re.search(r"column_mapping\s*=\s*(\{.*?\})", content, re.DOTALL)
#         label_columns_match = re.search(r"label_columns\s*=\s*(\[.*?\])", content, re.DOTALL)

#         if column_mapping_match:
#             mapping = ast.literal_eval(column_mapping_match.group(1))
#         else:
#             raise ValueError("❌ Could not find `column_mapping` in the response.")

#         if label_columns_match:
#             label_columns = ast.literal_eval(label_columns_match.group(1))
#         else:
#             label_columns = []

#     except Exception as e:
#         print("⚠️ Error parsing OpenAI response:")
#         print(content)
#         raise e

#     return mapping, label_columns

# def standardize_columns(df, mapping):
#     new_columns = {col: mapping.get(col, col) for col in df.columns}
#     return df.rename(columns=new_columns)

# def merge_csvs(folder_path, output_file="merged_dataset.csv"):
#     dfs, column_sets, csv_paths = load_csv_files(folder_path)

#     if not dfs:
#         print("❌ No valid CSVs found to merge.")
#         return

#     print("\n🧠 Requesting column mapping from OpenAI...")
#     mapping, label_columns = get_column_mapping_from_openai(column_sets)

#     print("\n📌 Column Mapping:")
#     for k, v in mapping.items():
#         print(f"  '{k}' -> '{v}'")

#     print("\n🏷️  Suggested Label Columns:")
#     for label in label_columns:
#         print(f"  - {label}")

#     standardized_dfs = [standardize_columns(df, mapping) for df in dfs]
#     merged_df = pd.concat(standardized_dfs, ignore_index=True, sort=False)

#     merged_df.to_csv(output_file, index=False)
#     print(f"\n✅ Merged dataset saved as '{output_file}'")

# if __name__ == "__main__":
#     folder_path = "house"


import os
import glob
import pandas as pd
import ast
import re
from itertools import combinations
from rapidfuzz import fuzz, process
from dotenv import load_dotenv
from openai import OpenAI

# Manual rename map to standardize some known variations
manual_rename_map = {
    "review": "text",
    "text": "text",
    "NumBedrooms": "bedrooms",
    "HousePrice": "price",
    "TARGET(PRICE_IN_LACS)": "price",
    "SquareFootage": "area",
    "SQUARE_FT": "area",
    "sentiment": "label",
    "target": "label",
    "type": "label",
    "variety": "label",
    "class": "label",
    "HeartDisease": "label",
    "Heart Attack Risk (Binary)": "label",
    "Heart Attack Risk": "label"
}


def normalize(col):
    return re.sub(r'[^a-z0-9]', '', col.lower())

def apply_manual_renaming(df, rename_map):
    renamed = {}
    for col in df.columns:
        if col in rename_map:
            renamed[col] = rename_map[col]
    return df.rename(columns=renamed)

def get_fuzzy_common_columns(cols_list, threshold=75):
    base = cols_list[0]
    common = set()
    for col in base:
        match_all = True
        for other in cols_list[1:]:
            match, score, _ = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)
            if score < threshold:
                match_all = False
                break
        if match_all:
            common.add(col)
    return common

def sortFiles(dfs):
    unique_dfs = []
    seen = []
    for i, df1 in enumerate(dfs):
        duplicate = False
        for j in seen:
            df2 = dfs[j]
            if df1.shape != df2.shape:
                continue
            if df1.reset_index(drop=True).equals(df2.reset_index(drop=True)):
                duplicate = True
                break
        if not duplicate:
            unique_dfs.append(df1)
            seen.append(i)
    return unique_dfs

def load_csv_files(folder_path):
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    dfs = []
    column_sets = []
    paths = []

    for file in csv_files:
        try:
            df = pd.read_csv(file)
            dfs.append(df)
            column_sets.append(list(df.columns))
            paths.append(file)
            print(f"✅ Loaded: {os.path.basename(file)}")
        except Exception as e:
            print(f"❌ Failed to load {file}: {e}")
    return dfs, column_sets, paths

def generate_mapping_prompt(column_sets):
    prompt = (
    "You are a data scientist helping to merge multiple machine learning prediction datasets. "
    "Each CSV file may have different column names, even if they represent similar types of data. "
    "Your task is to identify and map these similar columns across datasets to a common, unified name. "
    "Columns with clearly similar features (e.g., 'Bedrooms' and 'BedroomsAbvGr') should be merged into one column with a relevant name like 'bedrooms'.\n\n"
    "Avoid keeping redundant or unique columns that do not have any logical counterpart in other datasets unless they are essential. "
    "The goal is not to maximize the number of columns or rows, but to create a clean, consistent dataset for training ML models.\n\n"
    "Examples:\n"
    "- Dataset1: 'Locality' -> Mumbai, Delhi\n"
    "- Dataset2: 'Places' -> Goa, Singapore\n"
    "→ Merge both into a common column like 'location'.\n\n"
    "Please also identify likely label or target columns that are typically used for prediction (e.g., price, sentiment, outcome, quality).\n\n"
)

    for i, cols in enumerate(column_sets):
        prompt += f"CSV {i+1}: {cols}\n"
    prompt += "\nPlease return:\n```python\ncolumn_mapping = { ... }\nlabel_columns = [ ... ]\n```"
    return prompt

def get_column_mapping_from_openai(column_sets):
    load_dotenv()
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        base_url=os.getenv("OPENAI_API_BASE", "")
    )

    prompt = generate_mapping_prompt(column_sets)

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful data scientist."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3
    )

    content = response.choices[0].message.content

    try:
        column_mapping_match = re.search(r"column_mapping\s*=\s*(\{.*?\})", content, re.DOTALL)
        label_columns_match = re.search(r"label_columns\s*=\s*(\[.*?\])", content, re.DOTALL)
        column_mapping = ast.literal_eval(column_mapping_match.group(1)) if column_mapping_match else {}
        label_columns = ast.literal_eval(label_columns_match.group(1)) if label_columns_match else []
    except Exception as e:
        print("⚠️ Error parsing OpenAI response:")
        print(content)
        raise e

    return column_mapping, label_columns

def clean_and_merge(folder, query=None, use_ai=True):
    os.makedirs("./final", exist_ok=True)
    dfs, column_sets, csv_paths = load_csv_files(folder)

    if not dfs:
        print("No valid CSVs found.")
        return

    dfs = sortFiles(dfs)
    dfs = [apply_manual_renaming(df, manual_rename_map) for df in dfs]

    if use_ai:
        try:
            column_mapping, label_columns = get_column_mapping_from_openai(column_sets)
            dfs = [df.rename(columns={col: column_mapping.get(col, col) for col in df.columns}) for df in dfs]
        except Exception as e:
            print("Falling back to fuzzy matching due to OpenAI error:", e)
            use_ai = False

    if not use_ai:
        # Normalize columns for fuzzy match fallback
        normalized_cols = []
        for df in dfs:
            normalized_cols.append({normalize(col) for col in df.columns})

        # Get best combination with fuzzy common columns
        max_common = set()
        best_combo = []
        for i in range(2, len(dfs)+1):
            for combo in combinations(range(len(dfs)), i):
                selected = [normalized_cols[j] for j in combo]
                fuzzy_common = get_fuzzy_common_columns(selected)
                if len(fuzzy_common) >= len(max_common):
                    max_common = fuzzy_common
                    best_combo = combo

        # Harmonize and align
        aligned_dfs = []
        for idx in best_combo:
            df = dfs[idx]
            col_map = {}
            for std_col in max_common:
                match, _, _ = process.extractOne(std_col, [normalize(col) for col in df.columns])
                for col in df.columns:
                    if normalize(col) == match:
                        col_map[col] = std_col
                        break
            df_subset = df[list(col_map.keys())].rename(columns=col_map)
            aligned_dfs.append(df_subset)

        combined_df = pd.concat(aligned_dfs, ignore_index=True)
    else:
        combined_df = pd.concat(dfs, ignore_index=True)

    # Label assignment fallback
    for i, df in enumerate(dfs):
        if 'label' not in df.columns:
            name = os.path.basename(csv_paths[i]).split(".")[0].lower()
            name_cleaned = name
            if query:
                words = set(re.sub(r'[^a-z]', ' ', query.lower()).split())
                for word in words:
                    name_cleaned = name_cleaned.replace(word, "")
            df['label'] = name_cleaned

    # Decide best final file
    largest_df = max(dfs, key=lambda df: len(df))
    flag = False

    if len(largest_df) > len(combined_df) and len(largest_df.columns) > 2:
        flag = True
    elif len(combined_df) > len(largest_df) and (len(largest_df.columns) - len(combined_df.columns)) > 3 and len(largest_df.columns) < 7:
        flag = True

    output_file = f"./final/{query or os.path.basename(folder)}.csv"
    if flag:
        largest_df.to_csv(output_file, index=False)
        print(f"⚠️ Saved fallback single file due to poor merge: {output_file}")
    else:
        combined_df.to_csv(output_file, index=False)
        print(f"✅ Saved merged file: {output_file}")

# Example usage:
clean_and_merge("house", query="house", use_ai=True)
#     merge_csvs(folder_path)