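"""Merge per-query CSV downloads into a single cleaned dataset.

Reads every CSV under downloads/<query>/, drops exact duplicates, renames
each file's detected label column to "label", fuzzy-matches column names
across the files, concatenates the best-overlapping subset, and writes the
result to ./final/<query>.csv (falling back to the largest single file when
the merge looks worse).
"""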
import glob
import os
import re
from itertools import combinations

import pandas as pd
from rapidfuzz import process, fuzz

import getLabels

def get_fuzzy_common_columns(cols_list, threshold=75):
    """
    Given a list of sets of column names (normalized),
    return the set of column names that are 'fuzzy common'
    across all the sets.
    """
    # Start with columns from the first dataset
    base = cols_list[0]
    common = set()
    for col in base:
        match_all = True
        for other in cols_list[1:]:
            # extractOne returns None when `other` is empty
            result = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)
            if result is None or result[1] < threshold:
                match_all = False
                break
        if match_all:
            common.add(col)
    return common
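
# Example (hypothetical, already-normalized column sets):
#   get_fuzzy_common_columns([{"tweetid", "label"}, {"tweetid", "labels"}])
# keeps "tweetid" (exact match, score 100) and "label" ("labels" scores ~91
# under token_sort_ratio), so both survive the default threshold of 75.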

def sortFiles(dfs):
    """Drop exact-duplicate DataFrames, keeping the first occurrence of each."""
    unique_dfs = []
    seen = []
    for i, df1 in enumerate(dfs):
        duplicate = False
        for j in seen:
            df2 = dfs[j]
            # Cheap shape check before the full element-wise comparison
            if df1.shape != df2.shape:
                continue
            if df1.reset_index(drop=True).equals(df2.reset_index(drop=True)):
                duplicate = True
                break
        if not duplicate:
            unique_dfs.append(df1)
            seen.append(i)
    return unique_dfs
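
# Note: the pairwise comparison is O(n^2) in the number of files, which is
# typically fine for the small number of CSVs a single query downloads.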

def normalize(col):
    return re.sub(r'[^a-z0-9]', '', col.lower())
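
# e.g. normalize("Tweet ID") -> "tweetid"; normalize("label_1") -> "label1"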

def clean(query):
    os.makedirs("./final", exist_ok=True)
    csv_files = glob.glob("downloads/" + query + "/*.csv")
    if len(csv_files) < 1:
        print("No CSV file found!")
        return
    # Step 1: Read all CSVs, skipping unreadable files and dropping duplicates
    dfs = []
    skip = []
    for i, f in enumerate(csv_files):
        try:
            print(f"Reading {f}")
            df = pd.read_csv(f)
            dfs.append(df)
        except Exception as e:
            skip.append(i)
            print(f"Failed to read {f}: {e}")
    if not dfs:
        print("No readable CSV files!")
        return
    print(f"Read {len(dfs)} files")
    dfs = sortFiles(dfs)
    print(f"{len(dfs)} unique files after deduplication")
    labelList = getLabels.LabelsExtraction2(query, dfs, csv_files, skip)
    print(labelList)
    # Rename each file's detected label column to a standard "label"
    for i, df in enumerate(dfs):
        if labelList[i] in df.columns:
            df.rename(columns={labelList[i]: "label"}, inplace=True)
    # Step 2: Store normalized-to-original column mappings
    normalized_cols = []
    orig_col_maps = []  # kept for reference; not used below
    for df in dfs:
        norm_to_orig = {}
        norm_cols = []
        for col in df.columns:
            norm = normalize(col)
            norm_cols.append(norm)
            norm_to_orig[norm] = col
        normalized_cols.append(set(norm_cols))
        orig_col_maps.append(norm_to_orig)
    # Step 3: Find the combination of files with the most fuzzy-common columns
    # (exhaustive search over all subsets, so exponential in len(dfs))
    max_common = set()
    best_combo = []
    for i in range(2, len(dfs) + 1):
        for combo in combinations(range(len(dfs)), i):
            selected_cols = [normalized_cols[j] for j in combo]
            fuzzy_common = get_fuzzy_common_columns(selected_cols)
            # '>=' prefers larger combinations when the counts tie
            if len(fuzzy_common) >= len(max_common):
                max_common = fuzzy_common
                best_combo = combo
    # Step 4: Harmonize columns and subset each selected DataFrame
    aligned_dfs = []
    for idx in best_combo:
        df = dfs[idx]
        original_cols = list(df.columns)
        new_columns = {}
        for std_col in max_common:
            # Match this standard column to the most similar original column
            match, score, _ = process.extractOne(
                std_col,
                [normalize(col) for col in original_cols],
                scorer=fuzz.token_sort_ratio,
            )
            # Find the original column behind the matched normalized name
            for col in original_cols:
                if normalize(col) == match:
                    new_columns[col] = std_col  # map original -> standard
                    break
        # Subset and rename
        df_subset = df[list(new_columns.keys())].copy()
        df_subset.rename(columns=new_columns, inplace=True)
        aligned_dfs.append(df_subset)
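    # Note: the fuzzy re-match can map two standard columns onto the same
    # original column, in which case the later match overwrites the earlier
    # one in new_columns and the subset ends up narrower than max_common.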
    # Step 5: Combine (guard: with fewer than two unique files nothing merges)
    combined_df = pd.concat(aligned_dfs, ignore_index=True) if aligned_dfs else pd.DataFrame()
    print(f"Best combination: {best_combo}")
    # Fall back to the largest single file when the merge looks worse than it
    maxCount = 0
    idx = -1
    for i in range(len(dfs)):
        if dfs[i].index.size > maxCount:
            maxCount = dfs[i].index.size
            idx = i
    flag = len(aligned_dfs) == 0  # nothing merged: always fall back
    if maxCount > combined_df.index.size and len(dfs[idx].columns) > 2:
        flag = True
    elif combined_df.index.size > maxCount and (len(dfs[idx].columns) - len(combined_df.columns)) > 3 and len(dfs[idx].columns) < 7:
        flag = True
    if flag:
        dfs[idx].to_csv("./final/" + query + ".csv", index=False)
        print("The merged file was not up to the mark, so saved a single file... " + str(idx))
    else:
        combined_df.to_csv("./final/" + query + ".csv", index=False)
        print("Saved merged file...")
clean("twitter sentiment analysis") |