import glob
import os
import re
from itertools import combinations

import pandas as pd
from rapidfuzz import process, fuzz

import getLabels


def get_fuzzy_common_columns(cols_list, threshold=75):
    """
    Given a list of sets of (normalized) column names, return the names from
    the first set that have a fuzzy match scoring at least `threshold` in
    every other set.
    """
    # Columns from the first dataset act as the candidates.
    base = cols_list[0]
    common = set()
    for col in base:
        match_all = True
        for other in cols_list[1:]:
            # extractOne returns (match, score, index), or None if `other` is empty.
            result = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)
            if result is None or result[1] < threshold:
                match_all = False
                break
        if match_all:
            common.add(col)
    return common


def sortFiles(dfs):
    """Drop exact-duplicate DataFrames (same shape, same values), keeping first occurrences."""
    unique_dfs = []
    seen = []
    for i, df1 in enumerate(dfs):
        duplicate = False
        for j in seen:
            df2 = dfs[j]
            # Cheap shape check before the full value comparison.
            if df1.shape != df2.shape:
                continue
            if df1.reset_index(drop=True).equals(df2.reset_index(drop=True)):
                duplicate = True
                break
        if not duplicate:
            unique_dfs.append(df1)
            seen.append(i)
    return unique_dfs


def normalize(col):
    """Lowercase a column name and strip every non-alphanumeric character."""
    return re.sub(r'[^a-z0-9]', '', col.lower())
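
# Illustrative example (hypothetical, already-normalized column names):
# "sentiment" matches itself exactly, and token_sort_ratio("userlocation",
# "location") scores 80, so both candidates clear the default threshold of 75:
#
#   get_fuzzy_common_columns([{"sentiment", "userlocation"},
#                             {"sentiment", "location"}])
#   # -> {"sentiment", "userlocation"} (set order may vary)
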
def clean(query):
    os.makedirs("./final", exist_ok=True)
    csv_files = glob.glob("downloads/" + query + "/*.csv")
    if len(csv_files) < 1:
        print("No CSV files found!")
        return

    # Step 1: Read every CSV, remembering the indices of unreadable files.
    dfs = []
    skip = []
    for i, f in enumerate(csv_files):
        try:
            print(f"Reading {f}")
            df = pd.read_csv(f)
            dfs.append(df)
        except Exception as e:
            skip.append(i)
            print(f"Failed to read {f}: {e}")

    print(f"{len(dfs)} files read")
    dfs = sortFiles(dfs)
    print(f"{len(dfs)} unique files after deduplication")
    if not dfs:
        print("No readable CSV files!")
        return

    # Rename each dataset's detected label column to a common "label" name.
    labelList = getLabels.LabelsExtraction2(query, dfs, csv_files, skip)
    print(labelList)
    for i, df in enumerate(dfs):
        if labelList[i] in df.columns:
            df.rename(columns={labelList[i]: "label"}, inplace=True)

    if len(dfs) == 1:
        # pd.concat below needs at least two frames; save the lone dataset as-is.
        dfs[0].to_csv("./final/" + query + ".csv", index=False)
        print("Only one unique file, saved it directly...")
        return

    # Step 2: Store normalized-to-original column mappings.
    normalized_cols = []
    orig_col_maps = []  # currently unused below, kept for reference
    for df in dfs:
        norm_to_orig = {}
        norm_cols = []
        for col in df.columns:
            norm = normalize(col)
            norm_cols.append(norm)
            norm_to_orig[norm] = col
        normalized_cols.append(set(norm_cols))
        orig_col_maps.append(norm_to_orig)

    # Step 3: Exhaustively search every subset of datasets (sizes 2..n) for the
    # one whose fuzzy-common column set is largest; `>=` means ties go to the
    # later (typically larger) combination.
    max_common = set()
    best_combo = []
    for i in range(2, len(dfs) + 1):
        for combo in combinations(range(len(dfs)), i):
            selected_cols = [normalized_cols[j] for j in combo]
            fuzzy_common = get_fuzzy_common_columns(selected_cols)
            if len(fuzzy_common) >= len(max_common):
                max_common = fuzzy_common
                best_combo = combo

    # Step 4: Harmonize columns and subset each chosen DataFrame.
    aligned_dfs = []
    for idx in best_combo:
        df = dfs[idx]
        original_cols = list(df.columns)
        new_columns = {}
        for std_col in max_common:
            # Match this standard column to the most similar original column.
            match, score, _ = process.extractOne(
                std_col,
                [normalize(col) for col in original_cols],
                scorer=fuzz.token_sort_ratio,
            )
            # Recover the original column behind the matched normalized name.
            for col in original_cols:
                if normalize(col) == match:
                    new_columns[col] = std_col  # map original -> standard
                    break
        # Subset to the matched columns and rename them to the standard names.
        df_subset = df[list(new_columns.keys())].copy()
        df_subset.rename(columns=new_columns, inplace=True)
        aligned_dfs.append(df_subset)

    # Step 5: Combine the aligned frames.
    combined_df = pd.concat(aligned_dfs, ignore_index=True)
    print(best_combo)

    # Fallback heuristic: prefer the row-richest single dataset when it has
    # more rows than the merge (and is not trivially narrow), or when the merge
    # wins on rows but dropped more than three of that dataset's columns (and
    # that dataset has fewer than seven columns).
    maxCount = 0
    idx = -1
    for i in range(len(dfs)):
        if dfs[i].index.size > maxCount:
            maxCount = dfs[i].index.size
            idx = i
    flag = False
    if maxCount > combined_df.index.size and len(dfs[idx].columns) > 2:
        flag = True
    elif (combined_df.index.size > maxCount
          and (len(dfs[idx].columns) - len(combined_df.columns)) > 3
          and len(dfs[idx].columns) < 7):
        flag = True

    if flag:
        dfs[idx].to_csv("./final/" + query + ".csv", index=False)
        print("The merged file was not up to the mark, so saved a single file..." + str(idx))
    else:
        combined_df.to_csv("./final/" + query + ".csv", index=False)
        print("Saved merged file...")


if __name__ == "__main__":
    clean("twitter sentiment analysis")
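
# Expected layout (paths inferred from the code above): input CSVs live in
# downloads/<query>/, e.g. downloads/twitter sentiment analysis/, and the
# cleaned output is written to final/<query>.csv. Running the script, e.g.
#
#   $ python clean.py        # script filename assumed
#
# prints progress ("Reading ...", the label list, the chosen combination) and
# ends with either "Saved merged file..." or the single-file fallback message.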