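"""Merge per-query CSV downloads into a single cleaned dataset.

Reads every CSV under downloads/<query>/, drops exact duplicates, renames
each file's detected label column to "label", fuzzy-matches column names
across the files, concatenates the best-overlapping subset, and writes the
result to ./final/<query>.csv (falling back to the largest single file when
the merge looks worse).
"""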
import glob
import os
import re
from itertools import combinations

import pandas as pd
from rapidfuzz import process, fuzz

import getLabels

def get_fuzzy_common_columns(cols_list, threshold=75):
    """
    Given a list of sets of column names (normalized),
    return the set of column names that are 'fuzzy common'
    across all the sets.
    """
    # Start with columns from the first dataset
    base = cols_list[0]
    common = set()
    for col in base:
        match_all = True
        for other in cols_list[1:]:
            # extractOne returns None when `other` is empty
            result = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)
            if result is None or result[1] < threshold:
                match_all = False
                break
        if match_all:
            common.add(col)
    return common
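
# Example (hypothetical, already-normalized column sets):
#   get_fuzzy_common_columns([{"tweetid", "label"}, {"tweetid", "labels"}])
# keeps "tweetid" (exact match, score 100) and "label" ("labels" scores ~91
# under token_sort_ratio), so both survive the default threshold of 75.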

def sortFiles(dfs):
    """Drop exact-duplicate DataFrames, keeping the first occurrence of each."""
    unique_dfs = []
    seen = []
    for i, df1 in enumerate(dfs):
        duplicate = False
        for j in seen:
            df2 = dfs[j]
            # Cheap shape check before the full element-wise comparison
            if df1.shape != df2.shape:
                continue
            if df1.reset_index(drop=True).equals(df2.reset_index(drop=True)):
                duplicate = True
                break
        if not duplicate:
            unique_dfs.append(df1)
            seen.append(i)
    return unique_dfs
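
# Note: the pairwise comparison is O(n^2) in the number of files, which is
# typically fine for the small number of CSVs a single query downloads.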

def normalize(col):
    return re.sub(r'[^a-z0-9]', '', col.lower())
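
# e.g. normalize("Tweet ID") -> "tweetid"; normalize("label_1") -> "label1"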

def clean(query):
    os.makedirs("./final", exist_ok=True)
    csv_files = glob.glob("downloads/" + query + "/*.csv")
    if len(csv_files) < 1:
        print("No CSV file found!")
        return
    # Step 1: Read all CSVs, skipping unreadable files and dropping duplicates
    dfs = []
    skip = []
    for i, f in enumerate(csv_files):
        try:
            print(f"Reading {f}")
            df = pd.read_csv(f)
            dfs.append(df)
        except Exception as e:
            skip.append(i)
            print(f"Failed to read {f}: {e}")
    if not dfs:
        print("No readable CSV files!")
        return
    print(f"Read {len(dfs)} files")
    dfs = sortFiles(dfs)
    print(f"{len(dfs)} unique files after deduplication")
    labelList = getLabels.LabelsExtraction2(query, dfs, csv_files, skip)
    print(labelList)
    # Rename each file's detected label column to a standard "label"
    for i, df in enumerate(dfs):
        if labelList[i] in df.columns:
            df.rename(columns={labelList[i]: "label"}, inplace=True)
    # Step 2: Store normalized-to-original column mappings
    normalized_cols = []
    orig_col_maps = []  # kept for reference; not used below
    for df in dfs:
        norm_to_orig = {}
        norm_cols = []
        for col in df.columns:
            norm = normalize(col)
            norm_cols.append(norm)
            norm_to_orig[norm] = col
        normalized_cols.append(set(norm_cols))
        orig_col_maps.append(norm_to_orig)
    # Step 3: Find the combination of files with the most fuzzy-common columns
    # (exhaustive search over all subsets, so exponential in len(dfs))
    max_common = set()
    best_combo = []
    for i in range(2, len(dfs) + 1):
        for combo in combinations(range(len(dfs)), i):
            selected_cols = [normalized_cols[j] for j in combo]
            fuzzy_common = get_fuzzy_common_columns(selected_cols)
            # '>=' prefers larger combinations when the counts tie
            if len(fuzzy_common) >= len(max_common):
                max_common = fuzzy_common
                best_combo = combo
    # Step 4: Harmonize columns and subset each selected DataFrame
    aligned_dfs = []
    for idx in best_combo:
        df = dfs[idx]
        original_cols = list(df.columns)
        new_columns = {}
        for std_col in max_common:
            # Match this standard column to the most similar original column
            match, score, _ = process.extractOne(
                std_col,
                [normalize(col) for col in original_cols],
                scorer=fuzz.token_sort_ratio,
            )
            # Find the original column behind the matched normalized name
            for col in original_cols:
                if normalize(col) == match:
                    new_columns[col] = std_col  # map original -> standard
                    break
        # Subset and rename
        df_subset = df[list(new_columns.keys())].copy()
        df_subset.rename(columns=new_columns, inplace=True)
        aligned_dfs.append(df_subset)
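    # Note: the fuzzy re-match can map two standard columns onto the same
    # original column, in which case the later match overwrites the earlier
    # one in new_columns and the subset ends up narrower than max_common.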
    # Step 5: Combine (guard: with fewer than two unique files nothing merges)
    combined_df = pd.concat(aligned_dfs, ignore_index=True) if aligned_dfs else pd.DataFrame()
    print(f"Best combination: {best_combo}")
    # Fall back to the largest single file when the merge looks worse than it
    maxCount = 0
    idx = -1
    for i in range(len(dfs)):
        if dfs[i].index.size > maxCount:
            maxCount = dfs[i].index.size
            idx = i
    flag = len(aligned_dfs) == 0  # nothing merged: always fall back
    if maxCount > combined_df.index.size and len(dfs[idx].columns) > 2:
        flag = True
    elif combined_df.index.size > maxCount and (len(dfs[idx].columns) - len(combined_df.columns)) > 3 and len(dfs[idx].columns) < 7:
        flag = True
    if flag:
        dfs[idx].to_csv("./final/" + query + ".csv", index=False)
        print("The merged file was not up to the mark, so saved a single file... " + str(idx))
    else:
        combined_df.to_csv("./final/" + query + ".csv", index=False)
        print("Saved merged file...")
clean("twitter sentiment analysis") |