import pandas as pd
import glob
import re
from itertools import combinations
import os
from rapidfuzz import process, fuzz
import getLabels
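
# Note: getLabels is a project-local helper module (not a published package).
# From the call in clean() below, it is assumed to expose
# LabelsExtraction2(query, dfs, csv_files, skip) and to return one candidate
# label-column name per dataframe.
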
def get_fuzzy_common_columns(cols_list, threshold=75):
    """
    Given a list of sets of normalized column names, return the subset of the
    first set whose members have a fuzzy match (token_sort_ratio >= threshold)
    in every other set.
    """
    base = cols_list[0]
    common = set()

    for col in base:
        match_all = True
        for other in cols_list[1:]:
            # extractOne returns (best_match, score, index); only the score matters here.
            _, score, _ = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)
            if score < threshold:
                match_all = False
                break
        if match_all:
            common.add(col)
    return common
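
# Illustrative call with hypothetical column names (not from the project's
# data), using the default threshold of 75:
#   get_fuzzy_common_columns([{"tweetid", "sentiment"},
#                             {"tweetid", "sentimentlabel"}])
# returns {"tweetid", "sentiment"}, since "sentiment" scores about 78 against
# "sentimentlabel" under token_sort_ratio.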


def sortFiles(dfs):
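    """Drop exact-duplicate dataframes, keeping the first occurrence of each.

    (Despite the name, this deduplicates rather than sorts.)
    """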
    unique_dfs = []
    seen = []

    for i, df1 in enumerate(dfs):
        duplicate = False
        for j in seen:
            df2 = dfs[j]

            # Cheap shape check before the element-wise comparison.
            if df1.shape != df2.shape:
                continue

            # Compare values irrespective of the original row indices.
            if df1.reset_index(drop=True).equals(df2.reset_index(drop=True)):
                duplicate = True
                break

        if not duplicate:
            unique_dfs.append(df1)
            seen.append(i)

    return unique_dfs


def normalize(col):
    """Lowercase a column name and strip every non-alphanumeric character."""
    # str() guards against non-string column labels (e.g. numeric headers).
    return re.sub(r'[^a-z0-9]', '', str(col).lower())


def clean(query):
    os.makedirs("./final", exist_ok=True)

    csv_files = glob.glob("downloads/" + query + "/*.csv")
    if len(csv_files) < 1:
        print("No CSV files found!")
        exit(0)

    dfs = []
    skip = []
    for i, f in enumerate(csv_files):
        try:
            print(f"Reading {f}")
            df = pd.read_csv(f)
            dfs.append(df)
        except Exception as e:
            # Remember which files failed so downstream steps can skip them.
            skip.append(i)
            print(f"Failed to read {f}: {e}")

    if not dfs:
        print("No readable CSV files!")
        exit(0)

    print(f"Read {len(dfs)} CSV files")
    dfs = sortFiles(dfs)
    print(f"{len(dfs)} left after dropping duplicates")
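
    # Ask the label-extraction helper which column in each dataframe holds the
    # class label, then standardize that column's name to "label".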
    labelList = getLabels.LabelsExtraction2(query, dfs, csv_files, skip)
    print(labelList)
    for i, df in enumerate(dfs):
        if labelList[i] in df.columns:
            df.rename(columns={labelList[i]: "label"}, inplace=True)

    # For each dataframe, build the set of normalized column names plus a map
    # back to the original spellings.
    normalized_cols = []
    orig_col_maps = []

    for df in dfs:
        norm_to_orig = {}
        norm_cols = []
        for col in df.columns:
            norm = normalize(col)
            norm_cols.append(norm)
            norm_to_orig[norm] = col
        normalized_cols.append(set(norm_cols))
        orig_col_maps.append(norm_to_orig)

    max_common = set()
    best_combo = []
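
    # Exhaustively try every subset of dataframes (sizes 2..n) and keep the one
    # with the largest fuzzy-common column set. This is exponential in the
    # number of files, which is fine only for a handful of downloads.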
    for i in range(2, len(dfs) + 1):
        for combo in combinations(range(len(dfs)), i):
            selected_cols = [normalized_cols[j] for j in combo]
            fuzzy_common = get_fuzzy_common_columns(selected_cols)
            # ">=" (rather than ">") lets later, larger combinations win ties.
            if len(fuzzy_common) >= len(max_common):
                max_common = fuzzy_common
                best_combo = combo

    # Align every dataframe in the winning combination to the shared
    # (normalized) column names, then stack them.
    aligned_dfs = []

    for idx in best_combo:
        df = dfs[idx]
        original_cols = list(df.columns)
        new_columns = {}

        for std_col in max_common:
            # Best fuzzy match for this standard column among the normalized names.
            match, _, _ = process.extractOne(
                std_col,
                [normalize(col) for col in original_cols],
                scorer=fuzz.token_sort_ratio,
            )

            # Recover the original spelling of the matched column.
            for col in original_cols:
                if normalize(col) == match:
                    new_columns[col] = std_col
                    break

        df_subset = df[list(new_columns.keys())].copy()
        df_subset.rename(columns=new_columns, inplace=True)
        aligned_dfs.append(df_subset)

    # pd.concat raises on an empty list, so guard the no-merge case.
    if aligned_dfs:
        combined_df = pd.concat(aligned_dfs, ignore_index=True)
    else:
        combined_df = pd.DataFrame()
    print(best_combo)
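
    # Heuristic: the merge can come out worse than the single largest download
    # (fewer rows, or far fewer columns). Find the largest individual dataframe
    # and keep whichever of the two looks better.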
    maxCount = 0
    idx = -1
    for i in range(len(dfs)):
        if dfs[i].index.size > maxCount:
            maxCount = dfs[i].index.size
            idx = i

    # An empty merge always falls back to the single file.
    flag = combined_df.empty
    if maxCount > combined_df.index.size and len(dfs[idx].columns) > 2:
        flag = True
    elif (combined_df.index.size > maxCount
          and (len(dfs[idx].columns) - len(combined_df.columns)) > 3
          and len(dfs[idx].columns) < 7):
        flag = True

    if flag:
        dfs[idx].to_csv("./final/" + query + ".csv", index=False)
        print("The merged file was not up to the mark, so saved a single file... " + str(idx))
    else:
        combined_df.to_csv("./final/" + query + ".csv", index=False)
        print("Saved merged file...")


if __name__ == "__main__":
    clean("twitter sentiment analysis")