# Dataset/cleanDataset.py
import glob
import os
import re
from itertools import combinations

import pandas as pd
from rapidfuzz import process, fuzz

import getLabels


def get_fuzzy_common_columns(cols_list, threshold=75):
    """
    Given a list of sets of normalized column names (one set per dataset),
    return the names from the first set that have a fuzzy match
    (token_sort_ratio >= threshold) in every other set.
    """
# Start with columns from the first dataset
base = cols_list[0]
common = set()
for col in base:
match_all = True
        for other in cols_list[1:]:
            # extractOne returns None when `other` is empty
            result = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)
            if result is None or result[1] < threshold:
                match_all = False
                break
if match_all:
common.add(col)
return common
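
# Quick sketch of the matching above (hypothetical, pre-normalized names):
#   get_fuzzy_common_columns([{"tweetid", "sentiment"},
#                             {"tweetid", "sentiment", "date"}])
#   -> {"tweetid", "sentiment"}
# Only names from the first set are candidates, so the extra "date" never
# appears in the result.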

def sortFiles(dfs):
    """
    Drop exact-duplicate DataFrames: same shape and identical contents after
    resetting the index. (Despite the name, nothing is sorted.)
    """
    unique_dfs = []
    seen = []
for i, df1 in enumerate(dfs):
duplicate = False
for j in seen:
df2 = dfs[j]
# Check if same shape
if df1.shape != df2.shape:
continue
if df1.reset_index(drop=True).equals(df2.reset_index(drop=True)):
duplicate = True
break
if not duplicate:
unique_dfs.append(df1)
seen.append(i)
return unique_dfs
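
# Dedup sketch (hypothetical frames):
#   a = pd.DataFrame({"x": [1, 2]})
#   sortFiles([a, a.copy(), pd.DataFrame({"x": [3, 4]})])
#   -> keeps a and the [3, 4] frame; the copy of `a` is dropped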

def normalize(col):
    """Lower-case a column name and strip everything except letters and digits."""
    return re.sub(r'[^a-z0-9]', '', col.lower())
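
# e.g. normalize("Tweet_ID") -> "tweetid"; normalize("Sentiment (label)") -> "sentimentlabel"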

def clean(query):
    os.makedirs("./final", exist_ok=True)
    csv_files = glob.glob("downloads/" + query + "/*.csv")
    if len(csv_files) < 1:
        print("No CSV files found!")
        return
    dfs = []
    skip = []
    for i, f in enumerate(csv_files):
        try:
            print(f"Reading {f}")
            df = pd.read_csv(f)
            dfs.append(df)
        except Exception as e:
            skip.append(i)
            print(f"Failed to read {f}: {e}")
    print(f"Read {len(dfs)} files")
    dfs = sortFiles(dfs)
    print(f"{len(dfs)} files left after dropping duplicates")
    # Ask the label-extraction helper which column holds the target label in
    # each DataFrame, then standardize that column's name to "label"
    labelList = getLabels.LabelsExtraction2(query, dfs, csv_files, skip)
    print(labelList)
    for i, df in enumerate(dfs):
        if labelList[i] in df.columns:
            df.rename(columns={labelList[i]: "label"}, inplace=True)
# Step 2: Store normalized-to-original column mappings
normalized_cols = []
orig_col_maps = []
for df in dfs:
norm_to_orig = {}
norm_cols = []
for col in df.columns:
norm = normalize(col)
norm_cols.append(norm)
norm_to_orig[norm] = col
normalized_cols.append(set(norm_cols))
orig_col_maps.append(norm_to_orig)
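
    # Shape of the bookkeeping built above (hypothetical columns):
    #   df.columns = ["Tweet_ID", "Sentiment"]
    #   -> norm_cols    = ["tweetid", "sentiment"]
    #      norm_to_orig = {"tweetid": "Tweet_ID", "sentiment": "Sentiment"}
    # If two originals normalize to the same key, the later one wins the map.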
    # Step 3: Search every subset of two or more DataFrames for the one
    # sharing the most fuzzy-common columns. This is exponential in the
    # number of files, which is fine for the handful of CSVs a query returns.
    max_common = set()
    best_combo = []
    for i in range(2, len(dfs) + 1):
        for combo in combinations(range(len(dfs)), i):
            selected_cols = [normalized_cols[j] for j in combo]
            fuzzy_common = get_fuzzy_common_columns(selected_cols)
            # ">=" breaks ties toward larger combos, which are generated later
            if len(fuzzy_common) >= len(max_common):
                max_common = fuzzy_common
                best_combo = combo
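
    # For scale (assuming ~10 downloaded CSVs): sum(C(10, k) for k in 2..10)
    # = 2**10 - 10 - 1 = 1013 candidate combinations, each scored via rapidfuzz.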
    # Step 4: Harmonize columns and subset each selected DataFrame
    aligned_dfs = []
    for idx in best_combo:
        df = dfs[idx]
        norm_to_orig = orig_col_maps[idx]
        new_columns = {}
        for std_col in max_common:
            # Match the standard column to the most similar normalized name
            # in this DataFrame, then map the original name -> standard name
            match, _, _ = process.extractOne(
                std_col, list(norm_to_orig.keys()), scorer=fuzz.token_sort_ratio
            )
            new_columns[norm_to_orig[match]] = std_col
        # Subset to the matched columns and rename them to the standard names
        df_subset = df[list(new_columns.keys())].copy()
        df_subset.rename(columns=new_columns, inplace=True)
        aligned_dfs.append(df_subset)
    # Step 5: Stack the aligned DataFrames into one
    if not aligned_dfs:
        print("Nothing to merge; need at least two readable CSVs.")
        return
    combined_df = pd.concat(aligned_dfs, ignore_index=True)
    print(best_combo)
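
    # Concat sketch (hypothetical aligned frames): two frames renamed to the
    # shared columns ["tweetid", "label"] stack row-wise; ignore_index=True
    # re-numbers the merged frame 0..N-1 instead of repeating source indices.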
    # Find the single largest DataFrame as a fallback candidate
    maxCount = 0
    idx = -1
    for i in range(len(dfs)):
        if dfs[i].index.size > maxCount:
            maxCount = dfs[i].index.size
            idx = i

    # Heuristic: prefer the single file when the merge lost too many rows, or
    # when the merge kept rows but discarded most of a modest-sized column set
    flag = False
    if maxCount > combined_df.index.size and len(dfs[idx].columns) > 2:
        flag = True
    elif (combined_df.index.size > maxCount
          and (len(dfs[idx].columns) - len(combined_df.columns)) > 3
          and len(dfs[idx].columns) < 7):
        flag = True
    if flag:
        dfs[idx].to_csv("./final/" + query + ".csv", index=False)
        print("The merged file was not up to the mark, so saved a single file... " + str(idx))
    else:
        combined_df.to_csv("./final/" + query + ".csv", index=False)
        print("Saved merged file...")


if __name__ == "__main__":
    clean("twitter sentiment analysis")