# Dataset/cleanDataset.py
import glob
import os
import re
from itertools import combinations

import pandas as pd
from rapidfuzz import process, fuzz

import getLabels


def get_fuzzy_common_columns(cols_list, threshold=75):
    """
    Given a list of sets of normalized column names (one set per dataset),
    return the names from the first set that have a fuzzy match
    (token_sort_ratio >= threshold) in every other set.
    """
# Start with columns from the first dataset
base = cols_list[0]
common = set()
for col in base:
match_all = True
        for other in cols_list[1:]:
            # extractOne returns None when `other` is empty
            result = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)
            if result is None or result[1] < threshold:
                match_all = False
                break
if match_all:
common.add(col)
return common
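
# Quick sketch of the matching above (hypothetical, pre-normalized names):
#   get_fuzzy_common_columns([{"tweetid", "sentiment"},
#                             {"tweetid", "sentiment", "date"}])
#   -> {"tweetid", "sentiment"}
# Only names from the first set are candidates, so the extra "date" never
# appears in the result.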

def sortFiles(dfs):
    """
    Drop exact-duplicate DataFrames: same shape and identical contents after
    resetting the index. (Despite the name, nothing is sorted.)
    """
    unique_dfs = []
    seen = []
for i, df1 in enumerate(dfs):
duplicate = False
for j in seen:
df2 = dfs[j]
# Check if same shape
if df1.shape != df2.shape:
continue
if df1.reset_index(drop=True).equals(df2.reset_index(drop=True)):
duplicate = True
break
if not duplicate:
unique_dfs.append(df1)
seen.append(i)
return unique_dfs
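
# Dedup sketch (hypothetical frames):
#   a = pd.DataFrame({"x": [1, 2]})
#   sortFiles([a, a.copy(), pd.DataFrame({"x": [3, 4]})])
#   -> keeps a and the [3, 4] frame; the copy of `a` is dropped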

def normalize(col):
    """Lower-case a column name and strip everything except letters and digits."""
    return re.sub(r'[^a-z0-9]', '', col.lower())
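
# e.g. normalize("Tweet_ID") -> "tweetid"; normalize("Sentiment (label)") -> "sentimentlabel"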

def clean(query):
    os.makedirs("./final", exist_ok=True)
    csv_files = glob.glob("downloads/" + query + "/*.csv")
    if len(csv_files) < 1:
        print("No CSV files found!")
        return
    dfs = []
    skip = []
    for i, f in enumerate(csv_files):
        try:
            print(f"Reading {f}")
            df = pd.read_csv(f)
            dfs.append(df)
        except Exception as e:
            skip.append(i)
            print(f"Failed to read {f}: {e}")
    print(f"Read {len(dfs)} files")
    dfs = sortFiles(dfs)
    print(f"{len(dfs)} files left after dropping duplicates")
    # Ask the label-extraction helper which column holds the target label in
    # each DataFrame, then standardize that column's name to "label"
    labelList = getLabels.LabelsExtraction2(query, dfs, csv_files, skip)
    print(labelList)
    for i, df in enumerate(dfs):
        if labelList[i] in df.columns:
            df.rename(columns={labelList[i]: "label"}, inplace=True)
# Step 2: Store normalized-to-original column mappings
normalized_cols = []
orig_col_maps = []
for df in dfs:
norm_to_orig = {}
norm_cols = []
for col in df.columns:
norm = normalize(col)
norm_cols.append(norm)
norm_to_orig[norm] = col
normalized_cols.append(set(norm_cols))
orig_col_maps.append(norm_to_orig)
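
    # Shape of the bookkeeping built above (hypothetical columns):
    #   df.columns = ["Tweet_ID", "Sentiment"]
    #   -> norm_cols    = ["tweetid", "sentiment"]
    #      norm_to_orig = {"tweetid": "Tweet_ID", "sentiment": "Sentiment"}
    # If two originals normalize to the same key, the later one wins the map.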
    # Step 3: Search every subset of two or more DataFrames for the one
    # sharing the most fuzzy-common columns. This is exponential in the
    # number of files, which is fine for the handful of CSVs a query returns.
    max_common = set()
    best_combo = []
    for i in range(2, len(dfs) + 1):
        for combo in combinations(range(len(dfs)), i):
            selected_cols = [normalized_cols[j] for j in combo]
            fuzzy_common = get_fuzzy_common_columns(selected_cols)
            # ">=" breaks ties toward larger combos, which are generated later
            if len(fuzzy_common) >= len(max_common):
                max_common = fuzzy_common
                best_combo = combo
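
    # For scale (assuming ~10 downloaded CSVs): sum(C(10, k) for k in 2..10)
    # = 2**10 - 10 - 1 = 1013 candidate combinations, each scored via rapidfuzz.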
    # Step 4: Harmonize columns and subset each selected DataFrame
    aligned_dfs = []
    for idx in best_combo:
        df = dfs[idx]
        norm_to_orig = orig_col_maps[idx]
        new_columns = {}
        for std_col in max_common:
            # Match the standard column to the most similar normalized name
            # in this DataFrame, then map the original name -> standard name
            match, _, _ = process.extractOne(
                std_col, list(norm_to_orig.keys()), scorer=fuzz.token_sort_ratio
            )
            new_columns[norm_to_orig[match]] = std_col
        # Subset to the matched columns and rename them to the standard names
        df_subset = df[list(new_columns.keys())].copy()
        df_subset.rename(columns=new_columns, inplace=True)
        aligned_dfs.append(df_subset)
    # Step 5: Stack the aligned DataFrames into one
    if not aligned_dfs:
        print("Nothing to merge; need at least two readable CSVs.")
        return
    combined_df = pd.concat(aligned_dfs, ignore_index=True)
    print(best_combo)
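
    # Concat sketch (hypothetical aligned frames): two frames renamed to the
    # shared columns ["tweetid", "label"] stack row-wise; ignore_index=True
    # re-numbers the merged frame 0..N-1 instead of repeating source indices.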
    # Find the single largest DataFrame as a fallback candidate
    maxCount = 0
    idx = -1
    for i in range(len(dfs)):
        if dfs[i].index.size > maxCount:
            maxCount = dfs[i].index.size
            idx = i

    # Heuristic: prefer the single file when the merge lost too many rows, or
    # when the merge kept rows but discarded most of a modest-sized column set
    flag = False
    if maxCount > combined_df.index.size and len(dfs[idx].columns) > 2:
        flag = True
    elif (combined_df.index.size > maxCount
          and (len(dfs[idx].columns) - len(combined_df.columns)) > 3
          and len(dfs[idx].columns) < 7):
        flag = True
    if flag:
        dfs[idx].to_csv("./final/" + query + ".csv", index=False)
        print("The merged file was not up to the mark, so saved a single file... " + str(idx))
    else:
        combined_df.to_csv("./final/" + query + ".csv", index=False)
        print("Saved merged file...")


if __name__ == "__main__":
    clean("twitter sentiment analysis")