import pandas as pd
import glob
import re
from itertools import combinations
import os
from rapidfuzz import process, fuzz
import getLabels
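# getLabels is a project-local helper; LabelsExtraction2 is assumed to return,
# for each frame, the name of the column that holds the target label.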


def get_fuzzy_common_columns(cols_list, threshold=75):
    """
    Given a list of sets of normalized column names, return the subset of
    the first set whose best fuzzy match in every other set scores at least
    `threshold` (token-sort ratio, 0-100).
    """
    # Columns of the first dataset are the candidate pool.
    base = cols_list[0]
    common = set()

    for col in base:
        match_all = True
        for other in cols_list[1:]:
            best = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)
            # extractOne returns None when `other` is empty.
            if best is None or best[1] < threshold:
                match_all = False
                break
        if match_all:
            common.add(col)
    return common


def sortFiles(dfs):
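    """Drop exact-duplicate DataFrames, keeping the first occurrence.

    Despite its name this deduplicates rather than sorts: two frames count
    as duplicates when their shapes match and they compare equal after a
    positional index reset.
    """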
    unique_dfs = []
    seen = []

    for i, df1 in enumerate(dfs):
        duplicate = False
        for j in seen:
            df2 = dfs[j]

            # Check if same shape
            if df1.shape != df2.shape:
                continue

            if df1.reset_index(drop=True).equals(df2.reset_index(drop=True)):
                duplicate = True
                break

        if not duplicate:
            unique_dfs.append(df1)
            seen.append(i)

    return unique_dfs


def normalize(col):
    """Lowercase a column name and strip everything but letters and digits,
    so headers such as 'User ID' and 'user_id' normalize to the same key."""
    return re.sub(r'[^a-z0-9]', '', str(col).lower())

def clean(query):
    """Merge the CSVs downloaded for `query` into ./final/<query>.csv,
    aligning fuzzily matching columns across the source files."""
    os.makedirs("./final", exist_ok=True)

    csv_files = glob.glob(os.path.join("downloads", query, "*.csv"))
    if not csv_files:
        print("No csv file found!")
        return
    dfs = []
    skip = []  # indices of files that failed to parse
    for i, f in enumerate(csv_files):
        try:
            print(f"Reading {f}")
            df = pd.read_csv(f)
            dfs.append(df)
        except Exception as e:
            skip.append(i)
            print(f"Failed to read {f}: {e}")
    if not dfs:
        print("No readable csv files!")
        return
    print(f"Read {len(dfs)} file(s)")
    dfs = sortFiles(dfs)
    print(f"{len(dfs)} unique file(s) after deduplication")

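    # Standardize the label column: getLabels nominates one column per frame,
    # which is renamed to "label" so it survives the fuzzy column matching.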
    labelList = getLabels.LabelsExtraction2(query, dfs, csv_files, skip)
    print(labelList)
    for i, df in enumerate(dfs):
        if labelList[i] in df.columns:
            df.rename(columns={labelList[i]: "label"}, inplace=True)

    # Step 2: store normalized-to-original column name mappings per frame.
    normalized_cols = []
    orig_col_maps = []

    for df in dfs:
        norm_to_orig = {}
        norm_cols = []
        for col in df.columns:
            norm = normalize(col)
            norm_cols.append(norm)
            norm_to_orig[norm] = col
        normalized_cols.append(set(norm_cols))
        orig_col_maps.append(norm_to_orig)
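    # Note: orig_col_maps is not consulted again; Step 4 re-derives the
    # normalized-to-original mapping when aligning each frame.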

    # With fewer than two unique frames there is nothing to merge; save the
    # lone frame directly (pd.concat would otherwise fail on an empty list).
    if len(dfs) < 2:
        dfs[0].to_csv("./final/" + query + ".csv", index=False)
        print("Only one unique file; saved it directly.")
        return

    # Step 3: search every subset of frames (size 2..n) for the one whose
    # members share the largest fuzzy-common column set. This is exponential
    # in the number of files, which is acceptable for small batches.
    max_common = set()
    best_combo = []

    for i in range(2, len(dfs) + 1):
        for combo in combinations(range(len(dfs)), i):
            selected_cols = [normalized_cols[j] for j in combo]
            fuzzy_common = get_fuzzy_common_columns(selected_cols)
            # ">=" breaks ties in favor of later, larger combinations.
            if len(fuzzy_common) >= len(max_common):
                max_common = fuzzy_common
                best_combo = combo

    # Step 4: harmonize columns and subset each chosen frame.
    aligned_dfs = []

    for idx in best_combo:
        df = dfs[idx]
        original_cols = list(df.columns)
        new_columns = {}

        for std_col in max_common:
            # Match the shared name to the most similar normalized original.
            match, _, _ = process.extractOne(
                std_col,
                [normalize(col) for col in original_cols],
                scorer=fuzz.token_sort_ratio,
            )

            # Recover the original column behind the matched normalized name.
            for col in original_cols:
                if normalize(col) == match:
                    new_columns[col] = std_col  # map original -> shared name
                    break

        # Keep only the matched columns and rename them to the shared names.
        df_subset = df[list(new_columns.keys())].copy()
        df_subset.rename(columns=new_columns, inplace=True)
        aligned_dfs.append(df_subset)

    # Step 5: stack the aligned frames into one table.
    combined_df = pd.concat(aligned_dfs, ignore_index=True)
    print(f"Merged files: {best_combo}")

    # Find the largest single source frame as a fallback candidate.
    maxCount = 0
    idx = -1
    for i in range(len(dfs)):
        if len(dfs[i]) > maxCount:
            maxCount = len(dfs[i])
            idx = i

    # Heuristic: prefer the largest single file over the merge when the merge
    # lost rows, or when it gained rows but dropped several columns from a
    # reasonably narrow source frame.
    flag = False
    if maxCount > combined_df.index.size and len(dfs[idx].columns) > 2:
        flag = True
    elif (combined_df.index.size > maxCount
          and (len(dfs[idx].columns) - len(combined_df.columns)) > 3
          and len(dfs[idx].columns) < 7):
        flag = True

    if flag:
        dfs[idx].to_csv("./final/" + query + ".csv", index=False)
        print("The merged file was not up to the mark, so saved a single file... " + str(idx))
    else:
        combined_df.to_csv("./final/" + query + ".csv", index=False)
        print("Saved merged file...")



clean("twitter sentiment analysis")