{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a0intro00",
   "metadata": {},
   "source": [
    "# Dataset merging scratchpad\n",
    "\n",
    "Finds the largest group of CSVs under `downloads/<query>/` that share a\n",
    "common set of (normalized) columns, aligns and concatenates them, then\n",
    "experiments with simple preprocessing (one-hot encoding, downsampling)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b227ed2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import os\n",
    "import re\n",
    "from itertools import combinations\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a976256",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset folder to merge. The glob below is derived from this value so the\n",
    "# query string and the path can never drift apart (previously the path was\n",
    "# hardcoded to a different dataset than `query` claimed).\n",
    "query = \"covid 19\"\n",
    "\n",
    "\n",
    "def normalize(col):\n",
    "    \"\"\"Standardize a column name: lowercase, strip non-alphanumerics.\"\"\"\n",
    "    return re.sub(r'[^a-z0-9]', '', col.lower())\n",
    "\n",
    "\n",
    "# Step 1: Read every CSV in the dataset folder.\n",
    "csv_files = glob.glob(os.path.join(\"downloads\", query, \"*.csv\"))\n",
    "dfs = [pd.read_csv(f) for f in csv_files]\n",
    "\n",
    "# Step 2: Per dataframe, store the set of normalized column names plus a\n",
    "# map from each normalized name back to its original spelling.\n",
    "normalized_cols = []\n",
    "orig_col_maps = []\n",
    "\n",
    "for df in dfs:\n",
    "    norm_to_orig = {}\n",
    "    norm_cols = []\n",
    "    for col in df.columns:\n",
    "        norm = normalize(col)\n",
    "        norm_cols.append(norm)\n",
    "        norm_to_orig[norm] = col\n",
    "    normalized_cols.append(set(norm_cols))\n",
    "    orig_col_maps.append(norm_to_orig)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d82953b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from rapidfuzz import process, fuzz\n",
    "\n",
    "\n",
    "def get_fuzzy_common_columns(cols_list, threshold=85):\n",
    "    \"\"\"Return the normalized column names that are 'fuzzy common'.\n",
    "\n",
    "    A column from the first set counts as common when every other set in\n",
    "    `cols_list` contains a name scoring >= `threshold` against it under\n",
    "    the token-sort ratio.\n",
    "    \"\"\"\n",
    "    # Start with columns from the first dataset.\n",
    "    base = cols_list[0]\n",
    "    common = set()\n",
    "\n",
    "    for col in base:\n",
    "        match_all = True\n",
    "        for other in cols_list[1:]:\n",
    "            match, score, _ = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)\n",
    "            if score < threshold:\n",
    "                match_all = False\n",
    "                break\n",
    "        if match_all:\n",
    "            common.add(col)\n",
    "    return common"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0eb8e3d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 3: Among all subsets of >= 2 dataframes, find the one sharing the\n",
    "# largest set of exactly-matching normalized column names.\n",
    "max_common = set()\n",
    "best_combo = ()  # combinations() yields tuples, so start with one\n",
    "\n",
    "for size in range(2, len(dfs) + 1):\n",
    "    for combo in combinations(range(len(dfs)), size):\n",
    "        # `j` deliberately distinct from the outer loop variables to avoid\n",
    "        # the shadowing trap the original `i` invited.\n",
    "        common = set.intersection(*[normalized_cols[j] for j in combo])\n",
    "        if len(common) > len(max_common):\n",
    "            max_common = common\n",
    "            best_combo = combo\n",
    "\n",
    "max_common, best_combo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b72d4152",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 4: Align the selected dataframes on the shared columns. Fix a single\n",
    "# column order up front so selection and renaming always agree (iterating\n",
    "# the set twice, as before, was order-fragile).\n",
    "ordered_common = sorted(max_common)\n",
    "\n",
    "aligned_dfs = []\n",
    "for idx in best_combo:\n",
    "    df = dfs[idx]\n",
    "    norm_to_orig = orig_col_maps[idx]\n",
    "    selected_cols = [norm_to_orig[col] for col in ordered_common]\n",
    "    df_subset = df[selected_cols].copy()\n",
    "    df_subset.columns = ordered_common  # unify column names\n",
    "    aligned_dfs.append(df_subset)\n",
    "\n",
    "# Step 5: Combine\n",
    "combined_df = pd.concat(aligned_dfs, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f315dfad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row counts of the individual input dataframes.\n",
    "for df in dfs:\n",
    "    print(df.index.size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7972a696",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row counts per candidate combination (same iteration order as Step 3).\n",
    "for size in range(2, len(dfs) + 1):\n",
    "    for combo in combinations(range(len(dfs)), size):\n",
    "        counts = [dfs[j].index.size for j in combo]\n",
    "        print(counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a0f8006",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Largest single dataframe: its row count and position in `dfs`.\n",
    "max_count = 0\n",
    "idx = -1\n",
    "for i in range(len(dfs)):\n",
    "    if dfs[i].index.size > max_count:\n",
    "        max_count = dfs[i].index.size\n",
    "        idx = i\n",
    "\n",
    "max_count, idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "240d4fd1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare the merged row count with the largest single input.\n",
    "combined_df.index.size, max_count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5eba3fe9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Slicing scratch test (renamed from `str`, which shadowed the builtin).\n",
    "s = \"hello and \"\n",
    "s[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94dac715",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"downloads/wine quality prediction/redwine.csv\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0947632",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: reassigns `csv_files` from the covid dataset above to the wine one.\n",
    "csv_files = glob.glob(os.path.join(\"downloads\", \"wine quality prediction\", \"*.csv\"))\n",
    "csv_files[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22e2e148",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(csv_files[0])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1553de09",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derive a label from the file name by removing the query words,\n",
    "# e.g. 'redwine' -> 'red' for the query 'wine quality prediction'.\n",
    "new_name = os.path.splitext(os.path.basename(csv_files[0]))[0].lower()\n",
    "query = \"wine quality prediction\"\n",
    "\n",
    "words = set(query.lower().split())\n",
    "for word in words:\n",
    "    new_name = new_name.replace(word, \"\")  # replace is a no-op when absent\n",
    "\n",
    "df['label'] = new_name\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c258b22",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"final/covid 19.csv\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b43c357",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab7a92d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "object_columns = df.dtypes[df.dtypes == 'object'].index.tolist()\n",
    "object_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae0b8edb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def one_hot_encode_objects(df):\n",
    "    \"\"\"One-hot encode every object column except date-like ones.\n",
    "\n",
    "    Columns whose name contains 'date' (case-insensitive) are kept as-is;\n",
    "    every other object column is replaced by integer 0/1 dummy columns\n",
    "    prefixed with the original column name.\n",
    "    \"\"\"\n",
    "    object_cols = df.select_dtypes(include='object').columns\n",
    "    encoded_cols = []\n",
    "\n",
    "    for col in object_cols:\n",
    "        # BUG FIX: the check was case-sensitive, so e.g. 'Date_reported'\n",
    "        # slipped through and was expanded into hundreds of dummy columns\n",
    "        # (visible in this cell's previous output).\n",
    "        if \"date\" in col.lower():\n",
    "            continue\n",
    "\n",
    "        # Perform one-hot encoding\n",
    "        dummies = pd.get_dummies(df[col], prefix=col).astype(int)\n",
    "        df = pd.concat([df, dummies], axis=1)\n",
    "        encoded_cols.append(col)\n",
    "\n",
    "    # Drop only the columns that were actually encoded; dropping all object\n",
    "    # columns (as before) silently discarded the skipped date columns too.\n",
    "    df = df.drop(columns=encoded_cols)\n",
    "    return df\n",
    "\n",
    "\n",
    "def preprocessing(query):\n",
    "    \"\"\"Load final/<query>.csv and one-hot encode its categorical columns.\"\"\"\n",
    "    df = pd.read_csv(\"final/\" + query + \".csv\")\n",
    "    df = one_hot_encode_objects(df)\n",
    "    print(df.head())\n",
    "    return df  # previously the result was computed and then discarded\n",
    "\n",
    "\n",
    "preprocessed_df = preprocessing(\"covid 19\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4ab7ad9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def reduce_csv_to_10_percent(file_path):\n",
    "    \"\"\"Downsample a CSV to 10% of its rows, overwriting it in place.\n",
    "\n",
    "    WARNING: destructive and non-idempotent -- each run keeps 10% of\n",
    "    whatever is currently in the file, so two runs leave only 1% of the\n",
    "    original rows. Keep a backup before calling.\n",
    "    \"\"\"\n",
    "    # Read the original CSV\n",
    "    df = pd.read_csv(file_path)\n",
    "\n",
    "    # Sample 10% of the rows (seeded for reproducibility)\n",
    "    reduced_df = df.sample(frac=0.1, random_state=42)\n",
    "\n",
    "    # Save back to the original file path, overwriting it\n",
    "    reduced_df.to_csv(file_path, index=False)\n",
    "    print(f\"Reduced file saved to: {file_path}\")\n",
    "\n",
    "\n",
    "# Left commented out so 'Restart & Run All' cannot shrink the file again;\n",
    "# the reduction already ran once against this file.\n",
    "# reduce_csv_to_10_percent(\"final/twitter sentiment analysis.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}