{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a0intro00",
   "metadata": {},
   "source": [
    "# Dataset merging scratchpad\n",
    "\n",
    "Finds the largest group of CSVs under `downloads/<query>/` that share a\n",
    "common set of (normalized) columns, aligns and concatenates them, then\n",
    "experiments with simple preprocessing (one-hot encoding, downsampling)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b227ed2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import os\n",
    "import re\n",
    "from itertools import combinations\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a976256",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset folder to merge. The glob below is derived from this value so the\n",
    "# query string and the path can never drift apart (previously the path was\n",
    "# hardcoded to a different dataset than `query` claimed).\n",
    "query = \"covid 19\"\n",
    "\n",
    "\n",
    "def normalize(col):\n",
    "    \"\"\"Standardize a column name: lowercase, strip non-alphanumerics.\"\"\"\n",
    "    return re.sub(r'[^a-z0-9]', '', col.lower())\n",
    "\n",
    "\n",
    "# Step 1: Read every CSV in the dataset folder.\n",
    "csv_files = glob.glob(os.path.join(\"downloads\", query, \"*.csv\"))\n",
    "dfs = [pd.read_csv(f) for f in csv_files]\n",
    "\n",
    "# Step 2: Per dataframe, store the set of normalized column names plus a\n",
    "# map from each normalized name back to its original spelling.\n",
    "normalized_cols = []\n",
    "orig_col_maps = []\n",
    "\n",
    "for df in dfs:\n",
    "    norm_to_orig = {}\n",
    "    norm_cols = []\n",
    "    for col in df.columns:\n",
    "        norm = normalize(col)\n",
    "        norm_cols.append(norm)\n",
    "        norm_to_orig[norm] = col\n",
    "    normalized_cols.append(set(norm_cols))\n",
    "    orig_col_maps.append(norm_to_orig)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d82953b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from rapidfuzz import process, fuzz\n",
    "\n",
    "\n",
    "def get_fuzzy_common_columns(cols_list, threshold=85):\n",
    "    \"\"\"Return the normalized column names that are 'fuzzy common'.\n",
    "\n",
    "    A column from the first set counts as common when every other set in\n",
    "    `cols_list` contains a name scoring >= `threshold` against it under\n",
    "    the token-sort ratio.\n",
    "    \"\"\"\n",
    "    # Start with columns from the first dataset.\n",
    "    base = cols_list[0]\n",
    "    common = set()\n",
    "\n",
    "    for col in base:\n",
    "        match_all = True\n",
    "        for other in cols_list[1:]:\n",
    "            match, score, _ = process.extractOne(col, other, scorer=fuzz.token_sort_ratio)\n",
    "            if score < threshold:\n",
    "                match_all = False\n",
    "                break\n",
    "        if match_all:\n",
    "            common.add(col)\n",
    "    return common"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0eb8e3d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 3: Among all subsets of >= 2 dataframes, find the one sharing the\n",
    "# largest set of exactly-matching normalized column names.\n",
    "max_common = set()\n",
    "best_combo = ()  # combinations() yields tuples, so start with one\n",
    "\n",
    "for size in range(2, len(dfs) + 1):\n",
    "    for combo in combinations(range(len(dfs)), size):\n",
    "        # `j` deliberately distinct from the outer loop variables to avoid\n",
    "        # the shadowing trap the original `i` invited.\n",
    "        common = set.intersection(*[normalized_cols[j] for j in combo])\n",
    "        if len(common) > len(max_common):\n",
    "            max_common = common\n",
    "            best_combo = combo\n",
    "\n",
    "max_common, best_combo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b72d4152",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 4: Align the selected dataframes on the shared columns. Fix a single\n",
    "# column order up front so selection and renaming always agree (iterating\n",
    "# the set twice, as before, was order-fragile).\n",
    "ordered_common = sorted(max_common)\n",
    "\n",
    "aligned_dfs = []\n",
    "for idx in best_combo:\n",
    "    df = dfs[idx]\n",
    "    norm_to_orig = orig_col_maps[idx]\n",
    "    selected_cols = [norm_to_orig[col] for col in ordered_common]\n",
    "    df_subset = df[selected_cols].copy()\n",
    "    df_subset.columns = ordered_common  # unify column names\n",
    "    aligned_dfs.append(df_subset)\n",
    "\n",
    "# Step 5: Combine\n",
    "combined_df = pd.concat(aligned_dfs, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f315dfad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row counts of the individual input dataframes.\n",
    "for df in dfs:\n",
    "    print(df.index.size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7972a696",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row counts per candidate combination (same iteration order as Step 3).\n",
    "for size in range(2, len(dfs) + 1):\n",
    "    for combo in combinations(range(len(dfs)), size):\n",
    "        counts = [dfs[j].index.size for j in combo]\n",
    "        print(counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a0f8006",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Largest single dataframe: its row count and position in `dfs`.\n",
    "max_count = 0\n",
    "idx = -1\n",
    "for i in range(len(dfs)):\n",
    "    if dfs[i].index.size > max_count:\n",
    "        max_count = dfs[i].index.size\n",
    "        idx = i\n",
    "\n",
    "max_count, idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "240d4fd1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare the merged row count with the largest single input.\n",
    "combined_df.index.size, max_count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5eba3fe9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Slicing scratch test (renamed from `str`, which shadowed the builtin).\n",
    "s = \"hello and \"\n",
    "s[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94dac715",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"downloads/wine quality prediction/redwine.csv\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0947632",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: reassigns `csv_files` from the covid dataset above to the wine one.\n",
    "csv_files = glob.glob(os.path.join(\"downloads\", \"wine quality prediction\", \"*.csv\"))\n",
    "csv_files[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22e2e148",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(csv_files[0])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1553de09",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derive a label from the file name by removing the query words,\n",
    "# e.g. 'redwine' -> 'red' for the query 'wine quality prediction'.\n",
    "new_name = os.path.splitext(os.path.basename(csv_files[0]))[0].lower()\n",
    "query = \"wine quality prediction\"\n",
    "\n",
    "words = set(query.lower().split())\n",
    "for word in words:\n",
    "    new_name = new_name.replace(word, \"\")  # replace is a no-op when absent\n",
    "\n",
    "df['label'] = new_name\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c258b22",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"final/covid 19.csv\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b43c357",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab7a92d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "object_columns = df.dtypes[df.dtypes == 'object'].index.tolist()\n",
    "object_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae0b8edb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def one_hot_encode_objects(df):\n",
    "    \"\"\"One-hot encode every object column except date-like ones.\n",
    "\n",
    "    Columns whose name contains 'date' (case-insensitive) are kept as-is;\n",
    "    every other object column is replaced by integer 0/1 dummy columns\n",
    "    prefixed with the original column name.\n",
    "    \"\"\"\n",
    "    object_cols = df.select_dtypes(include='object').columns\n",
    "    encoded_cols = []\n",
    "\n",
    "    for col in object_cols:\n",
    "        # BUG FIX: the check was case-sensitive, so e.g. 'Date_reported'\n",
    "        # slipped through and was expanded into hundreds of dummy columns\n",
    "        # (visible in this cell's previous output).\n",
    "        if \"date\" in col.lower():\n",
    "            continue\n",
    "\n",
    "        # Perform one-hot encoding\n",
    "        dummies = pd.get_dummies(df[col], prefix=col).astype(int)\n",
    "        df = pd.concat([df, dummies], axis=1)\n",
    "        encoded_cols.append(col)\n",
    "\n",
    "    # Drop only the columns that were actually encoded; dropping all object\n",
    "    # columns (as before) silently discarded the skipped date columns too.\n",
    "    df = df.drop(columns=encoded_cols)\n",
    "    return df\n",
    "\n",
    "\n",
    "def preprocessing(query):\n",
    "    \"\"\"Load final/<query>.csv and one-hot encode its categorical columns.\"\"\"\n",
    "    df = pd.read_csv(\"final/\" + query + \".csv\")\n",
    "    df = one_hot_encode_objects(df)\n",
    "    print(df.head())\n",
    "    return df  # previously the result was computed and then discarded\n",
    "\n",
    "\n",
    "preprocessed_df = preprocessing(\"covid 19\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4ab7ad9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def reduce_csv_to_10_percent(file_path):\n",
    "    \"\"\"Downsample a CSV to 10% of its rows, overwriting it in place.\n",
    "\n",
    "    WARNING: destructive and non-idempotent -- each run keeps 10% of\n",
    "    whatever is currently in the file, so two runs leave only 1% of the\n",
    "    original rows. Keep a backup before calling.\n",
    "    \"\"\"\n",
    "    # Read the original CSV\n",
    "    df = pd.read_csv(file_path)\n",
    "\n",
    "    # Sample 10% of the rows (seeded for reproducibility)\n",
    "    reduced_df = df.sample(frac=0.1, random_state=42)\n",
    "\n",
    "    # Save back to the original file path, overwriting it\n",
    "    reduced_df.to_csv(file_path, index=False)\n",
    "    print(f\"Reduced file saved to: {file_path}\")\n",
    "\n",
    "\n",
    "# Left commented out so 'Restart & Run All' cannot shrink the file again;\n",
    "# the reduction already ran once against this file.\n",
    "# reduce_csv_to_10_percent(\"final/twitter sentiment analysis.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}