{
"cells": [
{
"cell_type": "code",
"execution_count": 108,
"id": "c30096f9-3ae8-4389-948a-30423525c3b6",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import datetime"
]
},
{
"cell_type": "markdown",
"id": "d9cc960c-5167-4eb0-bcd3-5e7adfaaae64",
"metadata": {},
"source": [
"# Helpful Pandas Methods"
]
},
{
"cell_type": "markdown",
"id": "f06dfcee-8d93-4be4-90ad-3f46e13b051e",
"metadata": {},
"source": [
"This subchapter covers several simple but extremely useful `pandas` methods that you will likely use often. \n",
"\n",
"To start, let's load in the same dataset as the first subchapter in the `Pandas` chapter. As a reminder, this dataset contains annual beer sales for the 50 US states and the District of Columbia. It is sourced from [_Salience and Taxation: Theory and Evidence_](https://www.aeaweb.org/articles?id=10.1257/aer.99.4.1145) by Chetty, Looney, and Kroft (AER 2009), and it includes 7 columns:\n",
"- `st_name`: the state abbreviation\n",
"- `year`: the year the data was recorded\n",
"- `c_beer`: the quantity of beer consumed, in thousands of gallons\n",
"- `beer_tax`: the ad valorem tax, as a percentage\n",
"- `btax_dollars`: the excise tax, represented in dollars per case (24 cans) of beer \n",
"- `population`: the population of the state, in thousands\n",
"- `salestax`: the sales tax percentage"
]
},
{
"cell_type": "code",
"execution_count": 122,
"id": "d8d78be9-4cc4-41d3-879f-6308553189e5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" st_name year c_beer beer_tax btax_dollars population salestax\n",
"0 AL 1970 33098 72.341130 2.370 3450 4.0\n",
"1 AL 1971 37598 69.304600 2.370 3497 4.0\n",
"2 AL 1972 42719 67.149190 2.370 3539 4.0\n",
"3 AL 1973 46203 63.217026 2.370 3580 4.0\n",
"4 AL 1974 49769 56.933796 2.370 3627 4.0\n",
"... ... ... ... ... ... ... ...\n",
"1703 WY 1999 12423 0.319894 0.045 492 4.0\n",
"1704 WY 2000 12595 0.309491 0.045 494 4.0\n",
"1705 WY 2001 12808 0.300928 0.045 494 4.0\n",
"1706 WY 2002 13191 0.296244 0.045 499 4.0\n",
"1707 WY 2003 15535 0.289643 0.045 501 4.0\n",
"\n",
"[1708 rows x 7 columns]"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('data/beer_tax.csv')\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "4c0a871a-3eb8-473b-833e-60837aa27deb",
"metadata": {},
"source": [
"## Summary Statistics"
]
},
{
"cell_type": "markdown",
"id": "e2c56768-606e-47d8-89de-e38fb9c5ad17",
"metadata": {},
"source": [
"To start, we will note that `pandas` is designed to work extremely well with `NumPy`. Most (if not all) `NumPy` functions can be applied on `pandas` DataFrames and Series. A few examples are shown below."
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "5285fab1-013b-4b02-a44f-5b3cf2580305",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3393246"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.sum(df['year'])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "e52a0237-b7a2-4e54-8176-ce2b5f77b29c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"year 1986.677986\n",
"c_beer 108031.059719\n",
"beer_tax 6.525340\n",
"dtype: float64"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.mean(df[['year','c_beer','beer_tax']], axis = 0)\n",
"\n",
"# What does axis = 0 represent? \n",
"# Hint: read the documentation: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.mean.html"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "6e5e1622-2626-459e-b432-bb640ff9de62",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"year 1986.677986\n",
"c_beer 108031.059719\n",
"beer_tax 6.525340\n",
"dtype: float64"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[['year','c_beer','beer_tax']].mean(axis = 0) # np.mean() defers to DataFrame.mean(), so both give the same result\n"
]
},
{
"cell_type": "markdown",
"id": "e477aeb5-8d3a-48d2-aad8-2a6054d07f68",
"metadata": {},
"source": [
"### `describe`"
]
},
{
"cell_type": "markdown",
"id": "bee35831-c0f7-4f80-b60a-7687e3b695ee",
"metadata": {},
"source": [
"`.describe()` computes several summary statistics for every numeric column of a DataFrame at once."
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "b3e93b46-97a9-4e36-80c0-fade09daad93",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" year c_beer beer_tax btax_dollars population \\\n",
"count 1708.000000 1708.000000 1708.000000 1708.000000 1708.000000 \n",
"mean 1986.677986 108031.059719 6.525340 0.524762 4837.031030 \n",
"std 9.761205 119280.769185 8.147708 0.523300 5275.527054 \n",
"min 1970.000000 5372.000000 0.289643 0.045000 304.000000 \n",
"25% 1978.000000 27886.750000 2.301018 0.188700 1219.000000 \n",
"50% 1987.000000 68306.000000 3.879157 0.360000 3303.000000 \n",
"75% 1995.000000 128838.500000 7.042739 0.603875 5703.250000 \n",
"max 2003.000000 691050.000000 72.341130 2.407500 35484.000000 \n",
"\n",
" salestax \n",
"count 1708.000000 \n",
"mean 4.288511 \n",
"std 1.909266 \n",
"min 0.000000 \n",
"25% 3.312250 \n",
"50% 4.462500 \n",
"75% 5.106250 \n",
"max 9.000000 "
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "markdown",
"id": "bad5d706-15f7-4b50-8d8a-a83054bf8090",
"metadata": {},
"source": [
"Finally, there is a whole host of intuitive DataFrame methods that calculate statistics such as the average, standard deviation, median, percentiles, etc. While we cannot possibly describe them all here, we strongly encourage you to google around to learn more about them. [Here](https://medium.com/codex/9-efficient-ways-for-describing-and-summarizing-a-pandas-dataframe-316234f46e6) is one link to get you started."
]
},
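{
"cell_type": "markdown",
"id": "3b9c4e5f-2c3d-4e4f-a051-2c3d4e5f6071",
"metadata": {},
"source": [
"For example, `.std()`, `.median()` and `.quantile()` all follow the same pattern as `.mean()` above (a quick sketch on a single column; all three are standard `pandas` methods):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4cad5f60-3d4e-4f50-b162-3d4e5f607182",
"metadata": {},
"outputs": [],
"source": [
"# A few of the many built-in summary methods\n",
"print(df['beer_tax'].std())           # standard deviation\n",
"print(df['beer_tax'].median())        # median\n",
"print(df['beer_tax'].quantile(0.75))  # 75th percentile"
]
},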
{
"cell_type": "markdown",
"id": "7ce77d45-36bf-4dd0-9966-b1d45c4dfbd7",
"metadata": {},
"source": [
"## Sorting, Counting and Uniqueness"
]
},
{
"cell_type": "markdown",
"id": "b2aebf4b-8721-4d50-9c7b-c79a5e30a06f",
"metadata": {},
"source": [
"Here are some of the methods used to deal with sorting, counting and uniqueness problems. Again, this list is not meant to be comprehensive, and you are encouraged to google around for other methods you may find useful."
]
},
{
"cell_type": "markdown",
"id": "084d81af-5f81-41a0-b1c5-b7e202b5a0c5",
"metadata": {},
"source": [
"### Sorting"
]
},
{
"cell_type": "markdown",
"id": "4a5dd67c-c856-445e-9f3f-c8a911d159d4",
"metadata": {},
"source": [
"As the name implies, `.sort_values()` can be used to sort DataFrames and Series! When sorting a DataFrame, you need to specify which column to sort by; there is naturally no such requirement for a Series. Examples are shown below; note the `ascending` parameter."
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "8c4af4b2-d53e-4170-aed3-cdaa981c9a52",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" st_name year c_beer beer_tax btax_dollars population salestax\n",
"169 CA 2003 663750 2.896433 0.4500 35484 7.25\n",
"168 CA 2002 659475 2.962444 0.4500 35002 7.25\n",
"167 CA 2001 652950 3.009280 0.4500 34533 7.00\n",
"166 CA 2000 637080 3.094911 0.4500 34000 7.25\n",
"165 CA 1999 638017 3.198941 0.4500 33499 7.25\n",
".. ... ... ... ... ... ... ...\n",
"38 AK 1974 7598 10.810215 0.5625 341 0.00\n",
"37 AK 1973 6453 12.003234 0.5625 331 0.00\n",
"36 AK 1972 6038 12.749847 0.5625 324 0.00\n",
"35 AK 1971 6336 13.159102 0.5625 316 0.00\n",
"34 AK 1970 5372 13.735660 0.5625 304 0.00\n",
"\n",
"[1708 rows x 7 columns]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values('population', ascending = False)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "0a5b0526-c047-48c6-bba2-f9ec800c2d68",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"34 304\n",
"35 316\n",
"36 324\n",
"37 331\n",
"38 341\n",
" ... \n",
"165 33499\n",
"166 34000\n",
"167 34533\n",
"168 35002\n",
"169 35484\n",
"Name: population, Length: 1708, dtype: int64"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['population'].sort_values(ascending = True)"
]
},
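{
"cell_type": "markdown",
"id": "1f7a2c3d-0a1b-4c2d-8e3f-0a1b2c3d4e5f",
"metadata": {},
"source": [
"You can also pass `.sort_values()` a list of column names to sort by several columns at once; ties in the first column are broken by the second, and so on. The `ascending` parameter then takes a matching list. A quick sketch, using the columns of our dataset:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a8b3d4e-1b2c-4d3e-9f40-1b2c3d4e5f60",
"metadata": {},
"outputs": [],
"source": [
"# Sort alphabetically by state, then by descending year within each state\n",
"df.sort_values(['st_name', 'year'], ascending = [True, False])"
]
},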
{
"cell_type": "markdown",
"id": "0ca8fb2f-a6ff-48a4-9a0d-5553bcbec8f3",
"metadata": {},
"source": [
"### Counting"
]
},
{
"cell_type": "markdown",
"id": "4ee49844-7014-429b-a785-42cd18f20842",
"metadata": {},
"source": [
"Often used for Series (although it works on DataFrames as well), the `.value_counts()` method counts the number of times each value (for a Series) or row (for a DataFrame) appears, and returns the counts sorted from most common to least. For example,"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "b5a14bbb-3d8a-4228-ae07-6b54dd44194b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"AL 34\n",
"VA 34\n",
"AK 34\n",
"MT 34\n",
"NE 34\n",
"NV 34\n",
"NH 34\n",
"NJ 34\n",
"NM 34\n",
"NY 34\n",
"NC 34\n",
"ND 34\n",
"OH 34\n",
"OK 34\n",
"OR 34\n",
"PA 34\n",
"RI 34\n",
"SC 34\n",
"SD 34\n",
"TN 34\n",
"TX 34\n",
"MS 34\n",
"MI 34\n",
"VT 34\n",
"MA 34\n",
"AZ 34\n",
"AR 34\n",
"CA 34\n",
"CO 34\n",
"CT 34\n",
"DE 34\n",
"DC 34\n",
"FL 34\n",
"GA 34\n",
"WI 34\n",
"ID 34\n",
"IL 34\n",
"IN 34\n",
"WV 34\n",
"WA 34\n",
"KY 34\n",
"LA 34\n",
"ME 34\n",
"MD 34\n",
"UT 34\n",
"MO 32\n",
"MN 32\n",
"KS 32\n",
"IA 32\n",
"WY 32\n",
"HI 18\n",
"Name: st_name, dtype: int64"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "24d6804f-531e-432a-9a7f-90c6653279c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"st_name year c_beer beer_tax btax_dollars population salestax\n",
"AK 1970 5372 13.735660 0.5625 304 0.0 1\n",
"NY 1985 369214 1.362627 0.1238 17792 4.0 1\n",
" 1983 394026 1.376949 0.1158 17687 4.0 1\n",
" 1982 400138 1.227274 0.1000 17590 4.0 1\n",
" 1981 395584 1.302881 0.1000 17568 4.0 1\n",
" ..\n",
"KY 1978 64414 3.296839 0.1815 3610 5.0 1\n",
" 1977 70755 3.547095 0.1815 3574 5.0 1\n",
" 1976 59481 3.777749 0.1815 3529 5.0 1\n",
" 1975 60230 3.995426 0.1815 3468 5.0 1\n",
"WY 2003 15535 0.289643 0.0450 501 4.0 1\n",
"Length: 1708, dtype: int64"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.value_counts()"
]
},
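{
"cell_type": "markdown",
"id": "5dbe6071-4e5f-4051-c273-4e5f60718293",
"metadata": {},
"source": [
"`.value_counts()` also accepts a `normalize` parameter: setting `normalize = True` returns proportions (which sum to 1) instead of raw counts."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ecf7182-5f60-4152-d384-5f6071829304",
"metadata": {},
"outputs": [],
"source": [
"df['st_name'].value_counts(normalize = True)  # proportions instead of counts"
]
},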
{
"cell_type": "markdown",
"id": "b43c44df-9648-48f7-86fb-b15e56cdaa15",
"metadata": {},
"source": [
"### Unique Values and Duplicates"
]
},
{
"cell_type": "markdown",
"id": "4d1b0ade-a6fc-4f9a-8174-8ce89cb43f77",
"metadata": {},
"source": [
"If we're interested in just obtaining the unique values from a Series, we can use the `.unique()` method."
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "3ef3960c-dcbd-4e7c-87fd-2bf1dfe7cac5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',\n",
" 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',\n",
" 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',\n",
" 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',\n",
" 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'], dtype=object)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].unique()"
]
},
{
"cell_type": "markdown",
"id": "9daf2b98-f3dc-4610-8e16-0e44cb117d6c",
"metadata": {},
"source": [
"Alternatively, `.duplicated()` returns a boolean mask that marks every value in a Series (or row in a DataFrame) that is a duplicate of an earlier one."
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "a67cabb6-e19c-4514-94b2-74c445b795d2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 True\n",
"2 True\n",
"3 True\n",
"4 True\n",
" ... \n",
"1703 True\n",
"1704 True\n",
"1705 True\n",
"1706 True\n",
"1707 True\n",
"Name: st_name, Length: 1708, dtype: bool"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].duplicated()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "3feca317-de1b-4928-8cd9-f2bed1a978f8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
" ... \n",
"1703 False\n",
"1704 False\n",
"1705 False\n",
"1706 False\n",
"1707 False\n",
"Length: 1708, dtype: bool"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.duplicated() # Our dataframe has no duplicate rows!"
]
},
{
"cell_type": "markdown",
"id": "27e5042c-782a-4e7b-97b5-2bdb68310f60",
"metadata": {},
"source": [
"If we want to drop all the duplicated values, we can use `.drop_duplicates()`. For a Series, this returns the values with duplicates removed (keeping the first occurrence of each); for a DataFrame, it does the same for rows."
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "f7d57af1-b68b-40a3-ab70-d37fa8af8bce",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 AL\n",
"34 AK\n",
"68 AZ\n",
"102 AR\n",
"136 CA\n",
"170 CO\n",
"204 CT\n",
"238 DE\n",
"272 DC\n",
"306 FL\n",
"340 GA\n",
"374 HI\n",
"392 ID\n",
"426 IL\n",
"460 IN\n",
"494 IA\n",
"526 KS\n",
"558 KY\n",
"592 LA\n",
"626 ME\n",
"660 MD\n",
"694 MA\n",
"728 MI\n",
"762 MN\n",
"794 MS\n",
"828 MO\n",
"860 MT\n",
"894 NE\n",
"928 NV\n",
"962 NH\n",
"996 NJ\n",
"1030 NM\n",
"1064 NY\n",
"1098 NC\n",
"1132 ND\n",
"1166 OH\n",
"1200 OK\n",
"1234 OR\n",
"1268 PA\n",
"1302 RI\n",
"1336 SC\n",
"1370 SD\n",
"1404 TN\n",
"1438 TX\n",
"1472 UT\n",
"1506 VT\n",
"1540 VA\n",
"1574 WA\n",
"1608 WV\n",
"1642 WI\n",
"1676 WY\n",
"Name: st_name, dtype: object"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "8ca143f6-3964-47c2-a856-e1ed6f307a46",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" st_name year c_beer beer_tax btax_dollars population salestax\n",
"0 AL 1970 33098 72.341130 2.370 3450 4.0\n",
"1 AL 1971 37598 69.304600 2.370 3497 4.0\n",
"2 AL 1972 42719 67.149190 2.370 3539 4.0\n",
"3 AL 1973 46203 63.217026 2.370 3580 4.0\n",
"4 AL 1974 49769 56.933796 2.370 3627 4.0\n",
"... ... ... ... ... ... ... ...\n",
"1703 WY 1999 12423 0.319894 0.045 492 4.0\n",
"1704 WY 2000 12595 0.309491 0.045 494 4.0\n",
"1705 WY 2001 12808 0.300928 0.045 494 4.0\n",
"1706 WY 2002 13191 0.296244 0.045 499 4.0\n",
"1707 WY 2003 15535 0.289643 0.045 501 4.0\n",
"\n",
"[1708 rows x 7 columns]"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.drop_duplicates() # We don't drop anything as we don't have duplicates"
]
},
{
"cell_type": "markdown",
"id": "5e393e12-83c0-43da-934f-f56d033896b6",
"metadata": {},
"source": [
"### Missing Values"
]
},
{
"cell_type": "markdown",
"id": "5b6dead6-432d-4c5f-ace8-46e89a1f37f0",
"metadata": {},
"source": [
"Missing values are often stored as `NaN`s. While our dataset doesn't contain any, we can use `.isna()` to check for them."
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "1d69ce70-a545-4fd4-8157-563eff9d157c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" st_name year c_beer beer_tax btax_dollars population salestax\n",
"0 False False False False False False False\n",
"1 False False False False False False False\n",
"2 False False False False False False False\n",
"3 False False False False False False False\n",
"4 False False False False False False False\n",
"... ... ... ... ... ... ... ...\n",
"1703 False False False False False False False\n",
"1704 False False False False False False False\n",
"1705 False False False False False False False\n",
"1706 False False False False False False False\n",
"1707 False False False False False False False\n",
"\n",
"[1708 rows x 7 columns]"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isna()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "a13084fb-ce4f-49e7-8305-b5c58648a239",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"st_name False\n",
"year False\n",
"c_beer False\n",
"beer_tax False\n",
"btax_dollars False\n",
"population False\n",
"salestax False\n",
"dtype: bool"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isna().any(axis = 0) # See which columns have NaN values\n",
"# Hint: read the documentation for df.any(): https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.any.html"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "c01114a1-6d84-40b0-98a9-40d770879933",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
" ... \n",
"1703 False\n",
"1704 False\n",
"1705 False\n",
"1706 False\n",
"1707 False\n",
"Length: 1708, dtype: bool"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isna().any(axis = 1) # See which rows have NaN values"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "a857171b-0b7c-40d7-976c-99b1469188f6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n",
"False\n"
]
}
],
"source": [
"print(df.isna().any(axis = 0).any()) # These can be used to check whether there are NaNs anywhere in the data\n",
"print(df.isna().any(axis = 1).any()) "
]
},
{
"cell_type": "markdown",
"id": "264b77c7-47b5-4fde-8d28-c7055c9a1cca",
"metadata": {},
"source": [
"`.isna()` also works for Series."
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "c40f815f-9aae-4671-aad8-7bbe5ef8939e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
" ... \n",
"1703 False\n",
"1704 False\n",
"1705 False\n",
"1706 False\n",
"1707 False\n",
"Name: st_name, Length: 1708, dtype: bool"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].isna()"
]
},
{
"cell_type": "code",
"execution_count": 134,
"id": "558f7dbd-5d8d-4757-b649-af3c6c48bb43",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 134,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].isna().any()"
]
},
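{
"cell_type": "markdown",
"id": "7fd08293-6071-4253-e495-607182930415",
"metadata": {},
"source": [
"When a dataset does contain missing values, two common companions of `.isna()` are `.dropna()`, which removes the `NaN` entries (or rows, for a DataFrame), and `.fillna()`, which replaces them with a value of your choice. Since our dataset has no `NaN`s, here is a small toy example:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "80e193a4-7182-4354-f5a6-718293041526",
"metadata": {},
"outputs": [],
"source": [
"s = pd.Series([1.0, np.nan, 3.0])\n",
"print(s.dropna())    # drops the NaN entry\n",
"print(s.fillna(0))   # replaces the NaN with 0"
]
},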
{
"cell_type": "markdown",
"id": "9c40b9fa-9f14-40ab-86eb-ee3dfab02b92",
"metadata": {},
"source": [
"## String Methods"
]
},
{
"cell_type": "markdown",
"id": "0d0e0136-c324-436f-8c05-ae6f39768392",
"metadata": {},
"source": [
"String methods are a great tool for working with string data. They only work on Series: you access them through the `.str` accessor, followed by the method you want to use. Some of the various string methods are given below, but again this list is not comprehensive."
]
},
{
"cell_type": "markdown",
"id": "f7d28b17-43e4-4929-b397-7a7f26df53d1",
"metadata": {},
"source": [
"### `len`\n",
"\n",
"Gives the length of each string in the Series."
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "3403e436-6ff3-4aae-9b3f-48acd599fb0a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 2\n",
"1 2\n",
"2 2\n",
"3 2\n",
"4 2\n",
" ..\n",
"1703 2\n",
"1704 2\n",
"1705 2\n",
"1706 2\n",
"1707 2\n",
"Name: st_name, Length: 1708, dtype: int64"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].str.len()"
]
},
{
"cell_type": "markdown",
"id": "93c22615-41f2-4e50-8419-4e7786866727",
"metadata": {},
"source": [
"### String slicing\n",
"\n",
"Like list slicing, but for strings. Review the [Python prereqs](../02-prereqs/Python.ipynb) if you're unsure."
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "ec40ca8e-d978-4520-a64b-de7cf7ff07d4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 A\n",
"1 A\n",
"2 A\n",
"3 A\n",
"4 A\n",
" ..\n",
"1703 W\n",
"1704 W\n",
"1705 W\n",
"1706 W\n",
"1707 W\n",
"Name: st_name, Length: 1708, dtype: object"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].str[0]"
]
},
{
"cell_type": "markdown",
"id": "12f590b3-b94e-440a-8009-fb377ab8d905",
"metadata": {},
"source": [
"### `contains`\n",
"\n",
"Returns whether each string contains the given pattern (which is treated as a regular expression by default)."
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "046f3b72-1c61-4076-bfbd-aad5d5451beb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 True\n",
"1 True\n",
"2 True\n",
"3 True\n",
"4 True\n",
" ... \n",
"1703 False\n",
"1704 False\n",
"1705 False\n",
"1706 False\n",
"1707 False\n",
"Name: st_name, Length: 1708, dtype: bool"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].str.contains('A')"
]
},
{
"cell_type": "markdown",
"id": "190bdd5c-64fa-4f06-8c4a-07548976c6e7",
"metadata": {},
"source": [
"### `replace`\n",
"\n",
"Replaces occurrences of one substring with another."
]
},
{
"cell_type": "code",
"execution_count": 135,
"id": "be9b6731-38d5-472b-86b3-e406b2d3d87a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 aL\n",
"1 aL\n",
"2 aL\n",
"3 aL\n",
"4 aL\n",
" ..\n",
"1703 WY\n",
"1704 WY\n",
"1705 WY\n",
"1706 WY\n",
"1707 WY\n",
"Name: st_name, Length: 1708, dtype: object"
]
},
"execution_count": 135,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].str.replace('A','a')"
]
},
{
"cell_type": "markdown",
"id": "fe45726d-ad4a-48eb-83be-8d44ac0e5e03",
"metadata": {},
"source": [
"### `upper` and `lower`\n",
"\n",
"Converts every string in a Series to upper- or lowercase."
]
},
{
"cell_type": "code",
"execution_count": 136,
"id": "3671979d-e014-40a9-83ee-694aa506c54e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 AL\n",
"1 AL\n",
"2 AL\n",
"3 AL\n",
"4 AL\n",
" ..\n",
"1703 WY\n",
"1704 WY\n",
"1705 WY\n",
"1706 WY\n",
"1707 WY\n",
"Name: st_name, Length: 1708, dtype: object"
]
},
"execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].str.upper()"
]
},
{
"cell_type": "code",
"execution_count": 137,
"id": "072b4cb2-08c7-446f-b3f4-a3812ff001d5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 al\n",
"1 al\n",
"2 al\n",
"3 al\n",
"4 al\n",
" ..\n",
"1703 wy\n",
"1704 wy\n",
"1705 wy\n",
"1706 wy\n",
"1707 wy\n",
"Name: st_name, Length: 1708, dtype: object"
]
},
"execution_count": 137,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].str.lower()"
]
},
{
"cell_type": "markdown",
"id": "04cd8c22-260e-4ad6-b519-28ab7d473b30",
"metadata": {},
"source": [
"### `startswith` and `endswith`\n",
"\n",
"Returns whether each string starts or ends with the given substring."
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "512a494f-1a83-475d-b951-ab02b6db622e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 True\n",
"1 True\n",
"2 True\n",
"3 True\n",
"4 True\n",
" ... \n",
"1703 False\n",
"1704 False\n",
"1705 False\n",
"1706 False\n",
"1707 False\n",
"Name: st_name, Length: 1708, dtype: bool"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].str.startswith('A')"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "829aae5d-58f3-4fd6-aca2-bd3218fd65c8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
" ... \n",
"1703 False\n",
"1704 False\n",
"1705 False\n",
"1706 False\n",
"1707 False\n",
"Name: st_name, Length: 1708, dtype: bool"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['st_name'].str.endswith('A')"
]
},
{
"cell_type": "markdown",
"id": "1bd8c603-3456-45a0-98d6-fd68099f2a1b",
"metadata": {},
"source": [
"### `split`\n",
"\n",
"Splits each string on the given delimiter and returns a list of the resulting substrings; a string that doesn't contain the delimiter is left unsplit, as a one-element list. Best understood by looking at an example."
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "2d97db40-f455-4c5b-b362-477294271768",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"68 AZ\n",
"102 AR\n",
"1574 WA\n",
"592 LA\n",
"1302 RI\n",
"1642 WI\n",
"Name: st_name, dtype: object"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samp_series = df.loc[[68, 102, 1574, 592, 1302, 1642],'st_name']\n",
"samp_series"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "2df6ee47-1bfc-4980-85dd-528579c7f07c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"68 [, Z]\n",
"102 [, R]\n",
"1574 [W, ]\n",
"592 [L, ]\n",
"1302 [RI]\n",
"1642 [WI]\n",
"Name: st_name, dtype: object"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samp_series.str.split('A')"
]
},
{
"cell_type": "markdown",
"id": "6dbece27-44ed-469e-8686-2241f99be88a",
"metadata": {},
"source": [
"As you can see above, all the strings have been split on the letter `A`. Strings that don't contain the letter `A` were left unsplit.\n",
"\n",
"If we want to return the output as a DataFrame, we can set the `expand` parameter to be `True`, as shown below."
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "969fd704-63a5-430c-8951-03dcb0a5990a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 68 | \n",
" | \n",
" Z | \n",
"
\n",
" \n",
" 102 | \n",
" | \n",
" R | \n",
"
\n",
" \n",
" 1574 | \n",
" W | \n",
" | \n",
"
\n",
" \n",
" 592 | \n",
" L | \n",
" | \n",
"
\n",
" \n",
" 1302 | \n",
" RI | \n",
" None | \n",
"
\n",
" \n",
" 1642 | \n",
" WI | \n",
" None | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1\n",
"68 Z\n",
"102 R\n",
"1574 W \n",
"592 L \n",
"1302 RI None\n",
"1642 WI None"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samp_series.str.split('A', expand = True)"
]
},
{
"cell_type": "markdown",
"id": "d444fd0b-61a2-4211-972b-f00d6a053295",
"metadata": {},
"source": [
"## DateTime Objects"
]
},
{
"cell_type": "markdown",
"id": "622aae23-b623-4ff6-8409-0602fbc5970d",
"metadata": {},
"source": [
"`DateTime` objects are incredibly helpful for working with dates. They let us easily add or subtract days, convert time zones, sort dates, and more. We can convert a Series to DateTime values using `pd.to_datetime()`, as done below."
]
},
{
"cell_type": "code",
"execution_count": 123,
"id": "bec2ad90-c892-49a5-973f-11208eb54cf4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1970-01-01\n",
"1 1971-01-01\n",
"2 1972-01-01\n",
"3 1973-01-01\n",
"4 1974-01-01\n",
" ... \n",
"1703 1999-01-01\n",
"1704 2000-01-01\n",
"1705 2001-01-01\n",
"1706 2002-01-01\n",
"1707 2003-01-01\n",
"Name: year, Length: 1708, dtype: datetime64[ns]"
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.to_datetime(df['year'], format='%Y')"
]
},
{
"cell_type": "markdown",
"id": "2973aaad-7e53-4235-a1fe-8c32fea314c6",
"metadata": {},
"source": [
"Notice how we specify the format of our dates. We don't always need to do this (especially if the year, month, and day are in a well-known format), but it is good practice to ensure the conversion happens correctly.\n",
"\n",
"Let us convert the `year` column in the DataFrame to a DateTime object and perform some calculations."
]
},
{
"cell_type": "code",
"execution_count": 124,
"id": "72c784da-0e83-435c-8c20-6e11dc50b612",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" st_name | \n",
" year | \n",
" c_beer | \n",
" beer_tax | \n",
" btax_dollars | \n",
" population | \n",
" salestax | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" AL | \n",
" 1970-01-01 | \n",
" 33098 | \n",
" 72.341130 | \n",
" 2.370 | \n",
" 3450 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1 | \n",
" AL | \n",
" 1971-01-01 | \n",
" 37598 | \n",
" 69.304600 | \n",
" 2.370 | \n",
" 3497 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 2 | \n",
" AL | \n",
" 1972-01-01 | \n",
" 42719 | \n",
" 67.149190 | \n",
" 2.370 | \n",
" 3539 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 3 | \n",
" AL | \n",
" 1973-01-01 | \n",
" 46203 | \n",
" 63.217026 | \n",
" 2.370 | \n",
" 3580 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 4 | \n",
" AL | \n",
" 1974-01-01 | \n",
" 49769 | \n",
" 56.933796 | \n",
" 2.370 | \n",
" 3627 | \n",
" 4.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 1703 | \n",
" WY | \n",
" 1999-01-01 | \n",
" 12423 | \n",
" 0.319894 | \n",
" 0.045 | \n",
" 492 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1704 | \n",
" WY | \n",
" 2000-01-01 | \n",
" 12595 | \n",
" 0.309491 | \n",
" 0.045 | \n",
" 494 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1705 | \n",
" WY | \n",
" 2001-01-01 | \n",
" 12808 | \n",
" 0.300928 | \n",
" 0.045 | \n",
" 494 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1706 | \n",
" WY | \n",
" 2002-01-01 | \n",
" 13191 | \n",
" 0.296244 | \n",
" 0.045 | \n",
" 499 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1707 | \n",
" WY | \n",
" 2003-01-01 | \n",
" 15535 | \n",
" 0.289643 | \n",
" 0.045 | \n",
" 501 | \n",
" 4.0 | \n",
"
\n",
" \n",
"
\n",
"
1708 rows × 7 columns
\n",
"
"
],
"text/plain": [
" st_name year c_beer beer_tax btax_dollars population salestax\n",
"0 AL 1970-01-01 33098 72.341130 2.370 3450 4.0\n",
"1 AL 1971-01-01 37598 69.304600 2.370 3497 4.0\n",
"2 AL 1972-01-01 42719 67.149190 2.370 3539 4.0\n",
"3 AL 1973-01-01 46203 63.217026 2.370 3580 4.0\n",
"4 AL 1974-01-01 49769 56.933796 2.370 3627 4.0\n",
"... ... ... ... ... ... ... ...\n",
"1703 WY 1999-01-01 12423 0.319894 0.045 492 4.0\n",
"1704 WY 2000-01-01 12595 0.309491 0.045 494 4.0\n",
"1705 WY 2001-01-01 12808 0.300928 0.045 494 4.0\n",
"1706 WY 2002-01-01 13191 0.296244 0.045 499 4.0\n",
"1707 WY 2003-01-01 15535 0.289643 0.045 501 4.0\n",
"\n",
"[1708 rows x 7 columns]"
]
},
"execution_count": 124,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['year'] = pd.to_datetime(df['year'], format='%Y')\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "79d877d5-7bd6-4df8-b5e8-19fcefd2a8b2",
"metadata": {},
"source": [
"Notice how the dates all default to January 1st. Let's say we know that the measurements were actually taken 4 days after the given date each year. We can reflect this in our DataFrame."
]
},
{
"cell_type": "code",
"execution_count": 125,
"id": "0e83771e-7b70-4848-bf09-0f2ab5dabf54",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" st_name | \n",
" year | \n",
" c_beer | \n",
" beer_tax | \n",
" btax_dollars | \n",
" population | \n",
" salestax | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" AL | \n",
" 1970-01-05 | \n",
" 33098 | \n",
" 72.341130 | \n",
" 2.370 | \n",
" 3450 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1 | \n",
" AL | \n",
" 1971-01-05 | \n",
" 37598 | \n",
" 69.304600 | \n",
" 2.370 | \n",
" 3497 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 2 | \n",
" AL | \n",
" 1972-01-05 | \n",
" 42719 | \n",
" 67.149190 | \n",
" 2.370 | \n",
" 3539 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 3 | \n",
" AL | \n",
" 1973-01-05 | \n",
" 46203 | \n",
" 63.217026 | \n",
" 2.370 | \n",
" 3580 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 4 | \n",
" AL | \n",
" 1974-01-05 | \n",
" 49769 | \n",
" 56.933796 | \n",
" 2.370 | \n",
" 3627 | \n",
" 4.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 1703 | \n",
" WY | \n",
" 1999-01-05 | \n",
" 12423 | \n",
" 0.319894 | \n",
" 0.045 | \n",
" 492 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1704 | \n",
" WY | \n",
" 2000-01-05 | \n",
" 12595 | \n",
" 0.309491 | \n",
" 0.045 | \n",
" 494 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1705 | \n",
" WY | \n",
" 2001-01-05 | \n",
" 12808 | \n",
" 0.300928 | \n",
" 0.045 | \n",
" 494 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1706 | \n",
" WY | \n",
" 2002-01-05 | \n",
" 13191 | \n",
" 0.296244 | \n",
" 0.045 | \n",
" 499 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1707 | \n",
" WY | \n",
" 2003-01-05 | \n",
" 15535 | \n",
" 0.289643 | \n",
" 0.045 | \n",
" 501 | \n",
" 4.0 | \n",
"
\n",
" \n",
"
\n",
"
1708 rows × 7 columns
\n",
"
"
],
"text/plain": [
" st_name year c_beer beer_tax btax_dollars population salestax\n",
"0 AL 1970-01-05 33098 72.341130 2.370 3450 4.0\n",
"1 AL 1971-01-05 37598 69.304600 2.370 3497 4.0\n",
"2 AL 1972-01-05 42719 67.149190 2.370 3539 4.0\n",
"3 AL 1973-01-05 46203 63.217026 2.370 3580 4.0\n",
"4 AL 1974-01-05 49769 56.933796 2.370 3627 4.0\n",
"... ... ... ... ... ... ... ...\n",
"1703 WY 1999-01-05 12423 0.319894 0.045 492 4.0\n",
"1704 WY 2000-01-05 12595 0.309491 0.045 494 4.0\n",
"1705 WY 2001-01-05 12808 0.300928 0.045 494 4.0\n",
"1706 WY 2002-01-05 13191 0.296244 0.045 499 4.0\n",
"1707 WY 2003-01-05 15535 0.289643 0.045 501 4.0\n",
"\n",
"[1708 rows x 7 columns]"
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['year'] = df['year'] + pd.DateOffset(days=4)\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "4d07813c-0262-4797-b717-dbf7d1df3b6c",
"metadata": {},
"source": [
"Alternatively, if we instead know that the measurement was always taken on June 5th, we can specify that as well. Feel free to ignore the `PerformanceWarning` for now, but you can read more [here](https://stackoverflow.com/questions/44003107/pandas-vectorized-date-offset-operations-with-vector-of-differing-offsets) if you're interested."
]
},
{
"cell_type": "code",
"execution_count": 128,
"id": "afac7c59-8949-4424-8f03-6905da460a7a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/td/dtym05z13119cjklqb5tyd040000gn/T/ipykernel_89001/3513634716.py:1: PerformanceWarning: Non-vectorized DateOffset being applied to Series or DatetimeIndex.\n",
" df['year'] = df['year'] + pd.DateOffset(month=6, day=5)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" st_name | \n",
" year | \n",
" c_beer | \n",
" beer_tax | \n",
" btax_dollars | \n",
" population | \n",
" salestax | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" AL | \n",
" 1970-06-05 | \n",
" 33098 | \n",
" 72.341130 | \n",
" 2.370 | \n",
" 3450 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1 | \n",
" AL | \n",
" 1971-06-05 | \n",
" 37598 | \n",
" 69.304600 | \n",
" 2.370 | \n",
" 3497 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 2 | \n",
" AL | \n",
" 1972-06-05 | \n",
" 42719 | \n",
" 67.149190 | \n",
" 2.370 | \n",
" 3539 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 3 | \n",
" AL | \n",
" 1973-06-05 | \n",
" 46203 | \n",
" 63.217026 | \n",
" 2.370 | \n",
" 3580 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 4 | \n",
" AL | \n",
" 1974-06-05 | \n",
" 49769 | \n",
" 56.933796 | \n",
" 2.370 | \n",
" 3627 | \n",
" 4.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 1703 | \n",
" WY | \n",
" 1999-06-05 | \n",
" 12423 | \n",
" 0.319894 | \n",
" 0.045 | \n",
" 492 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1704 | \n",
" WY | \n",
" 2000-06-05 | \n",
" 12595 | \n",
" 0.309491 | \n",
" 0.045 | \n",
" 494 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1705 | \n",
" WY | \n",
" 2001-06-05 | \n",
" 12808 | \n",
" 0.300928 | \n",
" 0.045 | \n",
" 494 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1706 | \n",
" WY | \n",
" 2002-06-05 | \n",
" 13191 | \n",
" 0.296244 | \n",
" 0.045 | \n",
" 499 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1707 | \n",
" WY | \n",
" 2003-06-05 | \n",
" 15535 | \n",
" 0.289643 | \n",
" 0.045 | \n",
" 501 | \n",
" 4.0 | \n",
"
\n",
" \n",
"
\n",
"
1708 rows × 7 columns
\n",
"
"
],
"text/plain": [
" st_name year c_beer beer_tax btax_dollars population salestax\n",
"0 AL 1970-06-05 33098 72.341130 2.370 3450 4.0\n",
"1 AL 1971-06-05 37598 69.304600 2.370 3497 4.0\n",
"2 AL 1972-06-05 42719 67.149190 2.370 3539 4.0\n",
"3 AL 1973-06-05 46203 63.217026 2.370 3580 4.0\n",
"4 AL 1974-06-05 49769 56.933796 2.370 3627 4.0\n",
"... ... ... ... ... ... ... ...\n",
"1703 WY 1999-06-05 12423 0.319894 0.045 492 4.0\n",
"1704 WY 2000-06-05 12595 0.309491 0.045 494 4.0\n",
"1705 WY 2001-06-05 12808 0.300928 0.045 494 4.0\n",
"1706 WY 2002-06-05 13191 0.296244 0.045 499 4.0\n",
"1707 WY 2003-06-05 15535 0.289643 0.045 501 4.0\n",
"\n",
"[1708 rows x 7 columns]"
]
},
"execution_count": 128,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['year'] = df['year'] + pd.DateOffset(month=6, day=5)\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "7d159776-6280-4692-9df6-e58ee585fee9",
"metadata": {},
"source": [
"Note the difference between the `day` and `days` parameters: `days` adds to the current day, while `day` replaces it. The same distinction applies to `month` vs. `months`, `year` vs. `years`, etc."
]
},
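{
"cell_type": "markdown",
"id": "day-vs-days-sketch",
"metadata": {},
"source": [
"As a quick sketch of the distinction, using a standalone `Timestamp` rather than our DataFrame:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"ts = pd.Timestamp('1970-01-01')\n",
"\n",
"# days=4 shifts the date forward by four days\n",
"ts + pd.DateOffset(days=4)   # Timestamp('1970-01-05 00:00:00')\n",
"\n",
"# day=4 replaces the day-of-month with 4\n",
"ts + pd.DateOffset(day=4)    # Timestamp('1970-01-04 00:00:00')\n",
"```"
]
},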
{
"cell_type": "markdown",
"id": "f7a42933-3067-4ba9-84ba-edca31d28437",
"metadata": {},
"source": [
"Now, let's look at how a datapoint is stored."
]
},
{
"cell_type": "code",
"execution_count": 129,
"id": "733267e0-7b99-4744-b169-05b41c065e6b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('1970-06-05 00:00:00')"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[0,'year']"
]
},
{
"cell_type": "markdown",
"id": "b1bb016b-091f-45fd-9fb6-3749fa6c556a",
"metadata": {},
"source": [
"At index 20, the year is 1990. Let's subtract the two dates."
]
},
{
"cell_type": "code",
"execution_count": 130,
"id": "a28af408-7db0-47a9-b042-e56864dba325",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timedelta('7305 days 00:00:00')"
]
},
"execution_count": 130,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[20,'year'] - df.loc[0,'year']"
]
},
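{
"cell_type": "markdown",
"id": "timedelta-days-sketch",
"metadata": {},
"source": [
"The result is a `Timedelta` object; its `.days` attribute gives the day count as a plain integer. A minimal sketch with standalone `Timestamp`s:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"delta = pd.Timestamp('1990-06-05') - pd.Timestamp('1970-06-05')\n",
"delta.days   # 7305\n",
"```"
]
},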
{
"cell_type": "markdown",
"id": "5980e069-0c93-4ac8-8941-c33306b89091",
"metadata": {},
"source": [
"Interestingly, we can also subtract the years directly."
]
},
{
"cell_type": "code",
"execution_count": 131,
"id": "17a29bb8-ccd7-4f7c-95fa-cc0999c7e31b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"20"
]
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[20,'year'].year - df.loc[0,'year'].year"
]
},
{
"cell_type": "markdown",
"id": "6391925f-adcb-41a3-bb40-0d63e1f1a830",
"metadata": {},
"source": [
"Let's also try subtracting the months."
]
},
{
"cell_type": "code",
"execution_count": 132,
"id": "da892e61-e039-44de-8de2-f7bd617456eb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 132,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[20,'year'].month - df.loc[0,'year'].month"
]
},
{
"cell_type": "markdown",
"id": "b51edfee-8917-4c83-9213-6fe2a4ae7caf",
"metadata": {},
"source": [
"The months are subtracted directly, and since both dates were set to June, the difference is 0. To get the total number of months between two dates, we must instead subtract the years, multiply that by 12, and then add the difference of the months. The calculation is done below."
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "cff93a44-8540-4774-9250-34ef0357fc9e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"240"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(df.loc[20,'year'].year - df.loc[0,'year'].year) * 12 + (df.loc[20,'year'].month - df.loc[0,'year'].month)"
]
},
{
"cell_type": "markdown",
"id": "b7d0e702-e690-4617-a18c-45d24e5a9c7d",
"metadata": {},
"source": [
"Finally, using DateTime objects also helps us sort dates easily!"
]
},
{
"cell_type": "code",
"execution_count": 133,
"id": "a0f1b9b8-8910-4dcb-8903-4dc8d8d8d9fe",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" st_name | \n",
" year | \n",
" c_beer | \n",
" beer_tax | \n",
" btax_dollars | \n",
" population | \n",
" salestax | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" AL | \n",
" 1970-06-05 | \n",
" 33098 | \n",
" 72.341130 | \n",
" 2.3700 | \n",
" 3450 | \n",
" 4.00 | \n",
"
\n",
" \n",
" 170 | \n",
" CO | \n",
" 1970-06-05 | \n",
" 42145 | \n",
" 4.120698 | \n",
" 0.1350 | \n",
" 2224 | \n",
" 3.00 | \n",
"
\n",
" \n",
" 592 | \n",
" LA | \n",
" 1970-06-05 | \n",
" 68578 | \n",
" 25.456755 | \n",
" 0.8340 | \n",
" 3650 | \n",
" 2.00 | \n",
"
\n",
" \n",
" 1302 | \n",
" RI | \n",
" 1970-06-05 | \n",
" 20360 | \n",
" 5.686564 | \n",
" 0.1863 | \n",
" 951 | \n",
" 5.00 | \n",
"
\n",
" \n",
" 860 | \n",
" MT | \n",
" 1970-06-05 | \n",
" 17179 | \n",
" 6.648059 | \n",
" 0.2178 | \n",
" 697 | \n",
" 0.00 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 135 | \n",
" AR | \n",
" 2003-06-05 | \n",
" 52045 | \n",
" 3.507258 | \n",
" 0.5449 | \n",
" 2726 | \n",
" 5.13 | \n",
"
\n",
" \n",
" 893 | \n",
" MT | \n",
" 2003-06-05 | \n",
" 26100 | \n",
" 2.008837 | \n",
" 0.3121 | \n",
" 918 | \n",
" 0.00 | \n",
"
\n",
" \n",
" 1471 | \n",
" TX | \n",
" 2003-06-05 | \n",
" 558837 | \n",
" 2.867468 | \n",
" 0.4455 | \n",
" 22119 | \n",
" 6.25 | \n",
"
\n",
" \n",
" 1675 | \n",
" WI | \n",
" 2003-06-05 | \n",
" 151000 | \n",
" 0.934582 | \n",
" 0.1452 | \n",
" 5472 | \n",
" 5.00 | \n",
"
\n",
" \n",
" 1707 | \n",
" WY | \n",
" 2003-06-05 | \n",
" 15535 | \n",
" 0.289643 | \n",
" 0.0450 | \n",
" 501 | \n",
" 4.00 | \n",
"
\n",
" \n",
"
\n",
"
1708 rows × 7 columns
\n",
"
"
],
"text/plain": [
" st_name year c_beer beer_tax btax_dollars population salestax\n",
"0 AL 1970-06-05 33098 72.341130 2.3700 3450 4.00\n",
"170 CO 1970-06-05 42145 4.120698 0.1350 2224 3.00\n",
"592 LA 1970-06-05 68578 25.456755 0.8340 3650 2.00\n",
"1302 RI 1970-06-05 20360 5.686564 0.1863 951 5.00\n",
"860 MT 1970-06-05 17179 6.648059 0.2178 697 0.00\n",
"... ... ... ... ... ... ... ...\n",
"135 AR 2003-06-05 52045 3.507258 0.5449 2726 5.13\n",
"893 MT 2003-06-05 26100 2.008837 0.3121 918 0.00\n",
"1471 TX 2003-06-05 558837 2.867468 0.4455 22119 6.25\n",
"1675 WI 2003-06-05 151000 0.934582 0.1452 5472 5.00\n",
"1707 WY 2003-06-05 15535 0.289643 0.0450 501 4.00\n",
"\n",
"[1708 rows x 7 columns]"
]
},
"execution_count": 133,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values('year')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:sklearn-env]",
"language": "python",
"name": "conda-env-sklearn-env-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}