{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9aad9273",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import statsmodels.api as sm\n",
    "from statsmodels.api import OLS"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "334763be",
   "metadata": {},
   "source": [
    "# Linear Regression"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "038df8c3",
   "metadata": {},
   "source": [
    "Running linear regressions with `pandas` DataFrames is easy! Let us begin by loading in a dataset that has the hourly wage, years of schooling, and other information on thousands of people sampled in the March 2012 Current Population Survey."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d4f26af6",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>state</th>\n",
       "      <th>age</th>\n",
       "      <th>wagesal</th>\n",
       "      <th>imm</th>\n",
       "      <th>hispanic</th>\n",
       "      <th>black</th>\n",
       "      <th>asian</th>\n",
       "      <th>educ</th>\n",
       "      <th>wage</th>\n",
       "      <th>logwage</th>\n",
       "      <th>female</th>\n",
       "      <th>fedwkr</th>\n",
       "      <th>statewkr</th>\n",
       "      <th>localwkr</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>11</td>\n",
       "      <td>44</td>\n",
       "      <td>18000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>9.109312</td>\n",
       "      <td>2.209297</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>11</td>\n",
       "      <td>39</td>\n",
       "      <td>18000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>18.000000</td>\n",
       "      <td>2.890372</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>11</td>\n",
       "      <td>39</td>\n",
       "      <td>35600</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>17.115385</td>\n",
       "      <td>2.839978</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11</td>\n",
       "      <td>39</td>\n",
       "      <td>8000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>5.128205</td>\n",
       "      <td>1.634756</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>11</td>\n",
       "      <td>39</td>\n",
       "      <td>100000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>38.461538</td>\n",
       "      <td>3.649659</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21902</th>\n",
       "      <td>95</td>\n",
       "      <td>36</td>\n",
       "      <td>125000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>18</td>\n",
       "      <td>60.096154</td>\n",
       "      <td>4.095946</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21903</th>\n",
       "      <td>95</td>\n",
       "      <td>38</td>\n",
       "      <td>70000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>18</td>\n",
       "      <td>26.923077</td>\n",
       "      <td>3.292984</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21904</th>\n",
       "      <td>95</td>\n",
       "      <td>43</td>\n",
       "      <td>48208</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>20.601709</td>\n",
       "      <td>3.025374</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21905</th>\n",
       "      <td>95</td>\n",
       "      <td>43</td>\n",
       "      <td>75000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>18</td>\n",
       "      <td>36.057692</td>\n",
       "      <td>3.585120</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21906</th>\n",
       "      <td>95</td>\n",
       "      <td>44</td>\n",
       "      <td>50000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>20</td>\n",
       "      <td>24.038462</td>\n",
       "      <td>3.179655</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>21907 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       state  age  wagesal  imm  hispanic  black  asian  educ       wage  \\\n",
       "0         11   44    18000    0         0      0      0    14   9.109312   \n",
       "1         11   39    18000    0         0      0      0    14  18.000000   \n",
       "2         11   39    35600    0         0      0      0    12  17.115385   \n",
       "3         11   39     8000    0         0      0      0    14   5.128205   \n",
       "4         11   39   100000    0         0      0      0    16  38.461538   \n",
       "...      ...  ...      ...  ...       ...    ...    ...   ...        ...   \n",
       "21902     95   36   125000    0         0      0      0    18  60.096154   \n",
       "21903     95   38    70000    0         0      0      1    18  26.923077   \n",
       "21904     95   43    48208    0         0      0      0    14  20.601709   \n",
       "21905     95   43    75000    0         0      0      0    18  36.057692   \n",
       "21906     95   44    50000    1         0      0      1    20  24.038462   \n",
       "\n",
       "        logwage  female  fedwkr  statewkr  localwkr  \n",
       "0      2.209297       1       1         0         0  \n",
       "1      2.890372       0       0         0         0  \n",
       "2      2.839978       0       0         0         1  \n",
       "3      1.634756       1       0         0         0  \n",
       "4      3.649659       0       1         0         0  \n",
       "...         ...     ...     ...       ...       ...  \n",
       "21902  4.095946       0       0         1         0  \n",
       "21903  3.292984       1       0         0         0  \n",
       "21904  3.025374       1       0         0         0  \n",
       "21905  3.585120       0       0         0         0  \n",
       "21906  3.179655       1       0         1         0  \n",
       "\n",
       "[21907 rows x 14 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the CPS extract; the relative path is resolved against the current working directory.\n",
    "cps_df = pd.read_csv('data/cps.csv')\n",
    "# A bare last expression renders the DataFrame as this cell's output.\n",
    "cps_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cfb44bd0",
   "metadata": {},
   "source": [
    "`statsmodels` is a popular Python package used to create and analyze various statistical models. To create a linear regression model in `statsmodels`, which is generally imported as `sm`, we can use the following skeleton code:\n",
    "\n",
    "    x = data[[]]                                \n",
    "    y = data[]                                  \n",
    "    model = sm.OLS(y, sm.add_constant(x))      \n",
    "    result = model.fit()                        \n",
    "    result.summary()                            \n",
    "    \n",
    "In the above code, you begin by selecting your x-variables as a DataFrame and your y-variable as a Series. You then initialize an OLS model, adding an intercept term (with `sm.add_constant()`) if necessary. Finally, you fit the OLS model and display the results. For example, below we run a regression where we estimate people's log wage (`logwage`) based on their number of years of education (`educ`), race (`hispanic`, `black`, `asian`) and sex (`female`). Note how we deliberately do not include indicators for the sex `male` and the race `white` in our regression to avoid [linear dependency](https://stats.stackexchange.com/questions/143324/what-is-the-significance-of-a-linear-dependency-in-a-polynomial-regression) among the regressors.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d43d63f9",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>         <td>logwage</td>     <th>  R-squared:         </th> <td>   0.250</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.250</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   1462.</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Wed, 10 Jan 2024</td> <th>  Prob (F-statistic):</th>  <td>  0.00</td>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>15:13:30</td>     <th>  Log-Likelihood:    </th> <td> -19851.</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td> 21907</td>      <th>  AIC:               </th> <td>3.971e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td> 21901</td>      <th>  BIC:               </th> <td>3.976e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     5</td>      <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "      <td></td>        <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>const</th>    <td>    1.6476</td> <td>    0.022</td> <td>   73.311</td> <td> 0.000</td> <td>    1.604</td> <td>    1.692</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>educ</th>     <td>    0.1070</td> <td>    0.002</td> <td>   71.139</td> <td> 0.000</td> <td>    0.104</td> <td>    0.110</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>hispanic</th> <td>   -0.0717</td> <td>    0.011</td> <td>   -6.333</td> <td> 0.000</td> <td>   -0.094</td> <td>   -0.050</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>black</th>    <td>   -0.1250</td> <td>    0.014</td> <td>   -9.249</td> <td> 0.000</td> <td>   -0.152</td> <td>   -0.099</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>asian</th>    <td>   -0.0041</td> <td>    0.017</td> <td>   -0.244</td> <td> 0.807</td> <td>   -0.037</td> <td>    0.029</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>female</th>   <td>   -0.2833</td> <td>    0.008</td> <td>  -34.885</td> <td> 0.000</td> <td>   -0.299</td> <td>   -0.267</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td>1131.830</td> <th>  Durbin-Watson:     </th> <td>   1.852</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th>  <td> 0.000</td>  <th>  Jarque-Bera (JB):  </th> <td>3713.696</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>           <td> 0.188</td>  <th>  Prob(JB):          </th> <td>    0.00</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>       <td> 4.982</td>  <th>  Cond. No.          </th> <td>    82.6</td>\n",
       "</tr>\n",
       "</table><br/><br/>Notes:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified."
      ],
      "text/latex": [
       "\\begin{center}\n",
       "\\begin{tabular}{lclc}\n",
       "\\toprule\n",
       "\\textbf{Dep. Variable:}    &     logwage      & \\textbf{  R-squared:         } &     0.250   \\\\\n",
       "\\textbf{Model:}            &       OLS        & \\textbf{  Adj. R-squared:    } &     0.250   \\\\\n",
       "\\textbf{Method:}           &  Least Squares   & \\textbf{  F-statistic:       } &     1462.   \\\\\n",
       "\\textbf{Date:}             & Wed, 10 Jan 2024 & \\textbf{  Prob (F-statistic):} &     0.00    \\\\\n",
       "\\textbf{Time:}             &     15:13:30     & \\textbf{  Log-Likelihood:    } &   -19851.   \\\\\n",
       "\\textbf{No. Observations:} &       21907      & \\textbf{  AIC:               } & 3.971e+04   \\\\\n",
       "\\textbf{Df Residuals:}     &       21901      & \\textbf{  BIC:               } & 3.976e+04   \\\\\n",
       "\\textbf{Df Model:}         &           5      & \\textbf{                     } &             \\\\\n",
       "\\textbf{Covariance Type:}  &    nonrobust     & \\textbf{                     } &             \\\\\n",
       "\\bottomrule\n",
       "\\end{tabular}\n",
       "\\begin{tabular}{lcccccc}\n",
       "                  & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]}  \\\\\n",
       "\\midrule\n",
       "\\textbf{const}    &       1.6476  &        0.022     &    73.311  &         0.000        &        1.604    &        1.692     \\\\\n",
       "\\textbf{educ}     &       0.1070  &        0.002     &    71.139  &         0.000        &        0.104    &        0.110     \\\\\n",
       "\\textbf{hispanic} &      -0.0717  &        0.011     &    -6.333  &         0.000        &       -0.094    &       -0.050     \\\\\n",
       "\\textbf{black}    &      -0.1250  &        0.014     &    -9.249  &         0.000        &       -0.152    &       -0.099     \\\\\n",
       "\\textbf{asian}    &      -0.0041  &        0.017     &    -0.244  &         0.807        &       -0.037    &        0.029     \\\\\n",
       "\\textbf{female}   &      -0.2833  &        0.008     &   -34.885  &         0.000        &       -0.299    &       -0.267     \\\\\n",
       "\\bottomrule\n",
       "\\end{tabular}\n",
       "\\begin{tabular}{lclc}\n",
       "\\textbf{Omnibus:}       & 1131.830 & \\textbf{  Durbin-Watson:     } &    1.852  \\\\\n",
       "\\textbf{Prob(Omnibus):} &   0.000  & \\textbf{  Jarque-Bera (JB):  } & 3713.696  \\\\\n",
       "\\textbf{Skew:}          &   0.188  & \\textbf{  Prob(JB):          } &     0.00  \\\\\n",
       "\\textbf{Kurtosis:}      &   4.982  & \\textbf{  Cond. No.          } &     82.6  \\\\\n",
       "\\bottomrule\n",
       "\\end{tabular}\n",
       "%\\caption{OLS Regression Results}\n",
       "\\end{center}\n",
       "\n",
       "Notes: \\newline\n",
       " [1] Standard Errors assume that the covariance matrix of the errors is correctly specified."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                logwage   R-squared:                       0.250\n",
       "Model:                            OLS   Adj. R-squared:                  0.250\n",
       "Method:                 Least Squares   F-statistic:                     1462.\n",
       "Date:                Wed, 10 Jan 2024   Prob (F-statistic):               0.00\n",
       "Time:                        15:13:30   Log-Likelihood:                -19851.\n",
       "No. Observations:               21907   AIC:                         3.971e+04\n",
       "Df Residuals:                   21901   BIC:                         3.976e+04\n",
       "Df Model:                           5                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "const          1.6476      0.022     73.311      0.000       1.604       1.692\n",
       "educ           0.1070      0.002     71.139      0.000       0.104       0.110\n",
       "hispanic      -0.0717      0.011     -6.333      0.000      -0.094      -0.050\n",
       "black         -0.1250      0.014     -9.249      0.000      -0.152      -0.099\n",
       "asian         -0.0041      0.017     -0.244      0.807      -0.037       0.029\n",
       "female        -0.2833      0.008    -34.885      0.000      -0.299      -0.267\n",
       "==============================================================================\n",
       "Omnibus:                     1131.830   Durbin-Watson:                   1.852\n",
       "Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3713.696\n",
       "Skew:                           0.188   Prob(JB):                         0.00\n",
       "Kurtosis:                       4.982   Cond. No.                         82.6\n",
       "==============================================================================\n",
       "\n",
       "Notes:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "\"\"\""
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Regressors: years of education plus the race and sex indicator columns.\n",
    "regressor_cols = ['educ', 'hispanic', 'black', 'asian', 'female']\n",
    "x = cps_df[regressor_cols]\n",
    "y = cps_df['logwage']\n",
    "\n",
    "# sm.OLS does not add an intercept on its own, so prepend a constant column.\n",
    "design = sm.add_constant(x)\n",
    "model = sm.OLS(y, design)\n",
    "result = model.fit()\n",
    "result.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c1467e0",
   "metadata": {},
   "source": [
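    "The regression looks good! Holding race and sex fixed, each additional year of education is associated with a log wage that is about 0.107 higher (roughly an 11% higher hourly wage), and every coefficient except the one on `asian` is statistically significant at the 5% level.\n",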
    "\n",
    "For more detailed information on running various types of regressions, feel free to look at the [`Econometrics` chapter](https://aeturrell.github.io/coding-for-economists/econmt-regression.html) from the online textbook [Coding for Economists](https://aeturrell.github.io/coding-for-economists/intro.html), or various chapters from the online textbook [Causal Inference for The Brave and True](https://matheusfacure.github.io/python-causality-handbook/landing-page.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94d355c7-ffcf-4865-ac0f-056bf4ade721",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:sklearn-env]",
   "language": "python",
   "name": "conda-env-sklearn-env-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}