From fe25bbacb6eb82e461219783fcdabba59debcf60 Mon Sep 17 00:00:00 2001
From: taureandyernv <tdyer@nvidia.com>
Date: Fri, 21 Jun 2019 15:29:46 -0700
Subject: [PATCH] starting version of census dask

---
 .../census_education2income_demo_dask.ipynb   | 717 ++++++++++++++++++
 1 file changed, 717 insertions(+)
 create mode 100644 intermediate_notebooks/E2E/census/census_education2income_demo_dask.ipynb

diff --git a/intermediate_notebooks/E2E/census/census_education2income_demo_dask.ipynb b/intermediate_notebooks/E2E/census/census_education2income_demo_dask.ipynb
new file mode 100644
index 00000000..8a06eddd
--- /dev/null
+++ b/intermediate_notebooks/E2E/census/census_education2income_demo_dask.ipynb
@@ -0,0 +1,717 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Census Notebook\n",
+    "## Doin' it Dask-y Style!\n",
+    "\n",
+    "Held every 10 years, the US census gives a detailed snapshot in time about the makeup of the country.  The last census in 2010 surveyed nearly 309 million people.  IPUMS.org provides researchers an open source data set with 1% to 10% of the census data set.  In this notebook, we want to see how education affects total income earned in the US based on data from each census from 1970 to 2010 and see if we can predict some results if the census was held today, according to the national average.  We will go through the ETL, training the model, and then testing the prediction.  We'll make every effort to get as balanced a dataset as we can.  We'll also pull some extra variables to allow for further self-exploration of gender based education and income breakdowns.  On a single Titan RTX, you can run the whole notebook workflow on the 4GB dataset of 14 million rows by 44 columns in less than 3 minutes.  "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Let's begin!**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import cuml\n",
+    "import cudf\n",
+    "import dask_cudf\n",
+    "import dask_cuml\n",
+    "import sys\n",
+    "import os\n",
+    "import gzip\n",
+    "from pprint import pprint\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get your data!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: requests in /conda/envs/rapids/lib/python3.7/site-packages (2.22.0)\n",
+      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /conda/envs/rapids/lib/python3.7/site-packages (from requests) (1.25.3)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /conda/envs/rapids/lib/python3.7/site-packages (from requests) (2019.6.16)\n",
+      "Requirement already satisfied: idna<2.9,>=2.5 in /conda/envs/rapids/lib/python3.7/site-packages (from requests) (2.8)\n",
+      "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /conda/envs/rapids/lib/python3.7/site-packages (from requests) (3.0.4)\n",
+      "data (Delayed('int-dc8804f3-cde4-428b-a37d-bb37cdc7808a'), 45)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# for those of us using dask\n",
+    "!pip install requests\n",
+    "\"\"\"dtype={'YEAR': 'float64',\n",
+    "       'DATANUM': 'float64',\n",
+    "       'CBSERIAL': 'float64',\n",
+    "       'HHWT': 'float64',\n",
+    "       'CPI99': 'float64',\n",
+    "       'GQ': 'float64',\n",
+    "       'QGQ': 'float64',\n",
+    "       'PERNUM': 'float64',\n",
+    "       'PERWT': 'float64',\n",
+    "       'SEX': 'float64',\n",
+    "       'AGE': 'float64',\n",
+    "       'EDUC': 'float64',\n",
+    "       'EDUCD': 'float64',\n",
+    "       'INCTOT': 'float64',\n",
+    "       'SEX_HEAD': 'float64',\n",
+    "       'SEX_MOM': 'float64',\n",
+    "       'SEX_POP': 'float64',\n",
+    "       'SEX_SP': 'float64',\n",
+    "       'SEX_MOM2': 'float64',\n",
+    "       'SEX_POP2': 'float64',\n",
+    "       'AGE_HEAD': 'float64',\n",
+    "       'AGE_MOM': 'float64',\n",
+    "       'AGE_POP': 'float64',\n",
+    "       'AGE_SP': 'float64',\n",
+    "       'AGE_MOM2': 'float64',\n",
+    "       'AGE_POP2': 'float64',\n",
+    "       'EDUC_HEAD': 'float64',\n",
+    "       'EDUC_MOM': 'float64',\n",
+    "       'EDUC_POP': 'float64',\n",
+    "       'EDUC_SP': 'float64',\n",
+    "       'EDUC_MOM2': 'float64',\n",
+    "       'EDUC_POP2': 'float64',\n",
+    "       'EDUCD_HEAD': 'float64',\n",
+    "       'EDUCD_MOM': 'float64',\n",
+    "       'EDUCD_POP': 'float64',\n",
+    "       'EDUCD_SP': 'float64',\n",
+    "       'EDUCD_MOM2': 'float64',\n",
+    "       'EDUCD_POP2': 'float64',\n",
+    "       'INCTOT_HEAD': 'float64',\n",
+    "       'INCTOT_MOM': 'float64',\n",
+    "       'INCTOT_POP': 'float64',\n",
+    "       'INCTOT_SP': 'float64',\n",
+    "       'INCTOT_MOM2': 'float64',\n",
+    "       'INCTOT_POP2': 'float64'}\"\"\"\n",
+    "\n",
+    "df = dask_cudf.read_csv('https://rapidsai-data.s3.us-east-2.amazonaws.com/datasets/ipums_education2income_1970-2010.csv.gz', assume_missing=True, compression='gzip').persist()  # persist once so later cells reuse the parsed frame instead of re-reading the remote file\n",
+    "print('data', (len(df), len(df.columns)))  # df.shape is lazy under dask (its row count is a Delayed), so count explicitly"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "     YEAR  DATANUM  SERIAL  CBSERIAL   HHWT  CPI99   GQ  QGQ  PERNUM  PERWT  \\\n",
+      "0  1970.0      2.0     1.0       NaN  100.0   4.54  1.0  0.0     1.0  100.0   \n",
+      "1  1970.0      2.0     1.0       NaN  100.0   4.54  1.0  0.0     2.0  100.0   \n",
+      "2  1970.0      2.0     2.0       NaN  100.0   4.54  1.0  0.0     1.0  100.0   \n",
+      "3  1970.0      2.0     2.0       NaN  100.0   4.54  1.0  0.0     2.0  100.0   \n",
+      "4  1970.0      2.0     4.0       NaN  100.0   4.54  1.0  0.0     1.0  100.0   \n",
+      "\n",
+      "      ...       EDUCD_POP  EDUCD_SP  EDUCD_MOM2  EDUCD_POP2  INCTOT_HEAD  \\\n",
+      "0     ...             NaN      30.0         NaN         NaN      12450.0   \n",
+      "1     ...             NaN      60.0         NaN         NaN      12450.0   \n",
+      "2     ...             NaN      60.0         NaN         NaN       9050.0   \n",
+      "3     ...             NaN      70.0         NaN         NaN       9050.0   \n",
+      "4     ...             NaN      23.0         NaN         NaN       7450.0   \n",
+      "\n",
+      "   INCTOT_MOM  INCTOT_POP  INCTOT_SP  INCTOT_MOM2  INCTOT_POP2  \n",
+      "0         NaN         NaN     3450.0          NaN          NaN  \n",
+      "1         NaN         NaN    12450.0          NaN          NaN  \n",
+      "2         NaN         NaN        0.0          NaN          NaN  \n",
+      "3         NaN         NaN     9050.0          NaN          NaN  \n",
+      "4         NaN         NaN      650.0          NaN          NaN  \n",
+      "\n",
+      "[5 rows x 45 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df.head(5).to_pandas())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "YEAR           float64\n",
+       "DATANUM        float64\n",
+       "SERIAL         float64\n",
+       "CBSERIAL       float64\n",
+       "HHWT           float64\n",
+       "CPI99          float64\n",
+       "GQ             float64\n",
+       "QGQ            float64\n",
+       "PERNUM         float64\n",
+       "PERWT          float64\n",
+       "SEX            float64\n",
+       "AGE            float64\n",
+       "EDUC           float64\n",
+       "EDUCD          float64\n",
+       "INCTOT         float64\n",
+       "SEX_HEAD       float64\n",
+       "SEX_MOM        float64\n",
+       "SEX_POP        float64\n",
+       "SEX_SP         float64\n",
+       "SEX_MOM2       float64\n",
+       "SEX_POP2       float64\n",
+       "AGE_HEAD       float64\n",
+       "AGE_MOM        float64\n",
+       "AGE_POP        float64\n",
+       "AGE_SP         float64\n",
+       "AGE_MOM2       float64\n",
+       "AGE_POP2       float64\n",
+       "EDUC_HEAD      float64\n",
+       "EDUC_MOM       float64\n",
+       "EDUC_POP       float64\n",
+       "EDUC_SP        float64\n",
+       "EDUC_MOM2      float64\n",
+       "EDUC_POP2      float64\n",
+       "EDUCD_HEAD     float64\n",
+       "EDUCD_MOM      float64\n",
+       "EDUCD_POP      float64\n",
+       "EDUCD_SP       float64\n",
+       "EDUCD_MOM2     float64\n",
+       "EDUCD_POP2     float64\n",
+       "INCTOT_HEAD    float64\n",
+       "INCTOT_MOM     float64\n",
+       "INCTOT_POP     float64\n",
+       "INCTOT_SP      float64\n",
+       "INCTOT_MOM2    float64\n",
+       "INCTOT_POP2    float64\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<dask_cudf.Series | 6 tasks | 1 npartitions>\n"
+     ]
+    }
+   ],
+   "source": [
+    "original_counts = df.YEAR.value_counts().compute()  # compute() turns the lazy dask series into the actual per-year counts\n",
+    "print(original_counts) ### Remember these numbers!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ETL"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Cleaning Income data\n",
+    "First, let's focus on cleaning out the bad values for Total Income `INCTOT`. Let's start by checking whether there are any `N/A` values; when we ran `head()`, we saw some in other columns, like `CBSERIAL`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['INCTOT_NA'] = df['INCTOT'].isna()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<dask_cudf.Series | 9 tasks | 1 npartitions>\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df.INCTOT_NA.value_counts().compute())  # compute() so the True/False counts are printed, not the lazy task graph"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Okay, great, there are no `N/A`s...or are there?  Let's drop `INCTOT_NA` and see what our value counts look like"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NotImplementedError",
+     "evalue": "Drop currently only works for axis=1",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNotImplementedError\u001b[0m                       Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-13-606e03e298e8>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'INCTOT_NA'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mINCTOT\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_pandas\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m### Wow, look how many people in America make $10,000,000!  Wait a minutes...\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/conda/envs/rapids/lib/python3.7/site-packages/dask/dataframe/core.py\u001b[0m in \u001b[0;36mdrop\u001b[0;34m(self, labels, axis, errors)\u001b[0m\n\u001b[1;32m   2936\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2937\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap_partitions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mM\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2938\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Drop currently only works for axis=1\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2939\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2940\u001b[0m     def merge(self, right, how='inner', on=None, left_on=None, right_on=None,\n",
+      "\u001b[0;31mNotImplementedError\u001b[0m: Drop currently only works for axis=1"
+     ]
+    }
+   ],
+   "source": [
+    "df = df.drop('INCTOT_NA', axis=1)  # dask only implements drop for columns, so axis=1 must be passed explicitly\n",
+    "print(df.INCTOT.value_counts().compute().to_pandas())  ### Wow, look how many people in America make $10,000,000!  Wait a minute... "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Not that many people make $10M a year. Checking https://usa.ipums.org/usa-action/variables/INCTOT#codes_section, `9999999` is INCTOT's code for `N/A`.  That is why `isna` did not find any: the missing values are encoded as a number rather than stored as nulls.  Let's first create a new dataframe that is only NA values, then let's pull those encoded `N/A`s out of our working dataframe!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "data (Delayed('int-751d1b42-b1e5-4452-b42b-9e4e577cebc8'), 46)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('data', (len(df), len(df.columns)))  # concrete (rows, columns); df.shape would print a Delayed row count\n",
+    "tdf = df.query('INCTOT == 9999999')\n",
+    "df = df.query('INCTOT != 9999999')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "working data (Delayed('int-71c4905f-4db3-4431-8a68-a86e1e11c5c8'), 46)\n",
+      "junk count data (Delayed('int-57901cce-2bc1-4c06-80d3-bf7529deac03'), 46)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('working data', (len(df), len(df.columns)))  # len() computes the row count that df.shape leaves as a Delayed\n",
+    "print('junk count data', (len(tdf), len(tdf.columns)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We're down by nearly 1/4 of our original dataset size.  For the curious, now we should be able to get accurate Total Income data, by year, not taking into account inflation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<dask_cudf.DataFrame | 13 tasks | 1 npartitions>\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df.groupby('YEAR')['INCTOT'].mean().compute()) # without that cleanup, the average would have been in the millions...."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Normalize Income for inflation\n",
+    "Now that we have reduced our dataframe to a clean baseline for answering our question, we should normalize the amounts for inflation.  `CPI99` is the variable that IPUMS uses to contain the inflation factor.  All we have to do is multiply each row's `INCTOT` by its `CPI99`.  Let's see how that changes the Total Income values from just above!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<dask_cudf.DataFrame | 21 tasks | 1 npartitions>\n",
+      "<dask_cudf.DataFrame | 25 tasks | 1 npartitions>\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<cudf.DataFrame ncols=46 nrows=16833597 >"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "print(df.groupby('YEAR')['CPI99'].mean().compute()) ## it just returns the CPI99\n",
+    "df['INCTOT'] = df['INCTOT'] * df['CPI99']  # NOTE: not idempotent - re-running this cell applies the CPI99 factor again\n",
+    "print(df.groupby('YEAR')['INCTOT'].mean().compute()) ## let's see what we got!\n",
+    "df = df.persist()  # keep the adjusted frame materialized for later cells; a bare df.compute() builds the full frame only to show its repr and drop it"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Cleaning Education Data\n",
+    "Okay, great!  Now that we have income cleaned up, it should also have cleaned much of our next sets of values of interest, namely Education and Education Detailed.  However, there are still some `N/A`s in key variables to worry about, which can cause problems later.  Let's create a list of them..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "suspect = ['CBSERIAL','EDUC', 'EDUCD', 'EDUC_HEAD', 'EDUC_POP', 'EDUC_MOM','EDUCD_MOM2','EDUCD_POP2', 'INCTOT_MOM','INCTOT_POP','INCTOT_MOM2','INCTOT_POP2', 'INCTOT_HEAD']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CBSERIAL <dask_cudf.Series | 25 tasks | 1 npartitions>\n",
+      "EDUC <dask_cudf.Series | 28 tasks | 1 npartitions>\n",
+      "EDUCD <dask_cudf.Series | 31 tasks | 1 npartitions>\n",
+      "EDUC_HEAD <dask_cudf.Series | 34 tasks | 1 npartitions>\n",
+      "EDUC_POP <dask_cudf.Series | 37 tasks | 1 npartitions>\n",
+      "EDUC_MOM <dask_cudf.Series | 40 tasks | 1 npartitions>\n",
+      "EDUCD_MOM2 <dask_cudf.Series | 43 tasks | 1 npartitions>\n",
+      "EDUCD_POP2 <dask_cudf.Series | 46 tasks | 1 npartitions>\n",
+      "INCTOT_MOM <dask_cudf.Series | 49 tasks | 1 npartitions>\n",
+      "INCTOT_POP <dask_cudf.Series | 52 tasks | 1 npartitions>\n",
+      "INCTOT_MOM2 <dask_cudf.Series | 55 tasks | 1 npartitions>\n",
+      "INCTOT_POP2 <dask_cudf.Series | 58 tasks | 1 npartitions>\n",
+      "INCTOT_HEAD <dask_cudf.Series | 61 tasks | 1 npartitions>\n"
+     ]
+    }
+   ],
+   "source": [
+    "for col in suspect:  # iterate the column names directly instead of indexing by position\n",
+    "    df[col] = df[col].fillna(-1)  # -1 is the sentinel for a missing value\n",
+    "    print(col, df[col].value_counts().compute())  # compute() so the counts are printed, not the lazy task graph"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's drop any rows with `-1`s in Education and Education Detailed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "EDUC\n",
+      "EDUCD\n"
+     ]
+    }
+   ],
+   "source": [
+    "totincome = ['EDUC','EDUCD']\n",
+    "for col in totincome:\n",
+    "    # keep only the rows whose value in this column is not the -1 sentinel\n",
+    "    df = df.query(col + ' != -1')\n",
+    "    print(col)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(Delayed('int-6622dc1c-5aa7-4b4a-ac91-4dcc7ccee0f3'), 46)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print((len(df), len(df.columns)))  # concrete (rows, columns); df.shape would print a Delayed row count\n",
+    "df.head().to_pandas()  # head() already returns just the first rows, so a second .head() is redundant"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Well, the good news is that we lost no further rows.  Now let's start to normalize the data so that when we do our OLS, one year doesn't unfairly dominate the data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Normalize the Data\n",
+    "In this last step, we need to keep our data at about the same ratio as when we started (1% of the population), with the exception of 1980, which was a 5% sample and needs to be reduced.  This is why we kept the temp dataframe `tdf` - to get the counts per year.  We will find out just how many rows we have to remove"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('Working data: \\n', df.YEAR.value_counts().compute())  # compute() so the per-year counts are printed, not the lazy task graph\n",
+    "print('junk count data: \\n', tdf.YEAR.value_counts().compute())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And now, so that we can compute MSE, let's check that all the dtypes are the same. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "keep_cols = ['YEAR', 'DATANUM', 'SERIAL', 'CBSERIAL', 'HHWT', 'GQ', 'PERNUM', 'SEX', 'AGE', 'INCTOT', 'EDUC', 'EDUCD', 'EDUC_HEAD', 'EDUC_POP', 'EDUC_MOM','EDUCD_MOM2','EDUCD_POP2', 'INCTOT_MOM','INCTOT_POP','INCTOT_MOM2','INCTOT_POP2', 'INCTOT_HEAD', 'SEX_HEAD']\n",
+    "df = df.loc[:, keep_cols]\n",
+    "# fill the nulls left in the kept columns with the -1 sentinel, then show each column's value counts\n",
+    "for col in keep_cols:\n",
+    "    df[col] = df[col].fillna(-1)\n",
+    "    print(col, df[col].value_counts().compute())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## TODO: reduce the 1980 sample here (5% of the population vs. 1% for the other years). This needs .sample(), which is not working here, unless there is a workaround..."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With the important data now clean and normalized, let's start doing the regression"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Ridge Regression\n",
+    "We have 44 variables.  The other variables may provide important predictive information.  The Ridge Regression technique with cross validation to identify the best hyperparameters may be the best way to get the most accurate model.  We'll have to:\n",
+    "\n",
+    "* define our performance metrics\n",
+    "* split our data into train and test sets\n",
+    "* train and test our model\n",
+    "\n",
+    "Let's begin and see what we get!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Performance metrics: a basic mean squared error and coefficient of determination implementation\n",
+    "def mse(y_test, y_pred):\n",
+    "    \"\"\"Mean of the squared differences between y_test and y_pred.\"\"\"\n",
+    "    return ((y_test - y_pred) ** 2).mean()\n",
+    "\n",
+    "def cod(y_test, y_pred):\n",
+    "    \"\"\"Coefficient of determination: 1 - (residual sum of squares / total sum of squares).\"\"\"\n",
+    "    deviation, error = y_test - y_test.mean(), y_test - y_pred\n",
+    "    return 1 - ((error ** 2).sum() / (deviation ** 2).sum())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cuml.preprocessing.model_selection import train_test_split\n",
+    "from cuml.linear_model.ridge import Ridge\n",
+    "trainsize = .9\n",
+    "yCol = \"EDUC\"\n",
+    "def train_and_score(data, clf, train_frac=0.8, n_runs=20):\n",
+    "    \"\"\"Fit clf on n_runs random train/test splits of data (target column yCol); return (mse_scores, cod_scores) lists.\"\"\"\n",
+    "    data = data.compute() if hasattr(data, 'compute') else data  # materialize a lazy dask frame once; cuML's splitter is documented for cudf input - TODO confirm for this cuML version\n",
+    "    mse_scores, cod_scores = [], []\n",
+    "    for _ in range(n_runs):\n",
+    "        X_train, X_test, y_train, y_test = train_test_split(data, yCol, train_size=train_frac)  # honor the arguments instead of the global df and a hard-coded .9\n",
+    "        y_pred = clf.fit(X_train, y_train).predict(X_test)\n",
+    "        mse_scores.append(mse(y_test, y_pred))\n",
+    "        cod_scores.append(cod(y_test, y_pred))\n",
+    "    return mse_scores, cod_scores"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    " ## Results\n",
+    " **Moment of truth!  Let's see how our regression training does!**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# np is already imported in the Imports cell at the top of the notebook\n",
+    "n_runs = 20\n",
+    "clf = Ridge()\n",
+    "mse_scores, cod_scores = train_and_score(df, clf, train_frac=trainsize, n_runs=n_runs)  # pass the configured split fraction explicitly\n",
+    "print(f\"median MSE ({n_runs} runs): {np.median(mse_scores)}\")\n",
+    "print(f\"median COD ({n_runs} runs): {np.median(cod_scores)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Fun fact:** if you made INCTOT the y axis, your prediction results would not be so pretty!  It just shows that your education level can be an indicator for your income, but your income is NOT a great predictor for your education level.  You have better odds flipping a coin!\n",
+    "\n",
+    "* median MSE (50 runs): 518189521.07548225\n",
+    "* median COD (50 runs): 0.425769113846303"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Next Steps/Self Study\n",
+    "* You can pickle the model and use it in another workflow\n",
+    "* You can redo the workflow based on head of household, using `X_HEAD` for X in `EDUC`, `SEX`, and `INCTOT`\n",
+    "* You can see the growing role of education with women in their changing role in the workforce and income with \"EDUC_MOM\" and \"EDUC_POP\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}