diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b740863 --- /dev/null +++ b/.gitignore @@ -0,0 +1,117 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +.pytest_cache +tests/.cache + +# C extensions +*.so + +# neptune, pycharm +.cache +.cache/ +.idea/ +.idea_modules/ +out/ +output +output/ +*.log +target/ +devbook.ipynb +devbook_local.ipynb + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# Jupyter Notebook +Untitled*.ipynb +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# Working directories +examples/cache/ +tutorials/examples/cache/ diff --git a/tutorials/1-getting-started.ipynb b/tutorials/1-getting-started.ipynb new file mode 100644 index 0000000..dc7340b --- /dev/null +++ b/tutorials/1-getting-started.ipynb @@ -0,0 +1,332 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting started with steps\n", + "\n", + "This notebook shows how to **create** steps, **fit** them to data, 
**transform** new data and take advantage of persistence" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "from steppy.base import Step, BaseTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# By default pipelines will cache some results so we delete the cache to be sure we're starting from scratch\n", + "!rm -r ./cache" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Grabbing some data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll import a dataset from scikit-learn for our experiments and divide it into training and test sets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_digits\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "digits = load_digits()\n", + "X_digits, y_digits = digits.data, digits.target\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, test_size=0.2, stratify=y_digits, random_state=42)\n", + "\n", + "print('{} samples for training'.format(len(y_train)))\n", + "print('{} samples for test'.format(len(y_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Steps communicate data between each other with plain **Python dictionaries**. This makes it easy to pass collections of **arbitrary data types** (Numpy arrays, Pandas dataframes, etc.). 
The basic structure is as follows (you can get much more fancy but we leave that to the next example)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_train = {'input':\n", + " {\n", + " 'X': X_train,\n", + " 'y': y_train,\n", + " }\n", + " }\n", + "\n", + "data_test = {'input':\n", + " {\n", + " 'X': X_test,\n", + " 'y': y_test,\n", + " }\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating steps" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The main component of a step is a transformer. You just have to define a class following a **simple API** of `BaseTransformer` and then it's up to you to be as creative as you want!\n", + "\n", + "... or you can just **wrap your favorite Scikit-learn estimator** like we do here:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.externals import joblib\n", + "\n", + "class RandomForestTransformer(BaseTransformer):\n", + " def __init__(self):\n", + " self.estimator = RandomForestClassifier()\n", + " \n", + " def fit(self, X, y):\n", + " self.estimator.fit(X, y)\n", + " return self\n", + "\n", + " def transform(self, X, **kwargs):\n", + " y_pred = self.estimator.predict(X)\n", + " return {'y_pred': y_pred}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump(self.estimator, filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.estimator = joblib.load(filepath)\n", + " return self" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So what does the transformer do? 
It must be able to:\n", + "* **initialize** itself\n", + "* **fit** and **transform** the incoming data prepared by the adapter; when transforming, the result should be returned as a **dictionary** that can be **passed on to the next step**\n", + "* **save** and **load** its parameters; this is handy when you're trying to avoid re-computing things over and over.\n", + "\n", + "See how flexible this is? You can just as easily wrap your Keras or Pytorch models." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's turn our transformer into a step:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "classifier_step = Step(name='classifier',\n", + " transformer=RandomForestTransformer(),\n", + " input_data=['input'], \n", + " cache_dirpath='./cache'\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And that's our one-step pipeline finished. You can visualize it too:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "classifier_step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is just about the simplest pipeline you can imagine. Now let's train it!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training\n", + "\n", + "Training a pipeline is a one-liner:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preds_train = classifier_step.fit_transform(data_train);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see how well we do on our training data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "acc_train = accuracy_score(data_train['input']['y'], preds_train['y_pred'])\n", + "print('Training accuracy = {:.4f}'.format(acc_train))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generating test predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running test data through our pipeline is a one-liner too:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preds_test = classifier_step.transform(data_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How good is our test score?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "acc_test = accuracy_score(data_test['input']['y'], preds_test['y_pred'])\n", + "print('Test accuracy = {:.4f}'.format(acc_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's pretty good for a first attempt!\n", + "\n", + "Let's have a look at some predictions to make sure they're sensible" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fix, axs = plt.subplots(4, 8, figsize=(10, 6))\n", + "for i, ax in enumerate(axs.ravel()):\n", + " ax.imshow(data_test['input']['X'][i].reshape(8, 8), cmap='gray')\n", + " ax.axis('off')\n", + " ax.set_title('pred = {}'.format(preds_test['y_pred'][i]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And that's about it for a start! As you can see:\n", + "* It's easy to create steps by inheriting from `BaseTransformer`\n", + "* Transferring data between steps with Python dicts gives you a lot of flexibility\n", + "* Steps wrap easily around Scikit-learn estimators\n", + "* You can display a graph showing the structure of your pipeline\n", + "* Training and testing are pretty much one-liners\n", + "\n", + "At this point it may seem like a lot of work for not much benefit but once we start moving towards more complex pipelines, the reasoning behind all the components will become more clear. Have a look at the next notebook for a more advanced, multi-step pipeline!" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/2-multi-step.ipynb b/tutorials/2-multi-step.ipynb new file mode 100644 index 0000000..326eb12 --- /dev/null +++ b/tutorials/2-multi-step.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Adapters: Creating steps with multiple inputs\n", + "\n", + "This notebook shows how to create a more complex pipeline, including steps with multiple inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "from steppy.base import Step, BaseTransformer\n", + "from steppy.adapter import Adapter, E" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# By default pipelines will cache some results so we delete the cache to be sure we're starting from scratch\n", + "!rm -r ./cache" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As before, we'll import a dataset from Scikit-learn for our experiments and divide it into training and test sets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_breast_cancer\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "dset = 
load_breast_cancer()\n", + "X_dset, y_dset = dset.data, dset.target\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X_dset, y_dset, test_size=0.2, stratify=y_dset, random_state=42)\n", + "\n", + "print('{} samples for training'.format(len(y_train)))\n", + "print('{} samples for test'.format(len(y_test)))\n", + "\n", + "data_train = {'input':\n", + " {\n", + " 'X': X_train,\n", + " 'y': y_train,\n", + " }\n", + " }\n", + "\n", + "data_test = {'input':\n", + " {\n", + " 'X': X_test,\n", + " 'y': y_test,\n", + " }\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating pipeline components\n", + "\n", + "This time we want to build a more fancy pipeline. We'll normalize our data, run PCA to compute some features of a different flavour and then combine them with our original features in a final logistic regression step." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our first step will be a normalization step. 
We could use the one from Scikit-learn but we'll write a pure Numpy implementation just to show how this could be done:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.externals import joblib\n", + "\n", + "class NormalizationTransformer(BaseTransformer):\n", + " def __init__(self):\n", + " self.mean = None\n", + " self.std = None\n", + " \n", + " # Having only X as input ensures that we don't accidentally fit y\n", + " def fit(self, X):\n", + " self.mean = np.mean(X, axis=0)\n", + " self.std = np.std(X, axis=0)\n", + " return self\n", + "\n", + " def transform(self, X, **kwargs):\n", + " X_tfm = (X - self.mean) / self.std\n", + " return {'X': X_tfm}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump([self.mean, self.std], filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.mean, self.std = joblib.load(filepath)\n", + " return self" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll also construct a PCA transformer for our normalized features:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.decomposition import PCA\n", + "\n", + "class PCATransformer(BaseTransformer):\n", + " def __init__(self):\n", + " self.estimator = PCA(n_components=10)\n", + " \n", + " def fit(self, X):\n", + " self.estimator.fit(X)\n", + " return self\n", + "\n", + " def transform(self, X, **kwargs):\n", + " X_tfm = self.estimator.transform(X)\n", + " return {'X': X_tfm}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump(self.estimator, filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.estimator = joblib.load(filepath)\n", + " return self" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we'll use logistic regression as our classifier:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, 
+ "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "class LogRegTransformer(BaseTransformer):\n", + " def __init__(self):\n", + " self.estimator = LogisticRegression()\n", + " \n", + " def fit(self, X, y):\n", + " self.estimator.fit(X, y)\n", + " return self\n", + "\n", + " def transform(self, X, **kwargs):\n", + " y_pred = self.estimator.predict(X)\n", + " return {'y_pred': y_pred}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump(self.estimator, filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.estimator = joblib.load(filepath)\n", + " return self" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Assembling the pipeline\n", + "Now we'll create steps from our transformers and link them all together:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our normalization step will only require the features from the input, not the labels. In fact, we would like to *avoid* giving it the labels just in case there could be data leak in the implementation (the first rule of data science is you don't trust anyone). To achieve this, we will use a special `adapter` argument to the step constructors, which allows us to extract just the required variables from the data dictionary." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "norm_step = Step(name='Normalizer',\n", + " transformer=NormalizationTransformer(),\n", + " input_data=['input'],\n", + " adapter=Adapter({\n", + " 'X': E('input', 'X')\n", + " }),\n", + " cache_dirpath='./cache')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pca_step = Step(name='PCA',\n", + " transformer=PCATransformer(),\n", + " input_steps=[norm_step], \n", + " cache_dirpath='./cache')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our classifier step will have to combine two data flows: the features processed by PCA, and the labels fed directly from input. Therefore, we will have to use the `adapter` argument to specify how to map those inputs to transformer arguments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lr_step = Step(name='LogReg',\n", + " transformer=LogRegTransformer(),\n", + " input_steps=[pca_step],\n", + " input_data=['input'],\n", + " adapter=Adapter({\n", + " 'X': E('PCA', 'X'),\n", + " 'y': E('input', 'y')\n", + " }),\n", + " cache_dirpath='./cache')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One may think it's a bit cumbersome to create your transformers and then have to wrap them with steps. However, there is an advantage to this - think about it:\n", + "* The **transformer** is the ***implementation*** of a machine learning algorithm. It has an input and outputs but it doesn't even know what these are connected to.\n", + "* The **steps** define the ***connections*** between different transformers. At this level of abstraction, all the algorithmic details are hidden. The code that defines steps and connects them together is compact and it's easier to see what is connected to what." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So what does our pipeline look like?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lr_step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This looks about right - let's move on to training!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training\n", + "\n", + "Training a pipeline is a one-liner. When we fit the final logistic regression step, it will go back to its input steps and fit them too (assuming there's no cache or persistent outputs - that's why we delete any leftover cache at the start of the notebook). This also works recursively, so the parent steps will ask the grandparent steps to fit etc." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preds_train = lr_step.fit_transform(data_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see how well we do on our training data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "acc_train = accuracy_score(data_train['input']['y'], preds_train['y_pred'])\n", + "print('Training accuracy = {:.4f}'.format(acc_train))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generating test predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running test data through our pipeline is a one-liner too:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preds_test = lr_step.transform(data_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What is our test score?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "acc_test = accuracy_score(data_test['input']['y'], preds_test['y_pred'])\n", + "print('Test accuracy = {:.4f}'.format(acc_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That seems pretty good. Have a look at the next notebook for even more complex pipelines with parallel branches." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/3-adapter_advanced.ipynb b/tutorials/3-adapter_advanced.ipynb new file mode 100644 index 0000000..d41c485 --- /dev/null +++ b/tutorials/3-adapter_advanced.ipynb @@ -0,0 +1,708 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Adapters in bigger pipelines" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial we show how to use adapters to create more complicated pipelines in Steps." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xgboost\n", + "import traceback\n", + "\n", + "from sklearn.datasets import load_digits\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.externals import joblib\n", + "from sklearn.metrics import log_loss\n", + "\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from steppy.base import Step, BaseTransformer, NoOperation, make_transformer\n", + "from steppy.adapter import Adapter, E" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The problem" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's recreate the pipeline for digits recognition from notebook #1.\n", + "\n", + "We start off by fetching the data. In the latter part of this notebook we will create a model ensembling, hence this time we split the data into three parts." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CACHE_DIR = './cache'\n", + "digits = load_digits()\n", + "X_digits, y_digits = digits.data, digits.target\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, test_size=0.15, stratify=y_digits, random_state=643793)\n", + "X_train, X_ens, y_train, y_ens = train_test_split(X_train, y_train, test_size=0.35, stratify=y_train, random_state=976542)\n", + "\n", + "print('{} samples for training'.format(len(y_train)))\n", + "print('{} samples for ensembling'.format(len(y_ens)))\n", + "print('{} samples for test'.format(len(y_test)))\n", + "\n", + "data_train = {\n", + " 'input': {\n", + " 'images': X_train,\n", + " 'labels': y_train,\n", + " }\n", + "}\n", + "\n", + "data_ensembling = {\n", + " 'input': {\n", + " 'images': X_ens,\n", + " 'labels': y_ens\n", + " }\n", + "}\n", + "\n", + "data_test = {\n", + " 'input': {\n", + " 'images': X_test,\n", + " 'labels': y_test\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -r ./cache" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define `RandomForestTransformer` in similar manner as before. With one difference, though. `Transform` will use RandomForest's `predict_proba` instead of `predict` which will be useful in the latter part of this notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class RandomForestTransformer(BaseTransformer):\n", + " def __init__(self, random_state=None):\n", + " self.estimator = RandomForestClassifier(random_state=random_state)\n", + " \n", + " def fit(self, X, y):\n", + " self.estimator.fit(X, y)\n", + " return self\n", + "\n", + " def transform(self, X, **kwargs):\n", + " y_proba = self.estimator.predict_proba(X)\n", + " return {'y_proba': y_proba}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump(self.estimator, filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.estimator = joblib.load(filepath)\n", + " return self" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rf_step = Step(name='random_forest',\n", + " transformer=RandomForestTransformer(),\n", + " input_data=['input'], \n", + " cache_dirpath=CACHE_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rf_step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The graph looks just like in notebook #1. Let's try to execute it!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " preds_train_rf = rf_step.fit_transform(data_train)\n", + "except:\n", + " traceback.print_exc()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see, something went wrong. The problem is that `input` dictionary in `data_train` contains fields `images` and `labels`, whereas `RandomForestTransformer` expects arguments `X` and `y`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The solution: adapter" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To handle such issues, `Step`'s initializer has `adapter` argument. 
`Adapter` describes how to reshape the data from the input nodes into the form expected by the transformer or further steps. \n", + "\n", + "The basic usage is as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rf_step = Step(name='random_forest',\n", + " transformer=RandomForestTransformer(),\n", + " input_data=['input'],\n", + " adapter=Adapter({\n", + " 'X': E('input', 'images'),\n", + " 'y': E('input', 'labels')\n", + " }),\n", + " cache_dirpath=CACHE_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rf_step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We created a new step which gets its data from the `input` node.\n", + "\n", + "When the program flow gets to `rf_step`, the `adapter`-related code is executed first. `RandomForestTransformer`'s `fit_transform` and `transform` methods expect arguments `X` and `y`. The `adapter` is basically a dictionary which, for each expected argument, tells how to get it. For instance, `'X': E('input', 'images')` tells the step that the value for `X` is stored under the `images` key in the dictionary returned by the `input` node.\n", + "\n", + "Let's try to fit Random Forest again!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rf_step.fit_transform(data_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time it worked like a charm - we see class probabilities for the train cases." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pipeline with model ensembling" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Very often when we have multiple models which perform on the same level it makes sense to combine them. 
The resulting model ensemble tends to be more stable and can even improve results a little.\n", + "\n", + "To take advantage of that fact, we will train a couple of forests. Thanks to different random seeds, each forest will make somewhat different predictions, and therefore their combination will improve performance of the entire pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "NR_OF_FORESTS = 4\n", + "random_seeds = [np.random.randint(1000000) for _ in range(NR_OF_FORESTS)]\n", + "\n", + "rf_steps = [Step(name='random_forest_{}'.format(i),\n", + " transformer=RandomForestTransformer(random_state=seed),\n", + " input_data=['input'], \n", + " adapter=Adapter({\n", + " 'X': E('input', 'images'),\n", + " 'y': E('input', 'labels')\n", + " }), \n", + " cache_dirpath=CACHE_DIR)\n", + " for i, seed in enumerate(random_seeds)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rf_steps[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For ensembling we will use boosted trees. First we need to create a transformer that will wrap XGBoost. What we need to do is really analogous to what we did for Random Forests." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class XGBoostTransformer(BaseTransformer):\n", + " def __init__(self, xgb_params, num_boost_round):\n", + " self.estimator = None\n", + " self.xgb_params = xgb_params\n", + " self.num_boost_round = num_boost_round\n", + " \n", + " def fit(self, X, y):\n", + " tr_mat = xgboost.DMatrix(X, label=y)\n", + " evals = [(tr_mat, 'train')]\n", + " self.estimator = xgboost.train(self.xgb_params,\n", + " tr_mat,\n", + " num_boost_round=self.num_boost_round,\n", + " verbose_eval=False,\n", + " evals=evals)\n", + " return self\n", + "\n", + " def transform(self, X, **kwargs):\n", + " test_mat = xgboost.DMatrix(X)\n", + " y_proba = self.estimator.predict(test_mat)\n", + " return {'y_proba': y_proba}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump({'estimator': self.estimator,\n", + " 'xgb_params': self.xgb_params,\n", + " 'num_boost_round': self.num_boost_round},\n", + " filepath)\n", + " \n", + " def load(self, filepath):\n", + " d = joblib.load(filepath)\n", + " self.estimator = d['estimator']\n", + " self.xgb_params = d['xgb_params']\n", + " self.num_boost_round = d['num_boost_round']\n", + " return self\n", + " \n", + "def get_xgb_params():\n", + " return {\n", + " 'objective': 'multi:softprob',\n", + " \"num_class\": 10,\n", + " 'eta': 0.5,\n", + " 'max_depth': 4,\n", + " 'silent': True,\n", + " 'nthread': -1,\n", + " 'lambda': 2.0,\n", + " 'eval_metric': [\"mlogloss\", \"merror\"]\n", + " }\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To connect ensembling step with random forests we need to do some more advanced adapting." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gather_step = Step(\n", + " name='gather_step',\n", + " transformer=make_transformer(lambda lst, y: {'X': np.hstack(lst), 'y': y}),\n", + " input_steps=rf_steps,\n", + " input_data=['input'],\n", + " adapter=Adapter({\n", + " 'lst': [E(rf_step.name, 'y_proba') for rf_step in rf_steps],\n", + " 'y': E('input', 'labels')\n", + " }),\n", + " cache_dirpath=CACHE_DIR\n", + ")\n", + "\n", + "ensemble_step = Step(name='ensembler',\n", + " transformer=XGBoostTransformer(xgb_params=get_xgb_params(), num_boost_round=10),\n", + " input_steps=[gather_step],\n", + " cache_dirpath=CACHE_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ensemble_step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We used a slightly different syntax in `adapter` this time. The recipe for `X` consists of two things:\n", + "- a list of objects returned by input steps that should be used to construct `X`,\n", + "- a function which merges them into a final `X` object.\n", + "\n", + "So `[E(rf_step.name, 'y_proba') for rf_step in rf_steps]` tells the adapter to extract `y_proba` arrays from dictionaries returned by all random forests. All these `y_proba`s are put in a list which is then passed as `lst` to `lambda lst, y: {'X': np.hstack(lst), 'y': y}`. This function merges the outputs of all forests into one big array, which is eventually passed to the `XGBoostTransformer`.\n", + "\n", + "An adapter is actually a description of how to build arguments for `fit_transform` and `transform`. Let _brick description_ mean a pair of node name and key in the dictionary returned by that node. An adapter is a dictionary, where:\n", + "- keys must agree with transformer's `fit_transform` and `transform` arguments,\n", + "- values must be either:\n", + " 1. a brick description,\n", + " 2. a list of brick descriptions,\n", + " 3. 
a pair of:\n", + " - a list of brick descriptions,\n", + " - a function that adjusts objects extracted according to the above list,\n", + "\n", + "Step with an adapter proceeds like this:\n", + "1. It gathers results from preceding nodes.\n", + "2. It builds a dictionary with the same keys as the adapter and with values built according to descriptions:\n", + " - if the key in the adapter maps to a single brick description, an appropriate object is extracted from the results of input nodes,\n", + " - if a list of brick descriptions is given, objects are extracted according to brick descriptions and added to a list,\n", + " - if a function is also passed, it will be applied to the list from the previous step, and its returned value will be assigned to the key.\n", + "3. Arguments of `fit_transform` and `transform` are filled using the above dictionary." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's check if our ensembling works. To properly fit the pipeline we have to fit random forests first using the train data, and then fit the ensembler using part of the data for this purpose." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for rf_step in rf_steps:\n", + " rf_step.fit_transform(data_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ensemble_step.fit_transform(data_ensembling)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks fine! However, often we are interested only in the class with the highest probability. Let's make a step that will find this class for us." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class GuessesTransformer(BaseTransformer):\n", + " def transform(self, y_proba):\n", + " return {'y_pred': np.argmax(y_proba, axis=1)}\n", + "\n", + "guesses_step = Step(name='guesses_maker',\n", + " transformer=GuessesTransformer(),\n", + " input_steps=[ensemble_step], \n", + " cache_dirpath=CACHE_DIR\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "guesses_step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should be already familiar with everything that happened here. New step, `guesses_maker`, takes its input from `ensembler`. Adapter will create just one element: `y_pred`. List of bricks used to build `y_pred` has only one element: `y_proba` found in `ensembler`'s result. Function `lambda lst: np.argmax(lst[0], axis=1)` takes this list and performs row-wise `argmax` on its only element." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "guesses_step.fit_transform(data_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have created a quite complicated pipeline, so for sure everyone is anxious to see how it performs. Our final step will carry out the evaluation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class EvaluationTransformer(BaseTransformer):\n", + " def transform(self, y_true, y_proba, y_pred):\n", + " return {'Log-loss': log_loss(y_pred=y_proba, y_true=y_true),\n", + " 'Acc:': '{:.2f}'.format(sum(y_true == y_pred) / len(y_pred))\n", + " }\n", + "\n", + "evaluation_step = Step(name='evaluator',\n", + " transformer=EvaluationTransformer(),\n", + " input_steps=[ensemble_step, guesses_step],\n", + " input_data=['input'],\n", + " adapter=Adapter({\n", + " 'y_proba': E(ensemble_step.name, 'y_proba'),\n", + " 'y_pred': E(guesses_step.name, 'y_pred'),\n", + " 'y_true': E('input', 'labels')\n", + " }),\n", + " cache_dirpath=CACHE_DIR\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_step" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_step.fit_transform(data_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_step.transform(data_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "scrolled": false + }, + "source": [ + "As we can see thanks to ensembling we improved in comparison to a single model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Peek on pipeline predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Comparing images with model's predictions is always a very rewarding feeling. As a last example we show a step that displays a few images with the predicted probability distributions!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_names = [rf_step.name for rf_step in rf_steps] + [ensemble_step.name]\n", + "class LookAtPredictions(BaseTransformer):\n", + " def transform(self, probas, images): \n", + " pd.options.display.float_format = '{:5.2f}'.format\n", + " for img_nr in range(5):\n", + " df = pd.DataFrame({model_names[j]: probas[j][img_nr]\n", + " for j in range(len(model_names))\n", + " },\n", + " index=list(range(10)))\n", + " df = df[model_names]\n", + " plt.figure(figsize=(6,2))\n", + " left = plt.subplot(1, 2, 1)\n", + " right = plt.subplot(1, 2, 2)\n", + " left.imshow(images[img_nr].reshape(8, 8), cmap='gray')\n", + " right.axis('off')\n", + " right.text(0, 0.3, str(df.T), fontsize=14, fontname='monospace')\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display_step = Step(\n", + " name='display',\n", + " transformer=LookAtPredictions(),\n", + " input_steps=[ensemble_step] + rf_steps,\n", + " input_data=['input'],\n", + " adapter=Adapter({\n", + " 'probas': [E(rf_step.name, 'y_proba') for rf_step in rf_steps] +\n", + " [E(ensemble_step.name, 'y_proba')],\n", + " 'images': E('input', 'images')\n", + " }),\n", + " cache_dirpath=CACHE_DIR\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display_step" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "display_step.fit_transform(data_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "display_step.transform(data_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/4-caching-persistence.ipynb b/tutorials/4-caching-persistence.ipynb new file mode 100644 index 0000000..d402240 --- /dev/null +++ b/tutorials/4-caching-persistence.ipynb @@ -0,0 +1,778 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data persistence and data caching" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook presents data persistence and data caching features in steps.\n", + "* Persistence helps to avoid re-running early steps of a pipeline when subsequent steps are changed\n", + "* Caching makes it possible to run complex, multi-path pipelines without re-computing the results of early steps\n", + "\n", + "Note that the features presented here are different from *model persistence*, which saves the transformers as the steps are trained." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from sklearn.externals import joblib\n", + "from sklearn.metrics import log_loss\n", + "\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from steps.base import Step, BaseTransformer\n", + "from steps.adapter import Adapter, E\n", + "CACHE_DIR = './cache'\n", + "CACHE_DIR_2 = './cache_2'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# By default pipelines will cache some results so we delete the cache to be sure we're starting from scratch\n", + "import os\n", + "import shutil\n", + "\n", + "shutil.rmtree(CACHE_DIR, ignore_errors=True)\n", + "shutil.rmtree(CACHE_DIR_2, ignore_errors=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time we'll have a go at text classification. We'll use the classic 20newsgroups dataset, but without the headers, footers or quotes which would make the task too easy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_20newsgroups\n", + "\n", + "newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))\n", + "newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, y_train = newsgroups_train.data, newsgroups_train.target\n", + "\n", + "X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.1, stratify=y_train, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use a label encoder to ensure our labels are well-behaved." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import LabelEncoder\n", + "input_label_enc = LabelEncoder().fit(newsgroups_train.target)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time we have pre-defined training and test sets but we would like to have a hold-out set of training data available for ensembling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_fit = {'input':\n", + " {\n", + " 'text': X_fit,\n", + " 'label': input_label_enc.transform(y_fit),\n", + " }\n", + " }\n", + "\n", + "data_val = {'input':\n", + " {\n", + " 'text': X_val,\n", + " 'label': input_label_enc.transform(y_val),\n", + " }\n", + " }\n", + "\n", + "data_test = {'input':\n", + " {\n", + " 'text': newsgroups_test.data,\n", + " 'label': input_label_enc.transform(newsgroups_test.target),\n", + " }\n", + " }\n", + "\n", + "def print_data_summary(data, title):\n", + " print(title)\n", + " print(' Num. 
documents: {}'.format(len(data['input']['text'])))\n", + " print(' Num. categories: {}'.format(len(np.unique(data['input']['label']))))\n", + "\n", + "for data, title in [(data_fit, 'Model fitting data'), (data_val, 'Validation data'), (data_test, 'Testing data')]:\n", + " print_data_summary(data, title)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text processing transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define a transformer that does count vectorization on our documents - again, we can just wrap the one from sklearn:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "class CountVecTransformer(BaseTransformer):\n", + " def __init__(self, max_features):\n", + " self.estimator = CountVectorizer(max_features=max_features)\n", + " \n", + " def fit(self, X):\n", + " self.estimator.fit(X)\n", + " return self\n", + "\n", + " def transform(self, X, **kwargs):\n", + " X_tfm = self.estimator.transform(X)\n", + " return {'X': X_tfm}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump(self.estimator, filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.estimator = joblib.load(filepath)\n", + " return self" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similarly for the IDFs in our TF-IDF model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import TfidfTransformer\n", + "\n", + "class StepsTfidfTransformer(BaseTransformer):\n", + " def __init__(self):\n", + " self.estimator = TfidfTransformer()\n", + " \n", + " def fit(self, X):\n", + " self.estimator.fit(X)\n", + " return self\n", + "\n", + " def transform(self, X, **kwargs):\n", + " X_tfm = self.estimator.transform(X)\n", + " return {'X': 
X_tfm}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump(self.estimator, filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.estimator = joblib.load(filepath)\n", + " return self" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This will give us a bunch of features to train on." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linear model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a first attempt, we'll try to do our predictions with (sparse) logistic regression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "class SparseLogRegProbaTransformer(BaseTransformer):\n", + " def __init__(self):\n", + " self.estimator = LogisticRegression(penalty='l1')\n", + " \n", + " def fit(self, X, y):\n", + " self.estimator.fit(X, y)\n", + " return self\n", + "\n", + " def transform(self, X, **kwargs):\n", + " y_proba = self.estimator.predict_proba(X)\n", + " return {'y_proba': y_proba}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump(self.estimator, filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.estimator = joblib.load(filepath)\n", + " return self" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "count_vec_step = Step(name='CountVec',\n", + " transformer=CountVecTransformer(max_features=1000),\n", + " input_data=['input'],\n", + " adapter=Adapter({'X': E('input', 'text')}),\n", + " cache_dirpath=CACHE_DIR)\n", + "\n", + "tfidf_step = Step(name='TF-IDF',\n", + " transformer=StepsTfidfTransformer(),\n", + " input_steps=[count_vec_step], \n", + " cache_dirpath=CACHE_DIR,\n", + " save_output=True,\n", + " load_saved_output=True # This breaks when switching from training data to val data or test data!\n", + " )\n", + "\n", + "logreg_step = 
Step(name='SparseLogReg',\n", + " transformer=SparseLogRegProbaTransformer(),\n", + " input_steps=[tfidf_step],\n", + " input_data=['input'],\n", + " adapter=Adapter({'X': E('TF-IDF', 'X'),\n", + " 'y': E('input', 'label')\n", + " }),\n", + " cache_dirpath=CACHE_DIR)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that we have passed `save_output=True` to the `tfidf_step` constructor. This will make this step save its output so that once it has been computed, it can later just be loaded from disk. Therefore, we will be able to work on the logistic regression classifier without having to re-compute the outputs of its ancestor nodes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logreg_step" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preds_linear_fit = logreg_step.fit_transform(data_fit)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "\n", + "acc_linear_fit = accuracy_score(y_true=data_fit['input']['label'], y_pred=np.argmax(preds_linear_fit['y_proba'], axis=1))\n", + "print('Model fitting accuracy: {:.4f}'.format(acc_linear_fit))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Bug workaround: manually delete saved output when switching datasets\n", + "os.remove(os.path.join(CACHE_DIR, 'outputs', 'TF-IDF'))\n", + "preds_linear_val = logreg_step.transform(data_val)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "acc_linear_val = accuracy_score(y_true=data_val['input']['label'], y_pred=np.argmax(preds_linear_val['y_proba'], axis=1))\n", + "print('Validation accuracy: {:.4f}'.format(acc_linear_val))" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## Random forest model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As an alternative, we'll also build a random forest model on top of the same TF-IDF features. We'll use the `RandomForestClassifier` which is available in Scikit-learn." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "class RfClfTransformer(BaseTransformer):\n", + " def __init__(self, n_estimators, max_depth):\n", + " self.estimator = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)\n", + " \n", + " def fit(self, X, y):\n", + " self.estimator.fit(X, y)\n", + " return self\n", + "\n", + " def transform(self, X, **kwargs):\n", + " y_proba = self.estimator.predict_proba(X)\n", + " return {'y_proba': y_proba}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump(self.estimator, filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.estimator = joblib.load(filepath)\n", + " return self" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rf_step = Step(name='RF',\n", + " transformer=RfClfTransformer(n_estimators=200, max_depth=8),\n", + " input_steps=[tfidf_step],\n", + " input_data=['input'],\n", + " adapter=Adapter({'X': E('TF-IDF', 'X'),\n", + " 'y': E('input', 'label')\n", + " }),\n", + " cache_dirpath=CACHE_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rf_step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "OK, so it was easy to add a different model on top of TF-IDF features. Indeed, this time we will be able to use the **saved** TF-IDF output, so we can get straight to fitting the random forest." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Bug workaround: manually delete saved output when switching datasets\n", + "os.remove(os.path.join(CACHE_DIR, 'outputs', 'TF-IDF'))\n", + "\n", + "preds_rf_fit = rf_step.fit_transform(data_fit)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "acc_rf_fit = accuracy_score(y_true=data_fit['input']['label'], y_pred=np.argmax(preds_rf_fit['y_proba'], axis=1))\n", + "print('Model fitting accuracy: {:.4f}'.format(acc_rf_fit))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Bug workaround: manually delete saved output when switching datasets\n", + "os.remove(os.path.join(CACHE_DIR, 'outputs', 'TF-IDF'))\n", + "\n", + "preds_rf_val = rf_step.transform(data_val)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "acc_rf_val = accuracy_score(y_true=data_val['input']['label'], \n", + " y_pred=np.argmax(preds_rf_val['y_proba'], axis=1))\n", + "print('Validation accuracy: {:.4f}'.format(acc_rf_val))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ensembling" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll do simple ensembling by averaging predictions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class AvgTransformer(BaseTransformer):\n", + " def __init__(self):\n", + " pass\n", + " \n", + " def fit(self, y_proba_1, y_proba_2):\n", + " return self\n", + "\n", + " def transform(self, y_proba_1, y_proba_2, **kwargs):\n", + " y_proba = (y_proba_1 + y_proba_2) / 2\n", + " return {'y_proba': y_proba}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump({}, filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.estimator 
= joblib.load(filepath)\n", + " return self" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ens_step = Step(name='Ensembler',\n", + " transformer=AvgTransformer(),\n", + " input_steps=[logreg_step, rf_step],\n", + " adapter=Adapter({'y_proba_1': E('SparseLogReg', 'y_proba'),\n", + " 'y_proba_2': E('RF', 'y_proba'),\n", + " }),\n", + " cache_dirpath=CACHE_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ens_step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that for the TF-IDF step we set `cache_output` to `True`. What does this do? Note that the output of the TF-IDF step is used both by RF and SparseLogReg. This means that when we run the Ensembler node on some data, it will in turn call RF and SparseLogReg, which will both call TF-IDF. Without caching, this would mean we're computing the output of the TF-IDF step twice, which is definitely a waste of precious compute time and could possibly lead to some inconsistencies in the data (e.g. if the TF-IDF step was randomized in some way). Caching solves both problems without keeping anything in memory - the caching is done on disk, not in RAM." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Bug workaround: manually delete saved output when switching datasets\n", + "os.remove(os.path.join(CACHE_DIR, 'outputs', 'TF-IDF'))\n", + "\n", + "# This is just a dummy step to \"activate\" the ensembler\n", + "preds_ens_fit = ens_step.fit_transform(data_val)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Bug workaround: manually delete saved output when switching datasets\n", + "os.remove(os.path.join(CACHE_DIR, 'outputs', 'TF-IDF'))\n", + "\n", + "preds_ens_val = ens_step.transform(data_val)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Bug workaround: manually delete saved output when switching datasets\n", + "os.remove(os.path.join(CACHE_DIR, 'outputs', 'TF-IDF'))\n", + "\n", + "preds_ens_test = ens_step.transform(data_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "acc_ens_val = accuracy_score(y_true=data_val['input']['label'], y_pred=np.argmax(preds_ens_val['y_proba'], axis=1))\n", + "print('Validation accuracy: {:.4f}'.format(acc_ens_val))\n", + "\n", + "acc_ens_test = accuracy_score(y_true=data_test['input']['label'], y_pred=np.argmax(preds_ens_test['y_proba'], axis=1))\n", + "print('Test accuracy: {:.4f}'.format(acc_ens_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Caching: saving output within one run only" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sometimes you want to keep your output within one run of your pipeline but discard it at the end. This use case is handled by **caching**. 
Let's build a new pipeline that uses caching instead of saving to avoid re-computing results:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_count_vec_step = Step(name='CountVec',\n", + " transformer=CountVecTransformer(max_features=1000),\n", + " input_data=['input'],\n", + " adapter=Adapter({'X': E('input', 'text')}),\n", + " cache_dirpath=CACHE_DIR_2)\n", + "\n", + "new_tfidf_step = Step(name='TF-IDF',\n", + " transformer=StepsTfidfTransformer(),\n", + " input_steps=[new_count_vec_step], \n", + " cache_dirpath=CACHE_DIR_2,\n", + " cache_output=True)\n", + "\n", + "new_logreg_step = Step(name='SparseLogReg',\n", + " transformer=SparseLogRegProbaTransformer(),\n", + " input_steps=[new_tfidf_step],\n", + " input_data=['input'],\n", + " adapter=Adapter({'X': E('TF-IDF', 'X'),\n", + " 'y': E('input', 'label')\n", + " }),\n", + " cache_dirpath=CACHE_DIR_2)\n", + "\n", + "new_rf_step = Step(name='RF',\n", + " transformer=RfClfTransformer(n_estimators=200, max_depth=8),\n", + " input_steps=[new_tfidf_step],\n", + " input_data=['input'],\n", + " adapter=Adapter({'X': E('TF-IDF', 'X'),\n", + " 'y': E('input', 'label')\n", + " }),\n", + " cache_dirpath=CACHE_DIR_2)\n", + "\n", + "new_ens_step = Step(name='Ensembler',\n", + " transformer=AvgTransformer(),\n", + " input_steps=[new_logreg_step, new_rf_step],\n", + " adapter=Adapter({'y_proba_1': E('SparseLogReg', 'y_proba'),\n", + " 'y_proba_2': E('RF', 'y_proba')\n", + " }),\n", + " cache_dirpath=CACHE_DIR_2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_ens_step" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_ens_step.clean_cache()\n", + "new_preds_ens_fit = new_ens_step.fit_transform(data_fit)\n", + "new_ens_step.clean_cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you 
look carefully at the training log above, you should see that when training the second branch, TF-IDF just loaded outputs instead of re-computing them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_ens_step.clean_cache()\n", + "new_preds_ens_val = new_ens_step.transform(data_val)\n", + "new_ens_step.clean_cache()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_ens_step.clean_cache()\n", + "new_preds_ens_test = new_ens_step.transform(data_test)\n", + "new_ens_step.clean_cache()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_acc_ens_fit = accuracy_score(y_true=data_fit['input']['label'], y_pred=np.argmax(new_preds_ens_fit['y_proba'], axis=1))\n", + "print('New fitting accuracy: {:.4f}'.format(new_acc_ens_fit))\n", + "\n", + "new_acc_ens_val = accuracy_score(y_true=data_val['input']['label'], y_pred=np.argmax(new_preds_ens_val['y_proba'], axis=1))\n", + "print('New validation accuracy: {:.4f}'.format(new_acc_ens_val))\n", + "\n", + "new_acc_ens_test = accuracy_score(y_true=data_test['input']['label'], y_pred=np.argmax(new_preds_ens_test['y_proba'], axis=1))\n", + "print('New test accuracy: {:.4f}'.format(new_acc_ens_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now you should be familiar with data persistence features. The next few notebooks will focus on building deep learning pipelines with steps." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/5-steps-with-keras.ipynb b/tutorials/5-steps-with-keras.ipynb new file mode 100644 index 0000000..bf6f4cd --- /dev/null +++ b/tutorials/5-steps-with-keras.ipynb @@ -0,0 +1,548 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "scrolled": false + }, + "source": [ + "# Using Keras and Steps" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "scrolled": false + }, + "source": [ + "In this notebook we show how a Keras model for image recognition can be incorporated into Steps pipeline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "\n", + "from sklearn.datasets import load_digits\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import log_loss\n", + "from sklearn.externals import joblib\n", + "\n", + "from keras.models import Sequential, Model, load_model\n", + "from keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout\n", + "from keras import optimizers, regularizers\n", + "from keras.preprocessing.image import ImageDataGenerator\n", + "from keras.callbacks import ModelCheckpoint\n", + "from keras.optimizers import Adam\n", + "\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from steppy.base import Step, BaseTransformer\n", + "from steppy.adapter import Adapter, E\n", + "from steppy_toolkit.keras.models import ClassifierGenerator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We start off by loading our favourite dataset for digits recognition." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "digits = load_digits()\n", + "X_digits, y_digits = digits.data, digits.target\n", + "\n", + "X_train, X_valid, y_train, y_valid = train_test_split(X_digits, y_digits, test_size=0.2, stratify=y_digits, random_state=643793)\n", + "\n", + "print('{} samples for training'.format(len(y_train)))\n", + "print('{} samples for test'.format(len(y_valid)))\n", + "\n", + "data = {\n", + " 'input': {\n", + " 'images': X_train,\n", + " 'labels': y_train,\n", + " },\n", + " 'input_valid': {\n", + " 'images': X_valid,\n", + " 'labels': y_valid\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For convenience let's define a few constants." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TARGET_SHAPE = (8, 8, 1) # Shape of images. In this dataset we have 8x8 pictures.\n", + " # Third dimension stands for the number of channels. We uses grayscale images, so 1 channel only.\n", + "N_CLASSES = 10 # Number of categories in this classification problem\n", + "CACHE_DIR = './cache' # directory for saved transformers and outputs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "scrolled": false + }, + "source": [ + "To ensure that each run of the notebook trains the net from scratch (instead of just loading previously trained model), we start by removing cache." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -r ./cache" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "scrolled": false + }, + "source": [ + "## Data loader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before we train a neural net we have to prepare the data properly.\n", + "\n", + "Sklearn keeps the digit images as one-dimensional vectors. 
It's fine for models like XGBoost or RandomForest, because they ignore the two-dimensional nature of images anyway. However, CNNs don't. That's why the first transformer that we define recovers this structure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ReshapeData(BaseTransformer): \n", + " def transform(self, X, y, **kwargs):\n", + " X = X.reshape((X.shape[0], ) + TARGET_SHAPE)\n", + " return {\n", + " 'X': X,\n", + " 'y': y\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Further we use Keras' tool, ImageDataGenerator, to prepare the image stream. It takes care of mundane tasks like standardization, shuffling, augmenting or portioning the stream into batches. Let's create a generator with quite a few online augmentations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class PrepareDatagen(BaseTransformer):\n", + " def fit(self, X, **kwargs): \n", + " self.datagen = ImageDataGenerator(\n", + " featurewise_center=True,\n", + " featurewise_std_normalization=True,\n", + " rotation_range=10,\n", + " width_shift_range=0.1,\n", + " height_shift_range=0.1)\n", + " self.datagen.fit(X)\n", + " \n", + " def transform(self, X, **kwargs): \n", + " return {\n", + " 'datagen': self.datagen,\n", + " }\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump(self.datagen, filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.datagen = joblib.load(filepath)\n", + " return self" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can put together the first steps of the pipeline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reshape_step = Step(\n", + " name=\"reshape\",\n", + " transformer=ReshapeData(),\n", + " input_data=['input'],\n", + " adapter=Adapter({\n", + " 'X': E('input', 'images'),\n", + " 'y': E('input', 'labels')\n", + " }),\n", + " cache_dirpath=CACHE_DIR\n", + ")\n", + "\n", + "reshape_valid_step = Step(\n", + " name=\"reshape_valid\",\n", + " transformer=ReshapeData(),\n", + " input_data=['input_valid'],\n", + " adapter=Adapter({\n", + " 'X': E('input_valid', 'images'),\n", + " 'y': E('input_valid', 'labels')\n", + " }),\n", + " cache_dirpath=CACHE_DIR\n", + ")\n", + "\n", + "datagen_step = Step(\n", + " name=\"loader\",\n", + " transformer=PrepareDatagen(),\n", + " input_steps=[reshape_step],\n", + " cache_dirpath=CACHE_DIR\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we created a step that reshapes vector representations of train images into two-dimensional arrays. Later we will need validation images in the same form, so we also created an analogous step for them. The third step creates an instance of ImageDataGenerator. It takes as input reshaped train images, so that it can calculate means and variances for standardization.\n", + "\n", + "To check that what we did actually works, let's define an auxiliary step that displays the generated image data stream." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class DataDisplay(BaseTransformer):\n", + " def transform(self, datagen, X, y, **kwargs):\n", + " img_batch, lbl_batch = datagen.flow(X, y, batch_size=32).next()\n", + " n_row = 4\n", + " fix, axs = plt.subplots(n_row, 8, figsize=(8, 2 * n_row))\n", + " for i, ax in enumerate(axs.ravel()):\n", + " ax.imshow(img_batch[i].reshape(8, 8), cmap='gray')\n", + " ax.axis('off')\n", + " ax.set_title('lbl = {}'.format(lbl_batch[i]))\n", + " \n", + "display_step = Step(\n", + " name=\"display\",\n", + " transformer=DataDisplay(),\n", + " input_steps=[reshape_step, datagen_step],\n", + " adapter=Adapter({\n", + " 'datagen': E(datagen_step.name, 'datagen'),\n", + " 'X': E(reshape_step.name, 'X'),\n", + " 'y': E(reshape_step.name, 'y')\n", + " }),\n", + " cache_dirpath=CACHE_DIR\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display_step" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "display_step.fit_transform(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Steps for CNN training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We proceed to the crux of this notebook: step/transformer wrapping a Keras model. Steps library contains classes that facilitate this task. We will use `ClassifierGenerator` which extends `KerasModelTransformer`. Their design follows a _template method pattern_ which means that the main part of the code is defined in abstract classes and the user has to derive from them and implement some auxiliary methods, in this case: `_build_optimizer`, `_build_loss`, `_build_model`, `_create_callbacks`. That's what we do below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class KerasCnn(ClassifierGenerator):\n", + " def _build_optimizer(self, **kwargs):\n", + " return Adam(lr=kwargs['learning_rate'])\n", + "\n", + " def _build_loss(self, **kwargs):\n", + " return 'sparse_categorical_crossentropy'\n", + " \n", + " def _build_model(self, **kwargs):\n", + " dropout_ratio = kwargs['dropout_ratio']\n", + " regularization = kwargs['regularization']\n", + " \n", + " input_img = Input(shape=TARGET_SHAPE)\n", + "\n", + " layer = Conv2D(8, kernel_size=(3, 3), padding='same', activation='relu')(input_img)\n", + " layer = Conv2D(8, kernel_size=(3, 3), padding='same', activation='relu')(layer)\n", + " layer = MaxPooling2D((2, 2), padding='same')(layer)\n", + "\n", + " layer = Conv2D(16, kernel_size=(3, 3), padding='same', activation='relu')(layer)\n", + " layer = Conv2D(16, kernel_size=(3, 3), padding='same', activation='relu')(layer)\n", + " layer = MaxPooling2D((2, 2), padding='same')(layer)\n", + "\n", + " layer = Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu')(layer)\n", + " layer = Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu')(layer)\n", + " layer = MaxPooling2D((2, 2), padding='same')(layer)\n", + "\n", + " layer = Flatten()(layer)\n", + " layer = Dense(64, activation='relu', kernel_regularizer=regularizers.l2(regularization))(layer)\n", + " if dropout_ratio > 0:\n", + " layer = Dropout(dropout_ratio)(layer)\n", + " predictions = Dense(N_CLASSES, activation='softmax')(layer)\n", + "\n", + " model = Model(input_img, predictions)\n", + " return model\n", + "\n", + " def _create_callbacks(self, **kwargs):\n", + " checkpoint_filepath = kwargs['model_checkpoint']['filepath']\n", + " Path(checkpoint_filepath).parents[0].mkdir(parents=True, exist_ok=True)\n", + " model_checkpoint = ModelCheckpoint(**kwargs['model_checkpoint'])\n", + " return [model_checkpoint]" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "`KerasModelTransformer`'s initializer takes 3 arguments.\n", + "1. `architecture_config` - contains model and optimizer parameters.\n", + "2. `training_config` - contains parameters for model's `fit_generator` and generator's `flow` methods.\n", + "3. `callbacks_config` - contains parameters for callbacks instantiated in the `_create_callbacks` method.\n", + "\n", + "The exact structure of these arguments is best explained with an example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "architecture_config = {\n", + " 'model_params': {\n", + " 'dropout_ratio': 0.5,\n", + " 'regularization': 0.01\n", + " },\n", + " 'optimizer_params': {\n", + " 'learning_rate': 1e-3\n", + " }\n", + "}\n", + "\n", + "training_config = {\n", + " 'fit_args': {\n", + " 'epochs': 100,\n", + " 'verbose': True\n", + " },\n", + " 'flow_args': {\n", + " 'batch_size': 64,\n", + " }\n", + "}\n", + "\n", + "callbacks_config = {\n", + " 'model_checkpoint': {\n", + " 'filepath': str(Path(CACHE_DIR) / 'checkpoints' / 'best_model.hdf5'),\n", + " 'save_best_only': True\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we have all dependencies necessary to add the crucial step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cnn_step = Step(\n", + " name=\"CNN\",\n", + " transformer=KerasCnn(architecture_config, training_config, callbacks_config),\n", + " input_steps=[datagen_step, reshape_step, reshape_valid_step],\n", + " cache_dirpath=CACHE_DIR,\n", + " adapter=Adapter({\n", + " 'datagen': E(datagen_step.name, 'datagen'),\n", + " 'X': E(reshape_step.name, 'X'),\n", + " 'y': E(reshape_step.name, 'y'),\n", + " 'X_valid': E(reshape_valid_step.name, 'X'),\n", + " 'y_valid': E(reshape_valid_step.name, 'y')\n", + " }),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cnn_step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since we didn't specify `datagen_valid` the same generator will be used for train and validation data. In particular it means that validation images are augmented as well." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "result = cnn_step.fit_transform(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A short function below summarizes the results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def eval_pred(title, y_true, y_pred):\n", + " print(title)\n", + " print(\" Log-loss: \", log_loss(y_true=y_true, y_pred=y_pred))\n", + " choices = np.argmax(y_pred, axis=1)\n", + " print(\" Accuracy: {:.2%}\".format(np.sum(choices == y_true) / len(y_true)))\n", + " \n", + "eval_pred(\"Results on training\", y_true=y_train, y_pred=result['output'])\n", + "eval_pred(\"Results on validation\", y_true=y_valid, y_pred=result['output_valid'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because we do test time augmentation, it makes sense to run prediction phase a few times and average the results.\n", + "As we can see below it improves the overall score." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "results_valid = []\n", + "for i in range(10):\n", + " print(\"Iteration {}/10\".format(i+1))\n", + " results_valid.append(cnn_step.transform(data)['output_valid'])\n", + "y_avg_pred = np.mean(np.array(results_valid), axis=0)\n", + "eval_pred(\"Results on averaged predictions\", y_true=y_valid, y_pred=y_avg_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/intro.ipynb b/tutorials/intro.ipynb new file mode 100644 index 
0000000..e9582d4 --- /dev/null +++ b/tutorials/intro.ipynb @@ -0,0 +1,904 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Why?\n", + "\n", + "* Sklearn Pipelines are awesome... \n", + "\n", + "```python\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "pipeline = Pipeline([\n", + " ('vect', CountVectorizer()),\n", + " ('tfidf', TfidfTransformer()),\n", + " ('clf', SGDClassifier()),\n", + "])\n", + "\n", + "...\n", + "\n", + "grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)\n", + "grid_search.fit(data.data, data.target)\n", + "\n", + "\n", + "```\n", + "http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "image/jpeg": "\n", + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import YouTubeVideo, HTML\n", + "YouTubeVideo(\"URdnFlZnlaE\", width=600,height=400)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* ... 
but sometimes not enough\n", + " * wrapping keras/pytorch models in transformers is tricky\n", + " * caching/saving intermediate outputs is not easy\n", + " * it has to be X,y input" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Airflow does it all but is just too much\n", + "\n", + "\n", + "\n", + "\n", + "https://airflow.apache.org/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Why not build one?\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transformer\n", + "\n", + "* Almost like sklearn transformers\n", + "* Every transformer has `fit_transform` and `transform`\n", + "\n", + "```python\n", + "def fit_transform(self, X, y):\n", + " return\n", + "\n", + "def transform(self, X):\n", + " return\n", + "```\n", + "\n", + "* Those methods return `dict`\n", + "* Inputs can be named **however** you like and can be **whatever** you like\n", + "* Every transformer implements `save` and `load` methods\n", + "\n", + "```python\n", + "from keras.models import load_model\n", + "\n", + "def save(self, filepath):\n", + " self.model.save(filepath)\n", + "\n", + "def load(self, filepath):\n", + " self.model = load_model(filepath)\n", + " return self\n", + "```\n", + "\n", + "* They can do much **more than** just **transform data**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to\n", + "[nltk_data] /home/jakub.czakon/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] /home/jakub.czakon/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from steps.preprocessing import TextCounter" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "text_counter = TextCounter()\n", + "\n", + "outputs = text_counter.fit_transform(['calculate featueres for this text',\n", + " 'Get Some Features For This As Well !!!'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'X'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
char_countdigit_countlower_case_countnewline_countpunctuation_countspace_countupper_case_countword_countcaps_vs_lengthnum_symbolsnum_wordsnum_unique_wordswords_vs_uniquemean_word_len
033029004050.0000000551.05.800
138021037780.1842110881.03.875
\n", + "
" + ], + "text/plain": [ + " char_count digit_count lower_case_count newline_count \\\n", + "0 33 0 29 0 \n", + "1 38 0 21 0 \n", + "\n", + " punctuation_count space_count upper_case_count word_count \\\n", + "0 0 4 0 5 \n", + "1 3 7 7 8 \n", + "\n", + " caps_vs_length num_symbols num_words num_unique_words words_vs_unique \\\n", + "0 0.000000 0 5 5 1.0 \n", + "1 0.184211 0 8 8 1.0 \n", + "\n", + " mean_word_len \n", + "0 5.800 \n", + "1 3.875 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for key, output in outputs.items():\n", + " display(key)\n", + " display(output)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# %load steps/keras/models.py\n", + "import shutil\n", + "\n", + "from keras.models import load_model\n", + "\n", + "from ..base import BaseTransformer\n", + "from .contrib import AttentionWeightedAverage\n", + "from .architectures import vdcnn, scnn, dpcnn, cudnn_gru, cudnn_lstm\n", + "\n", + "\n", + "class KerasModelTransformer(BaseTransformer):\n", + " \"\"\"\n", + " Todo:\n", + " load the best model at the end of the fit and save it\n", + " \"\"\"\n", + "\n", + " def __init__(self, architecture_config, training_config, callbacks_config):\n", + " self.architecture_config = architecture_config\n", + " self.training_config = training_config\n", + " self.callbacks_config = callbacks_config\n", + "\n", + " def reset(self):\n", + " self.model = self._build_model(**self.architecture_config)\n", + "\n", + " def _compile_model(self, model_params, optimizer_params):\n", + " model = self._build_model(**model_params)\n", + " optimizer = self._build_optimizer(**optimizer_params)\n", + " loss = self._build_loss()\n", + " model.compile(optimizer=optimizer, loss=loss)\n", + " return model\n", + "\n", + " def _create_callbacks(self, **kwargs):\n", + " return NotImplementedError\n", + "\n", + " def _build_model(self, **kwargs):\n", + " return 
NotImplementedError\n", + "\n", + " def _build_optimizer(self, **kwargs):\n", + " return NotImplementedError\n", + "\n", + " def _build_loss(self, **kwargs):\n", + " return NotImplementedError\n", + "\n", + " def save(self, filepath):\n", + " checkpoint_callback = self.callbacks_config.get('model_checkpoint')\n", + " if checkpoint_callback:\n", + " checkpoint_filepath = checkpoint_callback['filepath']\n", + " shutil.copyfile(checkpoint_filepath, filepath)\n", + " else:\n", + " self.model.save(filepath)\n", + "\n", + " def load(self, filepath):\n", + " self.model = load_model(filepath,\n", + " custom_objects={'AttentionWeightedAverage': AttentionWeightedAverage})\n", + " return self\n", + "\n", + "\n", + "class ClassifierXY(KerasModelTransformer):\n", + " def fit(self, X, y, validation_data, *args, **kwargs):\n", + " self.callbacks = self._create_callbacks(**self.callbacks_config)\n", + " self.model = self._compile_model(**self.architecture_config)\n", + "\n", + " self.model.fit(X, y,\n", + " validation_data=validation_data,\n", + " callbacks=self.callbacks,\n", + " verbose=1,\n", + " **self.training_config)\n", + " return self\n", + "\n", + " def transform(self, X, y=None, validation_data=None, *args, **kwargs):\n", + " predictions = self.model.predict(X, verbose=1)\n", + " return {'prediction_probability': predictions}\n", + "\n", + "\n", + "class ClassifierGenerator(KerasModelTransformer):\n", + " def fit(self, datagen, validation_datagen, *args, **kwargs):\n", + " self.callbacks = self._create_callbacks(**self.callbacks_config)\n", + " self.model = self._compile_model(**self.architecture_config)\n", + "\n", + " train_flow, train_steps = datagen\n", + " valid_flow, valid_steps = validation_datagen\n", + " self.model.fit_generator(train_flow,\n", + " steps_per_epoch=train_steps,\n", + " validation_data=valid_flow,\n", + " validation_steps=valid_steps,\n", + " callbacks=self.callbacks,\n", + " verbose=1,\n", + " **self.training_config)\n", + " return self\n", + 
"\n", + " def transform(self, datagen, validation_datagen=None, *args, **kwargs):\n", + " test_flow, test_steps = datagen\n", + " predictions = self.model.predict_generator(test_flow, test_steps, verbose=1)\n", + " return {'prediction_probability': predictions}\n", + "\n", + "\n", + "class PretrainedEmbeddingModel(ClassifierXY):\n", + " def fit(self, X, y, validation_data, embedding_matrix):\n", + " X_valid, y_valid = validation_data\n", + " self.callbacks = self._create_callbacks(**self.callbacks_config)\n", + " self.architecture_config['model_params']['embedding_matrix'] = embedding_matrix\n", + " self.model = self._compile_model(**self.architecture_config)\n", + " self.model.fit(X, y,\n", + " validation_data=[X_valid, y_valid],\n", + " callbacks=self.callbacks,\n", + " verbose=1,\n", + " **self.training_config)\n", + " return self\n", + "\n", + " def transform(self, X, y=None, validation_data=None, embedding_matrix=None):\n", + " predictions = self.model.predict(X, verbose=1)\n", + " return {'prediction_probability': predictions}\n", + "\n", + "\n", + "class CharVDCNNTransformer(ClassifierXY):\n", + " def _build_model(self, embedding_size, maxlen, max_features,\n", + " filter_nr, kernel_size, repeat_block,\n", + " dense_size, repeat_dense, output_size, output_activation,\n", + " max_pooling, mean_pooling, weighted_average_attention, concat_mode,\n", + " dropout_embedding, conv_dropout, dense_dropout, dropout_mode,\n", + " conv_kernel_reg_l2, conv_bias_reg_l2,\n", + " dense_kernel_reg_l2, dense_bias_reg_l2,\n", + " use_prelu, use_batch_norm, batch_norm_first):\n", + " return vdcnn(embedding_size, maxlen, max_features,\n", + " filter_nr, kernel_size, repeat_block,\n", + " dense_size, repeat_dense, output_size, output_activation,\n", + " max_pooling, mean_pooling, weighted_average_attention, concat_mode,\n", + " dropout_embedding, conv_dropout, dense_dropout, dropout_mode,\n", + " conv_kernel_reg_l2, conv_bias_reg_l2,\n", + " dense_kernel_reg_l2, 
dense_bias_reg_l2,\n", + " use_prelu, use_batch_norm, batch_norm_first)\n", + "\n", + "\n", + "class WordSCNNTransformer(PretrainedEmbeddingModel):\n", + " def _build_model(self, embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features,\n", + " filter_nr, kernel_size, repeat_block,\n", + " dense_size, repeat_dense, output_size, output_activation,\n", + " max_pooling, mean_pooling, weighted_average_attention, concat_mode,\n", + " dropout_embedding, conv_dropout, dense_dropout, dropout_mode,\n", + " conv_kernel_reg_l2, conv_bias_reg_l2,\n", + " dense_kernel_reg_l2, dense_bias_reg_l2,\n", + " use_prelu, use_batch_norm, batch_norm_first):\n", + " return scnn(embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features,\n", + " filter_nr, kernel_size, repeat_block,\n", + " dense_size, repeat_dense, output_size, output_activation,\n", + " max_pooling, mean_pooling, weighted_average_attention, concat_mode,\n", + " dropout_embedding, conv_dropout, dense_dropout, dropout_mode,\n", + " conv_kernel_reg_l2, conv_bias_reg_l2,\n", + " dense_kernel_reg_l2, dense_bias_reg_l2,\n", + " use_prelu, use_batch_norm, batch_norm_first)\n", + "\n", + "\n", + "class WordDPCNNTransformer(PretrainedEmbeddingModel):\n", + " def _build_model(self, embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features,\n", + " filter_nr, kernel_size, repeat_block,\n", + " dense_size, repeat_dense, output_size, output_activation,\n", + " max_pooling, mean_pooling, weighted_average_attention, concat_mode,\n", + " dropout_embedding, conv_dropout, dense_dropout, dropout_mode,\n", + " conv_kernel_reg_l2, conv_bias_reg_l2,\n", + " dense_kernel_reg_l2, dense_bias_reg_l2,\n", + " use_prelu, use_batch_norm, batch_norm_first):\n", + " \"\"\"\n", + " Implementation of http://ai.tencent.com/ailab/media/publications/ACL3-Brady.pdf\n", + " \"\"\"\n", + " return dpcnn(embedding_matrix, embedding_size, trainable_embedding, maxlen, max_features,\n", + " filter_nr, 
kernel_size, repeat_block,\n", + " dense_size, repeat_dense, output_size, output_activation,\n", + " max_pooling, mean_pooling, weighted_average_attention, concat_mode,\n", + " dropout_embedding, conv_dropout, dense_dropout, dropout_mode,\n", + " conv_kernel_reg_l2, conv_bias_reg_l2,\n", + " dense_kernel_reg_l2, dense_bias_reg_l2,\n", + " use_prelu, use_batch_norm, batch_norm_first)\n", + "\n", + "\n", + "class WordCuDNNLSTMTransformer(PretrainedEmbeddingModel):\n", + " def _build_model(self, embedding_matrix, embedding_size, trainable_embedding,\n", + " maxlen, max_features,\n", + " unit_nr, repeat_block,\n", + " dense_size, repeat_dense, output_size, output_activation,\n", + " max_pooling, mean_pooling, weighted_average_attention, concat_mode,\n", + " dropout_embedding, rnn_dropout, dense_dropout, dropout_mode,\n", + " rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2,\n", + " dense_kernel_reg_l2, dense_bias_reg_l2,\n", + " use_prelu, use_batch_norm, batch_norm_first):\n", + " return cudnn_lstm(embedding_matrix, embedding_size, trainable_embedding,\n", + " maxlen, max_features,\n", + " unit_nr, repeat_block,\n", + " dense_size, repeat_dense, output_size, output_activation,\n", + " max_pooling, mean_pooling, weighted_average_attention, concat_mode,\n", + " dropout_embedding, rnn_dropout, dense_dropout, dropout_mode,\n", + " rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2,\n", + " dense_kernel_reg_l2, dense_bias_reg_l2,\n", + " use_prelu, use_batch_norm, batch_norm_first)\n", + "\n", + "\n", + "class WordCuDNNGRUTransformer(PretrainedEmbeddingModel):\n", + " def _build_model(self, embedding_matrix, embedding_size, trainable_embedding,\n", + " maxlen, max_features,\n", + " unit_nr, repeat_block,\n", + " dense_size, repeat_dense, output_size, output_activation,\n", + " max_pooling, mean_pooling, weighted_average_attention, concat_mode,\n", + " dropout_embedding, rnn_dropout, dense_dropout, dropout_mode,\n", + " rnn_kernel_reg_l2, rnn_recurrent_reg_l2, 
rnn_bias_reg_l2,\n", + " dense_kernel_reg_l2, dense_bias_reg_l2,\n", + " use_prelu, use_batch_norm, batch_norm_first):\n", + " return cudnn_gru(embedding_matrix, embedding_size, trainable_embedding,\n", + " maxlen, max_features,\n", + " unit_nr, repeat_block,\n", + " dense_size, repeat_dense, output_size, output_activation,\n", + " max_pooling, mean_pooling, weighted_average_attention, concat_mode,\n", + " dropout_embedding, rnn_dropout, dense_dropout, dropout_mode,\n", + " rnn_kernel_reg_l2, rnn_recurrent_reg_l2, rnn_bias_reg_l2,\n", + " dense_kernel_reg_l2, dense_bias_reg_l2,\n", + " use_prelu, use_batch_norm, batch_norm_first)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step\n", + "\n", + "```python\n", + "glove_dpcnn = Step(name='glove_dpcnn',\n", + " transformer=WordDPCNN(**config.dpcnn_network),\n", + " input_data = [],\n", + " input_steps=[word_tokenizer, \n", + " preprocessed_input, \n", + " glove_embeddings],\n", + " adapter={'X': ([('word_tokenizer', 'X')]),\n", + " 'y': ([('cleaning_output', 'y')]),\n", + " 'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),\n", + " 'validation_data': (\n", + " [('word_tokenizer', 'X_valid'), ('cleaning_output', 'y_valid')],\n", + " to_tuple_inputs),\n", + " },\n", + " cache_dirpath=config.env.cache_dirpath,\n", + " cache_output = True,\n", + " save_output=False, \n", + " load_saved_output=False,\n", + " force_fitting=True\n", + " )\n", + "```\n", + "\n", + "* Building block of pipelines\n", + "* Wraps around transformer and adds functionality\n", + "* easy to plug in outputs from other steps and data sources with `input_steps`, `input_data` and `adapter`\n", + "* transformers are cached/persisted as the pipeline trains (not only after it has trained)\n", + "* outputs are cached by default but you can save outputs for debugging/inspection with `save_output`\n", + "* if you want to always fit step even if it was fit before use `force_fitting`\n", + "* objects are stored 
in the `cache_dirpath` folder" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "binary_fill\t drop_smaller\t output\t\t watershed_contour\r\n", + "contour_resize\t loader\t\t reader_inference\r\n", + "contour_thresholding mask_resize\t reader_train\r\n", + "detached\t mask_thresholding unet_multitask\r\n" + ] + } + ], + "source": [ + "! ls /mnt/ml-team/dsb_2018/kuba/trained_pipelines/weighted_loss/transformers/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pipeline\n", + "\n", + "DAG of steps" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from steps.base import Step\n", + "from steps.preprocessing import XYSplit, TextCleaner\n", + "from steps.keras.loaders import Tokenizer\n", + "from steps.keras.embeddings import GloveEmbeddingsMatrix\n", + "from steps.keras.models import WordDPCNNTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "CACHE_DIR = '/mnt/ml-team/minerva/debug/ml_seminar'\n", + "\n", + "xy_train = Step(name='xy_train',\n", + " transformer=XYSplit(x_columns=['comment_text'],\n", + " y_columns=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n", + " ),\n", + " input_data=['input'],\n", + " adapter={'meta': ([('input', 'meta')]),\n", + " 'train_mode': ([('input', 'train_mode')])\n", + " },\n", + " cache_dirpath=CACHE_DIR)\n", + "\n", + "text_cleaner = Step(name='text_cleaner_train',\n", + " transformer=TextCleaner(drop_punctuation=True,\n", + " drop_newline=True,\n", + " drop_multispaces=True,\n", + " all_lower_case=True,\n", + " fill_na_with='',\n", + " deduplication_threshold=10,\n", + " anonymize=False,\n", + " apostrophes=False,\n", + " use_stopwords=True),\n", + " input_steps=[xy_train],\n", + " adapter={'X': ([('xy_train', 'X')])},\n", + " 
cache_dirpath=CACHE_DIR)\n", + "\n", + "word_tokenizer = Step(name='word_tokenizer',\n", + " transformer=Tokenizer(char_level=False,\n", + " maxlen=200,\n", + " num_words=10000),\n", + " input_steps=[text_cleaner],\n", + " adapter={'X': ([(text_cleaner.name, 'X')]),\n", + " 'train_mode': ([('cleaning_output', 'train_mode')])\n", + " },\n", + " cache_dirpath=CACHE_DIR)\n", + "\n", + "glove_embeddings = Step(name='glove_embeddings',\n", + " transformer=GloveEmbeddingsMatrix(pretrained_filepath='glove.840B.300d.txt',\n", + " max_features=10000,\n", + " embedding_size=300),\n", + " input_steps=[word_tokenizer],\n", + " adapter={'tokenizer': ([(word_tokenizer.name, 'tokenizer')]),\n", + " },\n", + " cache_dirpath=CACHE_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "dpcnn_config = {\n", + " 'architecture_config': {'model_params': {'max_features': 300,\n", + " 'maxlen': 200,\n", + " 'embedding_size': 300,\n", + " 'trainable_embedding': True,\n", + " 'filter_nr': 64,\n", + " 'kernel_size': 3,\n", + " 'repeat_block': 6,\n", + " 'dense_size': 256,\n", + " 'repeat_dense': 2,\n", + " 'output_size': 6,\n", + " 'output_activation': 'sigmoid',\n", + " 'max_pooling': True,\n", + " 'mean_pooling': True,\n", + " 'weighted_average_attention': False,\n", + " 'concat_mode': 'concat',\n", + " 'dropout_embedding': 0.5,\n", + " 'conv_dropout': 0.25,\n", + " 'dense_dropout': 0.25,\n", + " 'dropout_mode': 'spatial',\n", + " 'conv_kernel_reg_l2': 0.0,\n", + " 'conv_bias_reg_l2': 0.0,\n", + " 'dense_kernel_reg_l2': 0.0,\n", + " 'dense_bias_reg_l2': 0.0,\n", + " 'use_prelu': True,\n", + " 'use_batch_norm': True,\n", + " 'batch_norm_first': True,\n", + " },\n", + " 'optimizer_params': {'lr': 0.01,\n", + " 'momentum': 0.9,\n", + " 'nesterov': True\n", + " },\n", + " },\n", + " 'training_config': {'epochs': 10,\n", + " 'shuffle': True,\n", + " 'batch_size': 128,\n", + " },\n", + " 'callbacks_config': 
{'model_checkpoint': {\n", + " 'filepath': os.path.join(CACHE_DIR, 'checkpoints', 'dpcnn_network', 'best_model.h5'),\n", + " 'save_best_only': True,\n", + " 'save_weights_only': False},\n", + " 'lr_scheduler': {'gamma': 0.95},\n", + " 'unfreeze_layers': {'unfreeze_on_epoch': 10},\n", + " 'early_stopping': {'patience': 5},\n", + " 'neptune_monitor': {'model_name': 'dpcnn'},\n", + " },\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "glove_dpcnn = Step(name='glove_dpcnn',\n", + " transformer=WordDPCNNTransformer(**dpcnn_config),\n", + " input_steps=[word_tokenizer, xy_train, glove_embeddings],\n", + " adapter={'X': ([('word_tokenizer', 'X')]),\n", + " 'y': ([('xy_train', 'y')]),\n", + " 'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),\n", + " },\n", + " cache_dirpath=CACHE_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "glove_dpcnn" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "intermediate_step = glove_dpcnn.get_step('word_tokenizer')\n", + "intermediate_step" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'char_level': False,\n", + " 'maxlen': 200,\n", + " 'num_words': 10000,\n", + " 'tokenizer': }" + ] + }, + "execution_count": 16, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "intermediate_step.transformer.__dict__" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training/Inference\n", + "\n", + "Just run `fit_transform` on the very last step and all steps will be fitted recursively\n", + "```python\n", + " data_train = {'input': {'meta': train,\n", + " 'meta_valid': valid,\n", + " 'train_mode': True,\n", + " },\n", + " }\n", + " train_predictions = glove_dpcnn.fit_transform(data_train)\n", + "```\n", + "\n", + "prediction will be done on `transform`\n", + "\n", + "```python\n", + " data_inference = {'input': {'meta': test,\n", + " 'meta_valid': None,\n", + " 'train_mode': False,\n", + " },\n", + " }\n", + " test_predictions = glove_dpcnn.transform(data_inference)\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# What is missing?\n", + "\n", + "* tests \n", + "* docstrings\n", + "* automatic sklearn/steps conversion\n", + "\n", + "```python\n", + "\n", + "from steps.base import make_step\n", + "\n", + "step_transformer = make_step(SklearnTransformer())\n", + "step_transformer = make_step(Pipeline())\n", + "```\n", + "\n", + "* automatic grid search\n", + "\n", + "```python\n", + "\n", + "xgboost_ensemble = Step(name='xgboost_ensemble',\n", + " transformer=XGBoostClassifierMultilabel(**config.xgboost_ensemble),\n", + " input_data=['input'],\n", + " cache_dirpath=CACHE_DIR,\n", + " grid_search_params=parameter_space,\n", + " grid_runs=100,\n", + " grid_search_method='hyperopt')\n", + "```\n", + "\n", + "* parallelization\n", + "* automatic multistep bagging" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Where is it?\n", + "https://github.com/neptune-ml/steps/tree/dev" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Let's talk toxic\n", + "https://github.com/neptune-ml/kaggle-toxic-starter" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dl_py3", + "language": "python", + "name": "dl_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/simple_step_example.ipynb b/tutorials/simple_step_example.ipynb new file mode 100644 index 0000000..d903645 --- /dev/null +++ b/tutorials/simple_step_example.ipynb @@ -0,0 +1,288 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import steps\n", + "from steps.base import Step, BaseTransformer, hstack_inputs\n", + "from steps.sklearn.models import make_transformer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_diabetes\n", + "import sklearn.preprocessing as prep \n", + "from sklearn.ensemble import RandomForestRegressor as RFR\n", + "from sklearn.externals import joblib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "class MinMaxScaler(BaseTransformer):\n", + " def __init__(self):\n", + " self.scaler = prep.MinMaxScaler()\n", + " \n", + " def fit(self, X):\n", + " self.scaler.fit(X)\n", + " return self\n", + "\n", + " def transform(self, X):\n", + " X_ = self.scaler.transform(X)\n", + " return {'X':X_}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump(self.scaler, filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.scaler = joblib.load(filepath)\n", + " return self\n", + " \n", + "class 
Normalizer(BaseTransformer):\n", + " def __init__(self):\n", + " self.scaler = prep.Normalizer()\n", + " \n", + " def fit(self, X):\n", + " self.scaler.fit(X)\n", + " return self\n", + "\n", + " def transform(self, X):\n", + " X_ = self.scaler.transform(X)\n", + " return {'X':X_}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump(self.scaler, filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.scaler = joblib.load(filepath)\n", + " return self\n", + " \n", + "class RandomForest(BaseTransformer):\n", + " def __init__(self):\n", + " self.estimator = RFR()\n", + " \n", + " def fit(self, X, y):\n", + " self.estimator.fit(X, y)\n", + " return self\n", + "\n", + " def transform(self, X, **kwargs):\n", + " y_pred = self.estimator.predict(X)\n", + " return {'y_pred':y_pred}\n", + " \n", + " def save(self, filepath):\n", + " joblib.dump(self.estimator, filepath)\n", + " \n", + " def load(self, filepath):\n", + " self.estimator = joblib.load(filepath)\n", + " return self\n", + " \n", + "def hstack_vector_inputs(inputs):\n", + " inputs_ = [input_.reshape(-1,1) for input_ in inputs]\n", + " return np.hstack(inputs_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X,y = load_diabetes(return_X_y=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!ls /mnt/ml-team/minerva/debug/example_problem/outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CACHE_DIR = '/mnt/ml-team/minerva/debug/example_problem'\n", + "\n", + "scaler = Step(name='scaler',\n", + " transformer=MinMaxScaler(),\n", + " input_data=['input'],\n", + " adapter={'X':[('input','X')]},\n", + " cache_dirpath=CACHE_DIR\n", + " )\n", + "\n", + "normalizer = Step(name='normalizer',\n", + " transformer=Normalizer(),\n", + " input_data=['input'],\n", + " adapter={'X':[('input','X')]},\n", + " 
cache_dirpath=CACHE_DIR,\n", + " cache_output=True\n", + " )\n", + "\n", + "classifer = Step(name='clf',\n", + " transformer=RandomForest(),\n", + " input_data=['input'],\n", + " input_steps=[scaler, normalizer], \n", + " adapter={'y':([('input','y')]),\n", + " 'X':([('scaler','X'),\n", + " ('normalizer','X')], hstack_inputs)\n", + " },\n", + " cache_dirpath=CACHE_DIR\n", + " )\n", + "\n", + "scaler1 = Step(name='scaler1',\n", + " transformer=MinMaxScaler(),\n", + " input_data=['input'],\n", + " adapter={'X':[('input','X')]},\n", + " cache_dirpath=CACHE_DIR\n", + " )\n", + "\n", + "normalizer = Step(name='normalizer',\n", + " transformer=Normalizer(),\n", + " input_data=['input'],\n", + " adapter={'X':[('input','X')]},\n", + " cache_dirpath=CACHE_DIR\n", + " )\n", + "\n", + "classifer1 = Step(name='clf1',\n", + " transformer=RandomForest(),\n", + " input_data=['input'],\n", + " input_steps=[scaler1, normalizer], \n", + " adapter={'y':([('input','y')]),\n", + " 'X':([('scaler1','X'),\n", + " ('normalizer','X')], hstack_inputs)\n", + " },\n", + " cache_dirpath=CACHE_DIR\n", + " )\n", + "\n", + "ensemble = Step(name='ensemble',\n", + " transformer=RandomForest(),\n", + " input_data=['input'],\n", + " input_steps=[classifer, classifer1], \n", + " adapter={'y':([('input','y')]),\n", + " 'X':([('clf','y_pred'),\n", + " ('clf1','y_pred')], hstack_vector_inputs)\n", + " },\n", + " cache_dirpath=CACHE_DIR,\n", + " force_fitting=True\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ensemble" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = {'input': {'X': X,\n", + " 'y': y,\n", + " },\n", + " }\n", + "\n", + "ensemble.clean_cache()\n", + "output = ensemble.fit_transform(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ensemble" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output['y_pred'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!ls /mnt/ml-team/minerva/debug/example_problem/outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clf = joblib.load('/mnt/ml-team/minerva/debug/example_problem/outputs/clf')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clf['y_pred']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cpu py3", + "language": "python", + "name": "cpu_py3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}