From 08848d21a3b32814fc8b6fa51a0011c2b50defec Mon Sep 17 00:00:00 2001 From: Vilmara Date: Tue, 29 Oct 2019 11:57:38 -0500 Subject: [PATCH] Updated NYCTaxi-E2E notebook Additions to NYCTaxi-E2E notebook addressing issues on #214 --- .../E2E/taxi/NYCTaxi-E2E.ipynb | 76 ++++++++++++++++++- 1 file changed, 73 insertions(+), 3 deletions(-) diff --git a/intermediate_notebooks/E2E/taxi/NYCTaxi-E2E.ipynb b/intermediate_notebooks/E2E/taxi/NYCTaxi-E2E.ipynb index 224b42c1..90963a62 100644 --- a/intermediate_notebooks/E2E/taxi/NYCTaxi-E2E.ipynb +++ b/intermediate_notebooks/E2E/taxi/NYCTaxi-E2E.ipynb @@ -39,6 +39,51 @@ "client" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAPIDS Memory Manager (RMM)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### RAPIDS Memory Manager (RMM) \n", + "import rmm\n", + "from rmm import rmm_config as rmm_cfg\n", + "\n", + "def initialize_rmm_pool():\n", + " rmm_cfg.use_pool_allocator = True\n", + " return rmm.initialize()\n", + "\n", + "def initialize_rmm_no_pool():\n", + " rmm_cfg.use_pool_allocator = False\n", + " return rmm.initialize()\n", + "\n", + "def finalize_rmm():\n", + " return rmm.finalize()\n", + "\n", + "def run_dask_task(func, **kwargs):\n", + " task = func(**kwargs)\n", + " return task" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the GPU memory pool\n", + "\n", + "client.run(finalize_rmm)\n", + "client.run(initialize_rmm_pool) " + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -309,8 +354,9 @@ " outcols=dict(day_of_week=np.float32),\n", " kwargs=dict())\n", " \n", + " # XGBoost currently cannot consume boolean columns from cuDF (cuDF follows Arrow and stores booleans as a bitset), so cast the flag to int32\n", + " df['is_weekend'] = (df['day_of_week']<2).astype(np.int32)\n", " \n", - " df = df.drop('day_of_week')\n", " return df" ] }, 
@@ -392,7 +438,6 @@ " 'silent': True,\n", " 'verbose_eval': True,\n", " 'tree_method':'gpu_hist',\n", - " 'n_gpus': 1\n", "}\n", "\n", "trained_model = dxgb_gpu.train(client, params, X_train, Y_train, num_boost_round=100)" @@ -423,12 +468,21 @@ " return df.partitions[nonempty]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pick a Test Set" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "%%time\n", + "\n", "X_test = taxi_df.query('day >= 25').persist()\n", "X_test = drop_empty_partitions(X_test)\n", "\n", @@ -438,6 +492,9 @@ "# Drop the fare amount from X_test\n", "X_test = X_test[X_test.columns.difference(['fare_amount'])]\n", "\n", + "# this won't return until all data is in GPU memory\n", + "done = wait([X_test, Y_test])\n", + "\n", "# display test set size\n", "len(X_test)" ] @@ -448,6 +505,8 @@ "metadata": {}, "outputs": [], "source": [ + "%%time\n", + "\n", "# generate predictions on the test set\n", "Y_test['prediction'] = dxgb_gpu.predict(client, trained_model, X_test)" ] @@ -543,6 +602,17 @@ "math.sqrt(Y_test.squared_error.mean().compute())" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Finalize the GPU memory pool\n", + "\n", + "client.run(finalize_rmm)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -577,7 +647,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.6.7" } }, "nbformat": 4,