From 6458bb157b74d12fa522f1fe75b2575207b48512 Mon Sep 17 00:00:00 2001
From: vibhujawa
Date: Thu, 23 Jan 2020 14:36:32 -0800
Subject: [PATCH 1/3] fix api call for v12

---
 .../show_me_the_word_count_gutenberg.ipynb | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/blog_notebooks/nlp/show_me_the_word_count_gutenberg/show_me_the_word_count_gutenberg.ipynb b/blog_notebooks/nlp/show_me_the_word_count_gutenberg/show_me_the_word_count_gutenberg.ipynb
index 9c8b90ab..c7557fd8 100755
--- a/blog_notebooks/nlp/show_me_the_word_count_gutenberg/show_me_the_word_count_gutenberg.ipynb
+++ b/blog_notebooks/nlp/show_me_the_word_count_gutenberg/show_me_the_word_count_gutenberg.ipynb
@@ -191,7 +191,13 @@
     "df = df.drop(labels=['label'])\n",
     "\n",
     "print(\"Number of lines in the DF = {:,}\".format(len(df)))\n",
-    "df.head(5).to_pandas()"
+    "df.head(5).to_pandas()\n",
+    "\n",
+    "## keeping only 90% of the books\n",
+    "## becauss of temperaroy overhead 1.5x overhead \n",
+    "## introduced in 0.12\n",
+    "## this will go away with 0.13\n",
+    "df = df.head(int(len(df)*0.90))"
    ]
   },
   {
@@ -328,7 +334,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "text_col_sample['text_clean'] = nvtext.replace_tokens(text_col_sample['text_clean'].data, STOPWORDS, ' ')\n",
+    "text_col_sample['text_clean'] = nvtext.replace_tokens(text_col_sample['text_clean']._column.nvstrings, STOPWORDS, ' ')\n",
     "text_col_sample['text_clean'].to_pandas()"
    ]
   },
@@ -394,7 +400,7 @@
     "    \n",
     "    # remove stopwords\n",
     "    stopwords_gpu = nvstrings.to_device(stopwords)\n",
-    "    input_strs = nvtext.replace_tokens(input_strs.data, stopwords_gpu, ' ')\n",
+    "    input_strs = nvtext.replace_tokens(input_strs._column.nvstrings, stopwords_gpu, ' ')\n",
     "    input_strs = cudf.Series(input_strs)\n",
     "    \n",
     "    # replace multiple spaces with single one and strip leading/trailing spaces\n",
@@ -464,7 +470,7 @@
     "    df = cudf.DataFrame()\n",
     "    # tokenize sentences into a string using nvtext.tokenize()\n",
     "    # it into a single tall data-frame\n",
-    "    df['string'] = nvtext.tokenize(str_col.data)\n",
+    "    df['string'] = nvtext.tokenize(str_col._column.nvstrings)\n",
     "    \n",
     "    # Using Group by to do a value count for string columns\n",
     "    # This will be natively supported soon\n",
@@ -634,8 +640,8 @@
     "    \"\"\"\n",
     "    import rmm\n",
     "    \n",
-    "    cat = nvcategory.from_strings(str_s.data).set_keys(keys)\n",
-    "    device_array = rmm.device_array(str_s.data.size(), dtype=np.int32) \n",
+    "    cat = nvcategory.from_strings(str_s._column.nvstrings).set_keys(keys)\n",
+    "    device_array = rmm.device_array(str_s._column.nvstrings.size(), dtype=np.int32) \n",
     "    cat.values(devptr=device_array.device_ctypes_pointer.value)\n",
     "    \n",
     "    return cudf.Series(device_array)\n",
@@ -674,7 +680,7 @@
     "%%time\n",
     "# keep only top 20k words in the dataset\n",
     "th = 20_000\n",
-    "keys = count_df['string'][:th].data\n",
+    "keys = count_df['string'][:th]._column.nvstrings\n",
     "encoded_wc_ls = []\n",
     "\n",
     "for auth_wc_df in author_wc_ls:\n",
@@ -805,8 +811,8 @@
     "    count_sr = encoded_wc_df['counts']\n",
     "    token_id_sr = encoded_wc_df['encoded_str_id']\n",
     "    \n",
-    "    count_ar = count_sr.data.to_gpu_array()\n",
-    "    token_id_ar = token_id_sr.data.to_gpu_array()\n",
+    "    count_ar = count_sr.to_gpu_array()\n",
+    "    token_id_ar = token_id_sr.to_gpu_array()\n",
     "    author_ar = count_dary[author_id]\n",
     "    \n",
     "    # See https://numba.pydata.org/numba-doc/0.13/CUDAJit.html\n",
@@ -1015,7 +1021,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.4"
+   "version": "3.7.6"
"version": "3.7.6" } }, "nbformat": 4, From 70732ed03210764cadc89fe641dc5cca9333cb6d Mon Sep 17 00:00:00 2001 From: vibhujawa Date: Thu, 23 Jan 2020 14:45:34 -0800 Subject: [PATCH 2/3] fix typos --- .../show_me_the_word_count_gutenberg.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blog_notebooks/nlp/show_me_the_word_count_gutenberg/show_me_the_word_count_gutenberg.ipynb b/blog_notebooks/nlp/show_me_the_word_count_gutenberg/show_me_the_word_count_gutenberg.ipynb index c7557fd8..fd897e3f 100755 --- a/blog_notebooks/nlp/show_me_the_word_count_gutenberg/show_me_the_word_count_gutenberg.ipynb +++ b/blog_notebooks/nlp/show_me_the_word_count_gutenberg/show_me_the_word_count_gutenberg.ipynb @@ -194,7 +194,7 @@ "df.head(5).to_pandas()\n", "\n", "## keeping only 90% because of the books\n", - "## becauss of temperaroy overhead 1.5x overhead \n", + "## because of temporary overhead 1.5x overhead \n", "## introduced in 0.12\n", "## this will away with 0.13\n", "df = df.head(int(len(df)*0.90))" From 0c5f9438444aec01402a211eb8bf2d885fc0dded Mon Sep 17 00:00:00 2001 From: vibhujawa Date: Thu, 23 Jan 2020 14:47:10 -0800 Subject: [PATCH 3/3] 1 last typo --- .../show_me_the_word_count_gutenberg.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blog_notebooks/nlp/show_me_the_word_count_gutenberg/show_me_the_word_count_gutenberg.ipynb b/blog_notebooks/nlp/show_me_the_word_count_gutenberg/show_me_the_word_count_gutenberg.ipynb index fd897e3f..d34b0de6 100755 --- a/blog_notebooks/nlp/show_me_the_word_count_gutenberg/show_me_the_word_count_gutenberg.ipynb +++ b/blog_notebooks/nlp/show_me_the_word_count_gutenberg/show_me_the_word_count_gutenberg.ipynb @@ -194,7 +194,7 @@ "df.head(5).to_pandas()\n", "\n", "## keeping only 90% because of the books\n", - "## because of temporary overhead 1.5x overhead \n", + "## because of temporary 1.5x overhead \n", "## introduced in 0.12\n", "## this will away with 0.13\n", "df = df.head(int(len(df)*0.90))"