Fix api changes for v12 for the strings blog #270

Open
wants to merge 7 commits into 012audit
@@ -191,7 +191,13 @@
"df = df.drop(labels=['label'])\n",
"\n",
"print(\"Number of lines in the DF = {:,}\".format(len(df)))\n",
"df.head(5).to_pandas()"
"df.head(5).to_pandas()\n",
"\n",
"## keeping only 90% because of the books\n",
"## because of temporary 1.5x overhead \n",
"## introduced in 0.12\n",
"## this will away with 0.13\n",
"df = df.head(int(len(df)*0.90))"
]
},
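The cell added above works around a known limitation: cuDF 0.12 temporarily carries roughly 1.5x memory overhead for string data, so the notebook keeps only the first 90% of rows until 0.13. A minimal sketch of the workaround, where `df` is a hypothetical stand-in for the books DataFrame:

```python
import cudf

# Hypothetical stand-in for the books DataFrame used in the notebook.
df = cudf.DataFrame({"text": ["some line of text"] * 1000})

# Keep only the first 90% of rows to offset the temporary ~1.5x
# overhead in 0.12; the trim can be dropped once 0.13 lands.
df = df.head(int(len(df) * 0.90))
print(len(df))  # 900
```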
{
@@ -328,7 +334,7 @@
"metadata": {},
"outputs": [],
"source": [
"text_col_sample['text_clean'] = nvtext.replace_tokens(text_col_sample['text_clean'].data, STOPWORDS, ' ')\n",
"text_col_sample['text_clean'] = nvtext.replace_tokens(text_col_sample['text_clean']._column.nvstrings, STOPWORDS, ' ')\n",
"text_col_sample['text_clean'].to_pandas()"
]
},
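This change is the core 0.12 migration in this PR: `Series.data` no longer hands back the raw `nvstrings` object, which now has to be reached through the private column attribute. A minimal before/after sketch, assuming cuDF 0.12 with `nvstrings`/`nvtext` installed and toy data in place of the notebook's:

```python
import cudf
import nvstrings
import nvtext

sr = cudf.Series(["the quick brown fox", "jumps over the lazy dog"])
STOPWORDS = nvstrings.to_device(["the", "over"])

# cuDF <= 0.11:  nvtext.replace_tokens(sr.data, STOPWORDS, ' ')
# cuDF 0.12: reach the raw nvstrings object through the private column.
cleaned = nvtext.replace_tokens(sr._column.nvstrings, STOPWORDS, " ")
print(cudf.Series(cleaned).to_pandas())
```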
@@ -394,7 +400,7 @@
" \n",
" # remove stopwords\n",
" stopwords_gpu = nvstrings.to_device(stopwords)\n",
" input_strs = nvtext.replace_tokens(input_strs.data, stopwords_gpu, ' ')\n",
" input_strs = nvtext.replace_tokens(input_strs._column.nvstrings, stopwords_gpu, ' ')\n",
" input_strs = cudf.Series(input_strs)\n",
" \n",
" # replace multiple spaces with single one and strip leading/trailing spaces\n",
@@ -464,7 +470,7 @@
" df = cudf.DataFrame()\n",
" # tokenize sentences into a string using nvtext.tokenize()\n",
" # it into a single tall data-frame\n",
" df['string'] = nvtext.tokenize(str_col.data)\n",
" df['string'] = nvtext.tokenize(str_col._column.nvstrings)\n",
" \n",
" # Using Group by to do a value count for string columns\n",
" # This will be natively supported soon\n",
@@ -634,8 +640,8 @@
" \"\"\"\n",
" import rmm\n",
" \n",
" cat = nvcategory.from_strings(str_s.data).set_keys(keys)\n",
" device_array = rmm.device_array(str_s.data.size(), dtype=np.int32) \n",
" cat = nvcategory.from_strings(str_s._column.nvstrings).set_keys(keys)\n",
" device_array = rmm.device_array(str_s._column.nvstrings.size(), dtype=np.int32) \n",
" cat.values(devptr=device_array.device_ctypes_pointer.value)\n",
" \n",
" return cudf.Series(device_array)\n",
@@ -674,7 +680,7 @@
"%%time\n",
"# keep only top 20k words in the dataset\n",
"th = 20_000\n",
"keys = count_df['string'][:th].data\n",
"keys = count_df['string'][:th]._column.nvstrings\n",
"encoded_wc_ls = []\n",
"\n",
"for auth_wc_df in author_wc_ls:\n",
@@ -805,8 +811,8 @@
" count_sr = encoded_wc_df['counts']\n",
" token_id_sr = encoded_wc_df['encoded_str_id']\n",
" \n",
" count_ar = count_sr.data.to_gpu_array()\n",
" token_id_ar = token_id_sr.data.to_gpu_array()\n",
" count_ar = count_sr.to_gpu_array()\n",
" token_id_ar = token_id_sr.to_gpu_array()\n",
" author_ar = count_dary[author_id]\n",
" \n",
" # See https://numba.pydata.org/numba-doc/0.13/CUDAJit.html\n",
@@ -1015,7 +1021,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.7.6"
}
},
"nbformat": 4,