diff --git a/verticapy/core/vdataframe/_text.py b/verticapy/core/vdataframe/_text.py index 14021711a..aead581e9 100755 --- a/verticapy/core/vdataframe/_text.py +++ b/verticapy/core/vdataframe/_text.py @@ -101,6 +101,87 @@ def regexp( ------- vDataFrame self + + Examples + --------- + + Let's begin by importing `VerticaPy`. + + .. ipython:: python + + import verticapy as vp + + + Let's generate a small dataset using the following data: + + .. ipython:: python + + data = vp.vDataFrame( + { + "rollno": ['1', '2', '3', '4'], + "subjects": ['English, Math', 'English, Math, Computer', 'Math, Computer, Science', 'Math, Science'], + }) + + Let's retrieve the second subject. + + .. code-block:: python + + data.regexp(column = "subjects", + pattern = "[^,]+", + method = "substr", + occurrence = 2, + name = "subject_2").select(["subjects", + "subject_2"]) + + + .. ipython:: python + :suppress: + + res = data.regexp(column = "subjects", + pattern = "[^,]+", + method = "substr", + occurrence = 2, + name = "subject_2").select(["subjects", + "subject_2"]) + html_file = open("figures/core_vDataFrame_text_regex1.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. raw:: html + :file: ../../../figures/core_vDataFrame_text_regex1.html + + Let's count the number of subjects. + + .. code-block:: python + + data.regexp(column = "subjects", + pattern = ",", + method = "count", + name = "nb_subjects") + data["nb_subjects"].add(1) + data.select(["subjects", "nb_subjects"]) + + + .. ipython:: python + :suppress: + + data.regexp(column = "subjects", + pattern = ",", + method = "count", + name = "nb_subjects") + data["nb_subjects"].add(1) + res = data.select(["subjects", "nb_subjects"]) + html_file = open("figures/core_vDataFrame_text_regex2.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. raw:: html + :file: ../../../figures/core_vDataFrame_text_regex2.html + + .. 
seealso:: + + | :py:mod:`verticapy.vDataFrame.eval` + """ column = self.format_colnames(column) pattern_str = pattern.replace("'", "''") @@ -135,6 +216,52 @@ def str_contains(self, pat: str) -> "vDataFrame": ------- vDataFrame self._parent + + Examples + --------- + + Let's begin by importing `VerticaPy`. + + .. ipython:: python + + import verticapy as vp + + + Let's generate a small dataset using the following data: + + .. ipython:: python + + data = vp.vDataFrame( + { + "rollno": ['1', '2', '3', '4'], + "subjects": ['English, Math', 'English, Math, Computer', 'Math, Computer, Science', 'Math, Science'], + }) + + Let's check which rows contain "English" in the "subjects" vDataColumn. + + .. code-block:: python + + data["subjects"].str_contains(pat = "English").select(["rollno", + "subjects as has_english"]) + + .. ipython:: python + :suppress: + + res = data["subjects"].str_contains(pat = "English").select(["rollno", + "subjects as has_english"]) + html_file = open("figures/core_vDataFrame_text_str_contains.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. raw:: html + :file: ../../../figures/core_vDataFrame_text_str_contains.html + + .. seealso:: + + | :py:mod:`verticapy.vDataFrame.str_count` + | :py:mod:`verticapy.vDataFrame.str_extract` + | :py:mod:`verticapy.vDataFrame.str_replace` + | :py:mod:`verticapy.vDataFrame.str_slice` """ pat = pat.replace("'", "''") return self.apply(func=f"REGEXP_COUNT({{}}, '{pat}') > 0") @@ -155,6 +282,52 @@ def str_count(self, pat: str) -> "vDataFrame": ------- vDataFrame self._parent + + Examples + --------- + + Let's begin by importing `VerticaPy`. + + .. ipython:: python + + import verticapy as vp + + + Let's generate a small dataset using the following data: + + .. ipython:: python + + data = vp.vDataFrame( + { + "rollno": ['1', '2', '3', '4'], + "subjects": ['English, Math', 'English, Math, Computer', 'Math, Computer, Science', 'Math, Science'], + }) + + Let's count the number of times "English" appears in the "subjects" vDataColumn. + + .. 
code-block:: python + + data["subjects"].str_count(pat = "English").select(["rollno", + "subjects as english_count"]) + + .. ipython:: python + :suppress: + + res = data["subjects"].str_count(pat = "English").select(["rollno", + "subjects as english_count"]) + html_file = open("figures/core_vDataFrame_text_str_count.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. raw:: html + :file: ../../../figures/core_vDataFrame_text_str_count.html + + .. seealso:: + + | :py:mod:`verticapy.vDataFrame.str_contains` + | :py:mod:`verticapy.vDataFrame.str_extract` + | :py:mod:`verticapy.vDataFrame.str_replace` + | :py:mod:`verticapy.vDataFrame.str_slice` """ pat = pat.replace("'", "''") return self.apply(func=f"REGEXP_COUNT({{}}, '{pat}')") @@ -174,6 +347,50 @@ def str_extract(self, pat: str) -> "vDataFrame": ------- vDataFrame self._parent + + Examples + --------- + + Let's begin by importing `VerticaPy`. + + .. ipython:: python + + import verticapy as vp + + + Let's generate a small dataset using the following data: + + .. ipython:: python + + data = vp.vDataFrame( + { + "name": ['Mr. Steve Smith', 'Mr. Charlie Dickens', 'Mrs. Helen Ross', 'Dr. Jack Smith'] + } + ) + + Let's extract the name prefix. + + .. code-block:: python + + data["name"].str_extract(pat = "([A-Za-z])+\.") + + .. ipython:: python + :suppress: + + res = data["name"].str_extract(pat = "([A-Za-z])+\.") + html_file = open("figures/core_vDataFrame_text_str_extract.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. raw:: html + :file: ../../../figures/core_vDataFrame_text_str_extract.html + + .. 
seealso:: + + | :py:mod:`verticapy.vDataFrame.str_contains` + | :py:mod:`verticapy.vDataFrame.str_count` + | :py:mod:`verticapy.vDataFrame.str_replace` + | :py:mod:`verticapy.vDataFrame.str_slice` """ pat = pat.replace("'", "''") return self.apply(func=f"REGEXP_SUBSTR({{}}, '{pat}')") @@ -196,6 +413,52 @@ def str_replace(self, to_replace: str, value: Optional[str] = None) -> "vDataFra ------- vDataFrame self._parent + + Examples + --------- + + Let's begin by importing `VerticaPy`. + + .. ipython:: python + + import verticapy as vp + + + Let's generate a small dataset using the following data: + + .. ipython:: python + + data = vp.vDataFrame( + { + "name": ['Mr. Steve Smith', 'Mr. Charlie Dickens', 'Mrs. Helen Ross', 'Dr. Jack Smith'] + } + ) + + Let's replace the name prefix with static text "[Name_Prefix]". + + .. code-block:: python + + data["name"].str_replace(to_replace = "([A-Za-z])+\.", + value = "[Name_Prefix]") + + .. ipython:: python + :suppress: + + res = data["name"].str_replace(to_replace = "([A-Za-z])+\.", + value = "[Name_Prefix]") + html_file = open("figures/core_vDataFrame_text_str_replace.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. raw:: html + :file: ../../../figures/core_vDataFrame_text_str_replace.html + + .. seealso:: + + | :py:mod:`verticapy.vDataFrame.str_contains` + | :py:mod:`verticapy.vDataFrame.str_count` + | :py:mod:`verticapy.vDataFrame.str_extract` + | :py:mod:`verticapy.vDataFrame.str_slice` """ to_replace = to_replace.replace("'", "''") value = value.replace("'", "''") @@ -217,5 +480,49 @@ def str_slice(self, start: int, step: int) -> "vDataFrame": ------- vDataFrame self._parent + + Examples + --------- + + Let's begin by importing `VerticaPy`. + + .. ipython:: python + + import verticapy as vp + + + Let's generate a small dataset using the following data: + + .. ipython:: python + + data = vp.vDataFrame( + { + "name": ['Mr. Steve Smith', 'Mr. Charlie Dickens', 'Mrs. Helen Ross', 'Dr. 
Jack Smith'] + } + ) + + Let's extract the first 3 characters of each name. + + .. code-block:: python + + data["name"].str_slice(start = 0, step =3) + + .. ipython:: python + :suppress: + + res = data["name"].str_slice(start = 0, step =3) + html_file = open("figures/core_vDataFrame_text_str_slice.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. raw:: html + :file: ../../../figures/core_vDataFrame_text_str_slice.html + + .. seealso:: + + | :py:mod:`verticapy.vDataFrame.str_contains` + | :py:mod:`verticapy.vDataFrame.str_count` + | :py:mod:`verticapy.vDataFrame.str_replace` + | :py:mod:`verticapy.vDataFrame.str_extract` """ return self.apply(func=f"SUBSTR({{}}, {start}, {step})")