diff --git a/verticapy/core/vdataframe/_text.py b/verticapy/core/vdataframe/_text.py index 14021711a..70a0e599c 100755 --- a/verticapy/core/vdataframe/_text.py +++ b/verticapy/core/vdataframe/_text.py @@ -60,30 +60,39 @@ def regexp( The regular expression. method: str, optional Method used to compute the regular expressions. - count : Returns the number of times a - regular expression matches each - element of the input vDataColumn. - ilike : Returns True if the vDataColumn - element contains a match for the - regular expression. - instr : Returns the starting or ending - position in a vDataColumn element - where a regular expression matches. - like : Returns True if the vDataColumn - element matches the regular - expression. - not_ilike : Returns True if the vDataColumn - element does not match the case - -insensitive regular expression. - not_like : Returns True if the vDataColumn - element does not contain a match - for the regular expression. - replace : Replaces all occurrences of a - substring that match a regular - expression with another substring. - substr : Returns the substring that matches - a regular expression within a - vDataColumn. + + - count: + Returns the number of times a + regular expression matches each + element of the input vDataColumn. + - ilike: + Returns True if the vDataColumn + element contains a match for the + regular expression. + - instr: + Returns the starting or ending + position in a vDataColumn element + where a regular expression matches. + - like: + Returns True if the vDataColumn + element matches the regular + expression. + - not_ilike : + Returns True if the vDataColumn + element does not match the case + -insensitive regular expression. + - not_like: + Returns True if the vDataColumn + element does not contain a match + for the regular expression. + - replace: + Replaces all occurrences of a + substring that match a regular + expression with another substring. 
+ - substr: + Returns the substring that matches + a regular expression within a + vDataColumn. position: int, optional The number of characters from the start of the string where the function should start searching for matches. @@ -101,6 +110,105 @@ def regexp( ------- vDataFrame self + + Examples + --------- + + Let's begin by importing `VerticaPy`. + + .. ipython:: python + + import verticapy as vp + + + Let's generate a small dataset using the following data: + + .. ipython:: python + + data = vp.vDataFrame( + { + "rollno": ['1', '2', '3', '4'], + "subjects": [ + 'English, Math', + 'English, Math, Computer', + 'Math, Computer, Science', + 'Math, Science', + ], + } + ) + + Let's retrieve the second subject. + + .. code-block:: python + + data.regexp( + column = "subjects", + pattern = "[^,]+", + method = "substr", + occurrence = 2, + name = "subject_2").select( + [ + "subjects", + "subject_2", + ] + ) + + .. ipython:: python + :suppress: + + res = data.regexp( + column = "subjects", + pattern = "[^,]+", + method = "substr", + occurrence = 2, + name = "subject_2").select( + [ + "subjects", + "subject_2", + ] + ) + html_file = open("figures/core_vDataFrame_text_regex1.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/core_vDataFrame_text_regex1.html + + Let's count the number of subjects. + + .. code-block:: python + + data.regexp( + column = "subjects", + pattern = ",", + method = "count", + name = "nb_subjects", + ) + data["nb_subjects"].add(1) + data.select(["subjects", "nb_subjects"]) + + .. ipython:: python + :suppress: + + data.regexp( + column = "subjects", + pattern = ",", + method = "count", + name = "nb_subjects", + ) + data["nb_subjects"].add(1) + res = data.select(["subjects", "nb_subjects"]) + html_file = open("figures/core_vDataFrame_text_regex2.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. 
raw:: html + :file: SPHINX_DIRECTORY/figures/core_vDataFrame_text_regex2.html + + .. seealso:: + + | :py:mod:`verticapy.vDataFrame.eval` + """ column = self.format_colnames(column) pattern_str = pattern.replace("'", "''") @@ -135,6 +243,66 @@ def str_contains(self, pat: str) -> "vDataFrame": ------- vDataFrame self._parent + + Examples + --------- + + Let's begin by importing `VerticaPy`. + + .. ipython:: python + + import verticapy as vp + + + Let's generate a small dataset using the following data: + + .. ipython:: python + + data = vp.vDataFrame( + { + "rollno": ['1', '2', '3', '4'], + "subjects": [ + 'English, Math', + 'English, Math, Computer', + 'Math, Computer, Science', + 'Math, Science', + ], + } + ) + + Let's check which rows contain "English" in the "subjects" vDataColumn. + + .. code-block:: python + + data["subjects"].str_contains(pat = "English").select( + [ + "rollno", + "subjects as has_english", + ] + ) + + .. ipython:: python + :suppress: + + res = data["subjects"].str_contains(pat = "English").select( + [ + "rollno", + "subjects as has_english", + ] + ) + html_file = open("figures/core_vDataFrame_text_str_contains.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/core_vDataFrame_text_str_contains.html + + .. seealso:: + + | :py:mod:`verticapy.vDataFrame.str_count` + | :py:mod:`verticapy.vDataFrame.str_extract` + | :py:mod:`verticapy.vDataFrame.str_replace` + | :py:mod:`verticapy.vDataFrame.str_slice` """ pat = pat.replace("'", "''") return self.apply(func=f"REGEXP_COUNT({{}}, '{pat}') > 0") @@ -155,6 +323,66 @@ def str_count(self, pat: str) -> "vDataFrame": ------- vDataFrame self._parent + + Examples + --------- + + Let's begin by importing `VerticaPy`. + + .. ipython:: python + + import verticapy as vp + + Let's generate a small dataset using the following data: + + .. 
ipython:: python + + data = vp.vDataFrame( + { + "rollno": ['1', '2', '3', '4'], + "subjects": [ + 'English, Math', + 'English, Math, Computer', + 'Math, Computer, Science', + 'Math, Science', + ], + } + ) + + Let's count the number of times "English" appears in the "subjects" + vDataColumn. + + .. code-block:: python + + data["subjects"].str_count(pat = "English").select( + [ + "rollno", + "subjects as english_count", + ] + ) + + .. ipython:: python + :suppress: + + res = data["subjects"].str_count(pat = "English").select( + [ + "rollno", + "subjects as english_count", + ] + ) + html_file = open("figures/core_vDataFrame_text_str_count.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/core_vDataFrame_text_str_count.html + + .. seealso:: + + | :py:mod:`verticapy.vDataFrame.str_contains` + | :py:mod:`verticapy.vDataFrame.str_extract` + | :py:mod:`verticapy.vDataFrame.str_replace` + | :py:mod:`verticapy.vDataFrame.str_slice` """ pat = pat.replace("'", "''") return self.apply(func=f"REGEXP_COUNT({{}}, '{pat}')") @@ -174,6 +402,55 @@ def str_extract(self, pat: str) -> "vDataFrame": ------- vDataFrame self._parent + + Examples + --------- + + Let's begin by importing `VerticaPy`. + + .. ipython:: python + + import verticapy as vp + + + Let's generate a small dataset using the following data: + + .. ipython:: python + + data = vp.vDataFrame( + { + "name": [ + 'Mr. Steve Smith', + 'Mr. Charlie Dickens', + 'Mrs. Helen Ross', + 'Dr. Jack Smith', + ] + } + ) + + Let's extract the name prefix. + + .. code-block:: python + + data["name"].str_extract(pat = "([A-Za-z])+\.") + + .. ipython:: python + :suppress: + + res = data["name"].str_extract(pat = "([A-Za-z])+\.") + html_file = open("figures/core_vDataFrame_text_str_extract.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/core_vDataFrame_text_str_extract.html + + .. 
seealso:: + + | :py:mod:`verticapy.vDataFrame.str_contains` + | :py:mod:`verticapy.vDataFrame.str_count` + | :py:mod:`verticapy.vDataFrame.str_replace` + | :py:mod:`verticapy.vDataFrame.str_slice` """ pat = pat.replace("'", "''") return self.apply(func=f"REGEXP_SUBSTR({{}}, '{pat}')") @@ -196,6 +473,61 @@ def str_replace(self, to_replace: str, value: Optional[str] = None) -> "vDataFra ------- vDataFrame self._parent + + Examples + --------- + + Let's begin by importing `VerticaPy`. + + .. ipython:: python + + import verticapy as vp + + Let's generate a small dataset using the following data: + + .. ipython:: python + + data = vp.vDataFrame( + { + "name": [ + 'Mr. Steve Smith', + 'Mr. Charlie Dickens', + 'Mrs. Helen Ross', + 'Dr. Jack Smith', + ] + } + ) + + Let's replace the name prefix with static text + "[Name_Prefix]". + + .. code-block:: python + + data["name"].str_replace( + to_replace = "([A-Za-z])+\.", + value = "[Name_Prefix]" + ) + + .. ipython:: python + :suppress: + + res = data["name"].str_replace( + to_replace = "([A-Za-z])+\.", + value = "[Name_Prefix]" + ) + html_file = open("figures/core_vDataFrame_text_str_replace.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/core_vDataFrame_text_str_replace.html + + .. seealso:: + + | :py:mod:`verticapy.vDataFrame.str_contains` + | :py:mod:`verticapy.vDataFrame.str_count` + | :py:mod:`verticapy.vDataFrame.str_extract` + | :py:mod:`verticapy.vDataFrame.str_slice` """ to_replace = to_replace.replace("'", "''") value = value.replace("'", "''") @@ -217,5 +549,54 @@ def str_slice(self, start: int, step: int) -> "vDataFrame": ------- vDataFrame self._parent + + Examples + --------- + + Let's begin by importing `VerticaPy`. + + .. ipython:: python + + import verticapy as vp + + + Let's generate a small dataset using the following data: + + .. ipython:: python + + data = vp.vDataFrame( + { + "name": [ + 'Mr. Steve Smith', + 'Mr. 
Charlie Dickens', + 'Mrs. Helen Ross', + 'Dr. Jack Smith', + ] + } + ) + + Let's extract the first 3 characters of the name. + + .. code-block:: python + + data["name"].str_slice(start = 0, step =3) + + .. ipython:: python + :suppress: + + res = data["name"].str_slice(start = 0, step =3) + html_file = open("figures/core_vDataFrame_text_str_slice.html", "w") + html_file.write(res._repr_html_()) + html_file.close() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/core_vDataFrame_text_str_slice.html + + .. seealso:: + + | :py:mod:`verticapy.vDataFrame.str_contains` + | :py:mod:`verticapy.vDataFrame.str_count` + | :py:mod:`verticapy.vDataFrame.str_replace` + | :py:mod:`verticapy.vDataFrame.str_extract` """ return self.apply(func=f"SUBSTR({{}}, {start}, {step})")