From 274056de2b16b8dad9353cb7e13be96cbfa9229d Mon Sep 17 00:00:00 2001 From: MubashirullahD <45071858+MubashirullahD@users.noreply.github.com> Date: Fri, 27 Mar 2020 20:39:06 +0500 Subject: [PATCH 1/3] replace remove_numbers_fun with replace_numbers_fun line 95: [0-9] changed to [0-9]+, and "" replaced with 'number'. It is the processing step that Stanford's ML course took when building a spam classifier. I think it is a huge mistake to remove numbers altogether. Boolean remove_numbers changed to replace_numbers as well. --- nlppreprocess/nlppreprocess.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/nlppreprocess/nlppreprocess.py b/nlppreprocess/nlppreprocess.py index 5a2ca87..e3575b3 100644 --- a/nlppreprocess/nlppreprocess.py +++ b/nlppreprocess/nlppreprocess.py @@ -23,7 +23,7 @@ class NLP(): def __init__(self, remove_stopwords=True, replace_words=True, - remove_numbers=True, remove_html_tags=True, + replace_numbers=True, remove_html_tags=True, remove_punctuations=True, lemmatize=False, lemmatize_method='wordnet'): """ @@ -36,7 +36,7 @@ def __init__(self, remove_stopwords=True, replace_words=True, """ if (type(remove_stopwords) != bool or type(replace_words) != bool or - type(remove_numbers) != bool or + type(replace_numbers) != bool or type(remove_html_tags) != bool or type(remove_punctuations) != bool or type(lemmatize) != bool): @@ -47,7 +47,7 @@ def __init__(self, remove_stopwords=True, replace_words=True, self.lemmatizer = None self.remove_stopwords = remove_stopwords self.replace_words = replace_words - self.remove_numbers = remove_numbers + self.replace_numbers = replace_numbers self.remove_html_tags = remove_html_tags self.remove_punctations = remove_punctuations self.lemmatize_method = lemmatize_method @@ -87,12 +87,12 @@ def replace_words_fun(self): cleaned_doc.append(word) self.doc = ' '.join(cleaned_doc) - def remove_numbers_fun(self): + def replace_numbers_fun(self): """ This function uses regex to remve all the 
numbers from the doc. """ - self.doc = re.sub("[0-9]", "", self.doc) + self.doc = re.sub("[0-9]+", "number", self.doc) def remove_html_tags_fun(self): """ @@ -230,8 +230,8 @@ def process(self, doc): self.remove_html_tags_fun() if self.remove_stopwords is True: self.remove_stopwords_fun() - if self.remove_numbers is True: - self.remove_numbers_fun() + if self.replace_numbers is True: + self.replace_numbers_fun() if self.remove_punctations is True: self.remove_punctations_fun() if self.lemmatize is True: From cf887d7636361382684c69edff7f44bd4cd2108b Mon Sep 17 00:00:00 2001 From: MubashirullahD <45071858+MubashirullahD@users.noreply.github.com> Date: Sat, 28 Mar 2020 07:24:25 +0500 Subject: [PATCH 2/3] improve regex for html tag removal It can be done in a single line. Replaced with space. --- nlppreprocess/nlppreprocess.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/nlppreprocess/nlppreprocess.py b/nlppreprocess/nlppreprocess.py index 5a2ca87..e6dea1b 100644 --- a/nlppreprocess/nlppreprocess.py +++ b/nlppreprocess/nlppreprocess.py @@ -89,24 +89,21 @@ def replace_words_fun(self): def remove_numbers_fun(self): """ - This function uses regex to remve + This function uses regex to remove all the numbers from the doc. """ self.doc = re.sub("[0-9]", "", self.doc) def remove_html_tags_fun(self): """ - This function uses regex's complile method - to remove all the HTML tags from the doc + This function uses regex to remove + all the HTML tags from the doc """ - cleaner = re.compile('<.*?>') - cleaned_text = re.sub(cleaner, '', self.doc) - cleaned_text = re.sub('[\n\t]', '', cleaned_text) - self.doc = cleaned_text + self.doc = re.sub(r"<[^<>]+>", ' ', self.doc) def remove_punctations_fun(self): """ - This function uses regex to remove alk the + This function uses regex to remove all the punctations from the doc. 
""" self.doc = re.sub('[^a-zA-Z0-9]', ' ', self.doc) From d8a1e711f1f72d8e61f15ed651b4ef28d087aa8a Mon Sep 17 00:00:00 2001 From: MubashirullahD <45071858+MubashirullahD@users.noreply.github.com> Date: Sat, 28 Mar 2020 07:32:30 +0500 Subject: [PATCH 3/3] Revert "improve regex for html tag removal" --- nlppreprocess/nlppreprocess.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/nlppreprocess/nlppreprocess.py b/nlppreprocess/nlppreprocess.py index aa9c929..e3575b3 100644 --- a/nlppreprocess/nlppreprocess.py +++ b/nlppreprocess/nlppreprocess.py @@ -89,21 +89,24 @@ def replace_words_fun(self): def replace_numbers_fun(self): """ - This function uses regex to remove + This function uses regex to remve all the numbers from the doc. """ self.doc = re.sub("[0-9]+", "number", self.doc) def remove_html_tags_fun(self): """ - This function uses regex to remove - all the HTML tags from the doc + This function uses regex's complile method + to remove all the HTML tags from the doc """ - self.doc = re.sub(r"<[^<>]+>", ' ', self.doc) + cleaner = re.compile('<.*?>') + cleaned_text = re.sub(cleaner, '', self.doc) + cleaned_text = re.sub('[\n\t]', '', cleaned_text) + self.doc = cleaned_text def remove_punctations_fun(self): """ - This function uses regex to remove all the + This function uses regex to remove alk the punctations from the doc. """ self.doc = re.sub('[^a-zA-Z0-9]', ' ', self.doc)