From 274056de2b16b8dad9353cb7e13be96cbfa9229d Mon Sep 17 00:00:00 2001
From: MubashirullahD <45071858+MubashirullahD@users.noreply.github.com>
Date: Fri, 27 Mar 2020 20:39:06 +0500
Subject: [PATCH 1/3] replace remove_numbers_fun with replace_numbers_fun

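Around line 95, in the number-handling function:
the regex "[0-9]" is changed to "[0-9]+", and
the replacement "" is changed to "number", so each run of digits becomes the single token "number" instead of being deleted.
This is the preprocessing step that Stanford's ML course used when building a spam classifier. I think it is a huge mistake to remove numbers altogether.
The boolean remove_numbers is renamed to replace_numbers, and remove_numbers_fun to replace_numbers_fun, as well.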
---
 nlppreprocess/nlppreprocess.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/nlppreprocess/nlppreprocess.py b/nlppreprocess/nlppreprocess.py
index 5a2ca87..e3575b3 100644
--- a/nlppreprocess/nlppreprocess.py
+++ b/nlppreprocess/nlppreprocess.py
@@ -23,7 +23,7 @@
 
 class NLP():
     def __init__(self, remove_stopwords=True, replace_words=True,
-                 remove_numbers=True, remove_html_tags=True,
+                 replace_numbers=True, remove_html_tags=True,
                  remove_punctuations=True, lemmatize=False,
                  lemmatize_method='wordnet'):
         """
@@ -36,7 +36,7 @@ def __init__(self, remove_stopwords=True, replace_words=True,
         """
         if (type(remove_stopwords) != bool or
             type(replace_words) != bool or
-            type(remove_numbers) != bool or
+            type(replace_numbers) != bool or
             type(remove_html_tags) != bool or
             type(remove_punctuations) != bool or
             type(lemmatize) != bool):
@@ -47,7 +47,7 @@ def __init__(self, remove_stopwords=True, replace_words=True,
         self.lemmatizer = None
         self.remove_stopwords = remove_stopwords
         self.replace_words = replace_words
-        self.remove_numbers = remove_numbers
+        self.replace_numbers = replace_numbers
         self.remove_html_tags = remove_html_tags
         self.remove_punctations = remove_punctuations
         self.lemmatize_method = lemmatize_method
@@ -87,12 +87,12 @@ def replace_words_fun(self):
                 cleaned_doc.append(word)
         self.doc = ' '.join(cleaned_doc)
 
-    def remove_numbers_fun(self):
+    def replace_numbers_fun(self):
         """
         This function uses regex to remve
         all the numbers from the doc.
         """
-        self.doc = re.sub("[0-9]", "", self.doc)
+        self.doc = re.sub("[0-9]+", "number", self.doc)
 
     def remove_html_tags_fun(self):
         """
@@ -230,8 +230,8 @@ def process(self, doc):
             self.remove_html_tags_fun()
         if self.remove_stopwords is True:
             self.remove_stopwords_fun()
-        if self.remove_numbers is True:
-            self.remove_numbers_fun()
+        if self.replace_numbers is True:
+            self.replace_numbers_fun()
         if self.remove_punctations is True:
             self.remove_punctations_fun() 
         if self.lemmatize is True:

From cf887d7636361382684c69edff7f44bd4cd2108b Mon Sep 17 00:00:00 2001
From: MubashirullahD <45071858+MubashirullahD@users.noreply.github.com>
Date: Sat, 28 Mar 2020 07:24:25 +0500
Subject: [PATCH 2/3] improve regex for html tag removal

HTML tag removal can be done with a single re.sub call. Each tag is now replaced with a space instead of an empty string.
---
 nlppreprocess/nlppreprocess.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/nlppreprocess/nlppreprocess.py b/nlppreprocess/nlppreprocess.py
index 5a2ca87..e6dea1b 100644
--- a/nlppreprocess/nlppreprocess.py
+++ b/nlppreprocess/nlppreprocess.py
@@ -89,24 +89,21 @@ def replace_words_fun(self):
 
     def remove_numbers_fun(self):
         """
-        This function uses regex to remve
+        This function uses regex to remove
         all the numbers from the doc.
         """
         self.doc = re.sub("[0-9]", "", self.doc)
 
     def remove_html_tags_fun(self):
         """
-        This function uses regex's complile method
-        to remove all the HTML tags from the doc
+        This function uses regex to remove
+        all the HTML tags from the doc
         """
-        cleaner = re.compile('<.*?>')
-        cleaned_text = re.sub(cleaner, '', self.doc)
-        cleaned_text = re.sub('[\n\t]', '', cleaned_text)
-        self.doc = cleaned_text
+        self.doc = re.sub(r"<[^<>]+>", ' ', self.doc)
 
     def remove_punctations_fun(self):
         """
-        This function uses regex to remove alk the
+        This function uses regex to remove all the
         punctations from the doc.
         """ 
         self.doc = re.sub('[^a-zA-Z0-9]', ' ', self.doc)

From d8a1e711f1f72d8e61f15ed651b4ef28d087aa8a Mon Sep 17 00:00:00 2001
From: MubashirullahD <45071858+MubashirullahD@users.noreply.github.com>
Date: Sat, 28 Mar 2020 07:32:30 +0500
Subject: [PATCH 3/3] Revert "improve regex for html tag removal"

---
 nlppreprocess/nlppreprocess.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/nlppreprocess/nlppreprocess.py b/nlppreprocess/nlppreprocess.py
index aa9c929..e3575b3 100644
--- a/nlppreprocess/nlppreprocess.py
+++ b/nlppreprocess/nlppreprocess.py
@@ -89,21 +89,24 @@ def replace_words_fun(self):
 
     def replace_numbers_fun(self):
         """
-        This function uses regex to remove
+        This function uses regex to remve
         all the numbers from the doc.
         """
         self.doc = re.sub("[0-9]+", "number", self.doc)
 
     def remove_html_tags_fun(self):
         """
-        This function uses regex to remove
-        all the HTML tags from the doc
+        This function uses regex's complile method
+        to remove all the HTML tags from the doc
         """
-        self.doc = re.sub(r"<[^<>]+>", ' ', self.doc)
+        cleaner = re.compile('<.*?>')
+        cleaned_text = re.sub(cleaner, '', self.doc)
+        cleaned_text = re.sub('[\n\t]', '', cleaned_text)
+        self.doc = cleaned_text
 
     def remove_punctations_fun(self):
         """
-        This function uses regex to remove all the
+        This function uses regex to remove alk the
         punctations from the doc.
         """ 
         self.doc = re.sub('[^a-zA-Z0-9]', ' ', self.doc)