mailgun · raphael0202 · Nov 20, 2018 · Nov 20, 2018 · Nov 27, 2018 · Nov 27, 2018
diff --git a/talon/html_quotations.py b/talon/html_quotations.py
@@ -78,15 +78,15 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
 
 
 def cut_gmail_quote(html_message):
-    ''' Cuts the outermost block element with class gmail_quote. '''
+    """Cuts the outermost block element with class gmail_quote."""
     gmail_quote = cssselect('div.gmail_quote', html_message)
     if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
         gmail_quote[0].getparent().remove(gmail_quote[0])
         return True
 
 
 def cut_microsoft_quote(html_message):
-    ''' Cuts splitter block and all following blocks. '''
+    """Cuts splitter block and all following blocks."""
     #use EXSLT extensions to have a regex match() function with lxml
     ns = {"re": "http://exslt.org/regular-expressions"}
 
@@ -151,7 +151,7 @@ def cut_by_id(html_message):
 
 
 def cut_blockquote(html_message):
-    ''' Cuts the last non-nested blockquote with wrapping elements.'''
+    """Cuts the last non-nested blockquote with wrapping elements."""
     quote = html_message.xpath(
         '(.//blockquote)'
         '[not(@class="gmail_quote") and not(ancestor::blockquote)]'
@@ -222,6 +222,7 @@ def cut_from_block(html_message):
         block.getparent().remove(block)
         return True
 
+
 def cut_zimbra_quote(html_message):
     zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]')
     if zDivider:

diff --git a/talon/quotations.py b/talon/quotations.py
@@ -25,7 +25,7 @@
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
 
 RE_ON_DATE_SMB_WROTE = re.compile(
-    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*\\s({2}):?-*)'.format(
         # Beginning of the line
         u'|'.join((
             # English
@@ -417,10 +417,8 @@ def extract_from_html(msg_body):
 
     Returns a unicode string.
     """
-    if isinstance(msg_body, six.text_type):
-        msg_body = msg_body.encode('utf8')
-    elif not isinstance(msg_body, bytes):
-        msg_body = msg_body.encode('ascii')
+    if not isinstance(msg_body, six.text_type):
+        msg_body = msg_body.decode('utf8')
 
     result = _extract_from_html(msg_body)
     if isinstance(result, bytes):
@@ -448,10 +446,10 @@ def _extract_from_html(msg_body):
     then checking deleted checkpoints,
     then deleting necessary tags.
     """
-    if msg_body.strip() == b'':
+    if msg_body.strip() == '':
         return msg_body
 
-    msg_body = msg_body.replace(b'\r\n', b'\n')
+    msg_body = msg_body.replace('\r\n', '\n')
 
     msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
 
@@ -596,23 +594,22 @@ def _readable_text_empty(html_tree):
 
 
 def is_splitter(line):
-    '''
-    Returns Matcher object if provided string is a splitter and
+    """Returns Matcher object if provided string is a splitter and
     None otherwise.
-    '''
+    """
     for pattern in SPLITTER_PATTERNS:
         matcher = re.match(pattern, line)
         if matcher:
             return matcher
 
 
 def text_content(context):
-    '''XPath Extension function to return a node text content.'''
+    """XPath Extension function to return a node text content."""
     return context.context_node.xpath("string()").strip()
 
 
 def tail(context):
-    '''XPath Extension function to return a node tail text.'''
+    """XPath Extension function to return a node tail text."""
     return context.context_node.tail or ''
 
 

diff --git a/train.py b/train.py
@@ -7,5 +7,6 @@ def train_model():
     """ retrain model and persist """
     train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
 
+
 if __name__ == "__main__":
     train_model()