From 705d95c00e1c1d7510122078f241ba5918d73991 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= <rbournhonesque@wuha.io>
Date: Tue, 20 Nov 2018 14:57:01 +0100
Subject: [PATCH 1/4] Fix splitter detection bug #178

See https://github.com/mailgun/talon/issues/178

---
 talon/quotations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 8b368e59..f2edf2d2 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -25,7 +25,7 @@
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
 
 RE_ON_DATE_SMB_WROTE = re.compile(
-    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.* ({2}):?-*)'.format(
         # Beginning of the line
         u'|'.join((
             # English

From 7a3007c70f121e43fc3397c7bd4ab98cacb3bd2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= <rbournhonesque@wuha.io>
Date: Tue, 20 Nov 2018 15:02:26 +0100
Subject: [PATCH 2/4] Replace plain space with \s in RE_ON_DATE_SMB_WROTE regex

---
 talon/quotations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index f2edf2d2..5dfde97b 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -25,7 +25,7 @@
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
 
 RE_ON_DATE_SMB_WROTE = re.compile(
-    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.* ({2}):?-*)'.format(
+    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*\s({2}):?-*)'.format(
         # Beginning of the line
         u'|'.join((
             # English

From 16bf10c390f6fff2373e5f350884eb11d5bf8283 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= <rbournhonesque@wuha.io>
Date: Tue, 27 Nov 2018 10:00:02 +0100
Subject: [PATCH 3/4] Cosmetics: normalize docstring quotes and blank lines

---
 talon/html_quotations.py | 7 ++++---
 talon/quotations.py      | 9 ++++-----
 train.py                 | 1 +
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/talon/html_quotations.py b/talon/html_quotations.py
index a2db32d5..a2c0df5d 100644
--- a/talon/html_quotations.py
+++ b/talon/html_quotations.py
@@ -78,7 +78,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
 
 
 def cut_gmail_quote(html_message):
-    ''' Cuts the outermost block element with class gmail_quote. '''
+    """Cuts the outermost block element with class gmail_quote."""
     gmail_quote = cssselect('div.gmail_quote', html_message)
     if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
         gmail_quote[0].getparent().remove(gmail_quote[0])
@@ -86,7 +86,7 @@ def cut_gmail_quote(html_message):
 
 
 def cut_microsoft_quote(html_message):
-    ''' Cuts splitter block and all following blocks. '''
+    """Cuts splitter block and all following blocks."""
     #use EXSLT extensions to have a regex match() function with lxml
     ns = {"re": "http://exslt.org/regular-expressions"}
 
@@ -151,7 +151,7 @@ def cut_by_id(html_message):
 
 
 def cut_blockquote(html_message):
-    ''' Cuts the last non-nested blockquote with wrapping elements.'''
+    """Cuts the last non-nested blockquote with wrapping elements."""
     quote = html_message.xpath(
         '(.//blockquote)'
         '[not(@class="gmail_quote") and not(ancestor::blockquote)]'
@@ -222,6 +222,7 @@ def cut_from_block(html_message):
         block.getparent().remove(block)
         return True
 
+
 def cut_zimbra_quote(html_message):
     zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]')
     if zDivider:
diff --git a/talon/quotations.py b/talon/quotations.py
index 5dfde97b..3f75b46b 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -596,10 +596,9 @@ def _readable_text_empty(html_tree):
 
 
 def is_splitter(line):
-    '''
-    Returns Matcher object if provided string is a splitter and
+    """Returns Matcher object if provided string is a splitter and
     None otherwise.
-    '''
+    """
     for pattern in SPLITTER_PATTERNS:
         matcher = re.match(pattern, line)
         if matcher:
@@ -607,12 +606,12 @@ def is_splitter(line):
 
 
 def text_content(context):
-    '''XPath Extension function to return a node text content.'''
+    """XPath Extension function to return a node text content."""
     return context.context_node.xpath("string()").strip()
 
 
 def tail(context):
-    '''XPath Extension function to return a node tail text.'''
+    """XPath Extension function to return a node tail text."""
     return context.context_node.tail or ''
 
 
diff --git a/train.py b/train.py
index 63ac7fa0..0833c657 100644
--- a/train.py
+++ b/train.py
@@ -7,5 +7,6 @@ def train_model():
     """ retrain model and persist """
     train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
 
+
 if __name__ == "__main__":
     train_model()

From dce2441bcebe7e19556e163c789cd558ed2c3a67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= <rbournhonesque@wuha.io>
Date: Tue, 27 Nov 2018 10:02:14 +0100
Subject: [PATCH 4/4] Fix encoding issues with Python3

---
 talon/quotations.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 3f75b46b..85bc1a13 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -417,10 +417,8 @@ def extract_from_html(msg_body):
 
     Returns a unicode string.
     """
-    if isinstance(msg_body, six.text_type):
-        msg_body = msg_body.encode('utf8')
-    elif not isinstance(msg_body, bytes):
-        msg_body = msg_body.encode('ascii')
+    if not isinstance(msg_body, six.text_type):
+        msg_body = msg_body.decode('utf8')
 
     result = _extract_from_html(msg_body)
     if isinstance(result, bytes):
@@ -448,10 +446,10 @@ def _extract_from_html(msg_body):
     then checking deleted checkpoints,
     then deleting necessary tags.
     """
-    if msg_body.strip() == b'':
+    if msg_body.strip() == '':
         return msg_body
 
-    msg_body = msg_body.replace(b'\r\n', b'\n')
+    msg_body = msg_body.replace('\r\n', '\n')
 
     msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)