From 705d95c00e1c1d7510122078f241ba5918d73991 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 20 Nov 2018 14:57:01 +0100 Subject: [PATCH 1/4] Fix splitter detection bug #178 See https://github.com/mailgun/talon/issues/178 --- talon/quotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/talon/quotations.py b/talon/quotations.py index 8b368e59..f2edf2d2 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -25,7 +25,7 @@ RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) RE_ON_DATE_SMB_WROTE = re.compile( - u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( + u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.* ({2}):?-*)'.format( # Beginning of the line u'|'.join(( # English From 7a3007c70f121e43fc3397c7bd4ab98cacb3bd2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 20 Nov 2018 15:02:26 +0100 Subject: [PATCH 2/4] Replace simple space by \s in RE_ON_DATE_SMB_WROTE regex --- talon/quotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/talon/quotations.py b/talon/quotations.py index f2edf2d2..5dfde97b 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -25,7 +25,7 @@ RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) RE_ON_DATE_SMB_WROTE = re.compile( - u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.* ({2}):?-*)'.format( + u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*\s({2}):?-*)'.format( # Beginning of the line u'|'.join(( # English From 16bf10c390f6fff2373e5f350884eb11d5bf8283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 27 Nov 2018 10:00:02 +0100 Subject: [PATCH 3/4] Cosmetics --- talon/html_quotations.py | 7 ++++--- talon/quotations.py | 9 ++++----- train.py | 1 + 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/talon/html_quotations.py b/talon/html_quotations.py index a2db32d5..a2c0df5d 100644 --- a/talon/html_quotations.py +++ b/talon/html_quotations.py @@ -78,7 +78,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): def cut_gmail_quote(html_message): - ''' Cuts the outermost block element with class gmail_quote. ''' + """Cuts the outermost block element with class gmail_quote.""" gmail_quote = cssselect('div.gmail_quote', html_message) if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): gmail_quote[0].getparent().remove(gmail_quote[0]) @@ -86,7 +86,7 @@ def cut_gmail_quote(html_message): def cut_microsoft_quote(html_message): - ''' Cuts splitter block and all following blocks. ''' + """Cuts splitter block and all following blocks.""" #use EXSLT extensions to have a regex match() function with lxml ns = {"re": "http://exslt.org/regular-expressions"} @@ -151,7 +151,7 @@ def cut_by_id(html_message): def cut_blockquote(html_message): - ''' Cuts the last non-nested blockquote with wrapping elements.''' + """Cuts the last non-nested blockquote with wrapping elements.""" quote = html_message.xpath( '(.//blockquote)' '[not(@class="gmail_quote") and not(ancestor::blockquote)]' @@ -222,6 +222,7 @@ def cut_from_block(html_message): block.getparent().remove(block) return True + def cut_zimbra_quote(html_message): zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]') if zDivider: diff --git a/talon/quotations.py b/talon/quotations.py index 5dfde97b..3f75b46b 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -596,10 +596,9 @@ def _readable_text_empty(html_tree): def is_splitter(line): - ''' - Returns Matcher object if provided string is a splitter and + """Returns Matcher object if provided string is a splitter and None otherwise. - ''' + """ for pattern in SPLITTER_PATTERNS: matcher = re.match(pattern, line) if matcher: @@ -607,12 +606,12 @@ def is_splitter(line): def text_content(context): - '''XPath Extension function to return a node text content.''' + """XPath Extension function to return a node text content.""" return context.context_node.xpath("string()").strip() def tail(context): - '''XPath Extension function to return a node tail text.''' + """XPath Extension function to return a node tail text.""" return context.context_node.tail or '' diff --git a/train.py b/train.py index 63ac7fa0..0833c657 100644 --- a/train.py +++ b/train.py @@ -7,5 +7,6 @@ def train_model(): """ retrain model and persist """ train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) + if __name__ == "__main__": train_model() From dce2441bcebe7e19556e163c789cd558ed2c3a67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 27 Nov 2018 10:02:14 +0100 Subject: [PATCH 4/4] Fix encoding issues with Python3 --- talon/quotations.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 3f75b46b..85bc1a13 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -417,10 +417,8 @@ def extract_from_html(msg_body): Returns a unicode string. """ - if isinstance(msg_body, six.text_type): - msg_body = msg_body.encode('utf8') - elif not isinstance(msg_body, bytes): - msg_body = msg_body.encode('ascii') + if not isinstance(msg_body, six.text_type): + msg_body = msg_body.decode('utf8') result = _extract_from_html(msg_body) if isinstance(result, bytes): @@ -448,10 +446,10 @@ def _extract_from_html(msg_body): then checking deleted checkpoints, then deleting necessary tags. """ - if msg_body.strip() == b'': + if msg_body.strip() == '': return msg_body - msg_body = msg_body.replace(b'\r\n', b'\n') + msg_body = msg_body.replace('\r\n', '\n') msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)