diff --git a/talon/html_quotations.py b/talon/html_quotations.py index a2db32d5..a2c0df5d 100644 --- a/talon/html_quotations.py +++ b/talon/html_quotations.py @@ -78,7 +78,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): def cut_gmail_quote(html_message): - ''' Cuts the outermost block element with class gmail_quote. ''' + """Cuts the outermost block element with class gmail_quote.""" gmail_quote = cssselect('div.gmail_quote', html_message) if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): gmail_quote[0].getparent().remove(gmail_quote[0]) @@ -86,7 +86,7 @@ def cut_gmail_quote(html_message): def cut_microsoft_quote(html_message): - ''' Cuts splitter block and all following blocks. ''' + """Cuts splitter block and all following blocks.""" #use EXSLT extensions to have a regex match() function with lxml ns = {"re": "http://exslt.org/regular-expressions"} @@ -151,7 +151,7 @@ def cut_by_id(html_message): def cut_blockquote(html_message): - ''' Cuts the last non-nested blockquote with wrapping elements.''' + """Cuts the last non-nested blockquote with wrapping elements.""" quote = html_message.xpath( '(.//blockquote)' '[not(@class="gmail_quote") and not(ancestor::blockquote)]' @@ -222,6 +222,7 @@ def cut_from_block(html_message): block.getparent().remove(block) return True + def cut_zimbra_quote(html_message): zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]') if zDivider: diff --git a/talon/quotations.py b/talon/quotations.py index 8b368e59..85bc1a13 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -25,7 +25,7 @@ RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) RE_ON_DATE_SMB_WROTE = re.compile( - u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( + u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*\s({2}):?-*)'.format( # Beginning of the line u'|'.join(( # English @@ -417,10 +417,8 @@ def extract_from_html(msg_body): Returns a unicode string. """ - if isinstance(msg_body, six.text_type): - msg_body = msg_body.encode('utf8') - elif not isinstance(msg_body, bytes): - msg_body = msg_body.encode('ascii') + if not isinstance(msg_body, six.text_type): + msg_body = msg_body.decode('utf8') result = _extract_from_html(msg_body) if isinstance(result, bytes): @@ -448,10 +446,10 @@ def _extract_from_html(msg_body): then checking deleted checkpoints, then deleting necessary tags. """ - if msg_body.strip() == b'': + if msg_body.strip() == '': return msg_body - msg_body = msg_body.replace(b'\r\n', b'\n') + msg_body = msg_body.replace('\r\n', '\n') msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) @@ -596,10 +594,9 @@ def _readable_text_empty(html_tree): def is_splitter(line): - ''' - Returns Matcher object if provided string is a splitter and + """Returns Matcher object if provided string is a splitter and None otherwise. - ''' + """ for pattern in SPLITTER_PATTERNS: matcher = re.match(pattern, line) if matcher: @@ -607,12 +604,12 @@ def is_splitter(line): def text_content(context): - '''XPath Extension function to return a node text content.''' + """XPath Extension function to return a node text content.""" return context.context_node.xpath("string()").strip() def tail(context): - '''XPath Extension function to return a node tail text.''' + """XPath Extension function to return a node tail text.""" return context.context_node.tail or '' diff --git a/train.py b/train.py index 63ac7fa0..0833c657 100644 --- a/train.py +++ b/train.py @@ -7,5 +7,6 @@ def train_model(): """ retrain model and persist """ train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) + if __name__ == "__main__": train_model()