diff --git a/talon/html_quotations.py b/talon/html_quotations.py
index a2db32d5..a2c0df5d 100644
--- a/talon/html_quotations.py
+++ b/talon/html_quotations.py
@@ -78,7 +78,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
def cut_gmail_quote(html_message):
- ''' Cuts the outermost block element with class gmail_quote. '''
+ """Cuts the outermost block element with class gmail_quote."""
gmail_quote = cssselect('div.gmail_quote', html_message)
if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
gmail_quote[0].getparent().remove(gmail_quote[0])
@@ -86,7 +86,7 @@ def cut_gmail_quote(html_message):
def cut_microsoft_quote(html_message):
- ''' Cuts splitter block and all following blocks. '''
+ """Cuts splitter block and all following blocks."""
#use EXSLT extensions to have a regex match() function with lxml
ns = {"re": "http://exslt.org/regular-expressions"}
@@ -151,7 +151,7 @@ def cut_by_id(html_message):
def cut_blockquote(html_message):
- ''' Cuts the last non-nested blockquote with wrapping elements.'''
+ """Cuts the last non-nested blockquote with wrapping elements."""
quote = html_message.xpath(
'(.//blockquote)'
'[not(@class="gmail_quote") and not(ancestor::blockquote)]'
@@ -222,6 +222,7 @@ def cut_from_block(html_message):
block.getparent().remove(block)
return True
+
def cut_zimbra_quote(html_message):
zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]')
if zDivider:
diff --git a/talon/quotations.py b/talon/quotations.py
index 8b368e59..85bc1a13 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -25,7 +25,7 @@
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
RE_ON_DATE_SMB_WROTE = re.compile(
- u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+ u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*\s({2}):?-*)'.format(
# Beginning of the line
u'|'.join((
# English
@@ -417,10 +417,8 @@ def extract_from_html(msg_body):
Returns a unicode string.
"""
- if isinstance(msg_body, six.text_type):
- msg_body = msg_body.encode('utf8')
- elif not isinstance(msg_body, bytes):
- msg_body = msg_body.encode('ascii')
+ if not isinstance(msg_body, six.text_type):
+ msg_body = msg_body.decode('utf8')
result = _extract_from_html(msg_body)
if isinstance(result, bytes):
@@ -448,10 +446,10 @@ def _extract_from_html(msg_body):
then checking deleted checkpoints,
then deleting necessary tags.
"""
- if msg_body.strip() == b'':
+ if msg_body.strip() == '':
return msg_body
- msg_body = msg_body.replace(b'\r\n', b'\n')
+ msg_body = msg_body.replace('\r\n', '\n')
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
@@ -596,10 +594,9 @@ def _readable_text_empty(html_tree):
def is_splitter(line):
- '''
- Returns Matcher object if provided string is a splitter and
+ """Returns a match object if the provided string is a splitter and
None otherwise.
- '''
+ """
for pattern in SPLITTER_PATTERNS:
matcher = re.match(pattern, line)
if matcher:
@@ -607,12 +604,12 @@ def is_splitter(line):
def text_content(context):
- '''XPath Extension function to return a node text content.'''
+ """XPath Extension function to return a node text content."""
return context.context_node.xpath("string()").strip()
def tail(context):
- '''XPath Extension function to return a node tail text.'''
+ """XPath Extension function to return a node tail text."""
return context.context_node.tail or ''
diff --git a/train.py b/train.py
index 63ac7fa0..0833c657 100644
--- a/train.py
+++ b/train.py
@@ -7,5 +7,6 @@ def train_model():
""" retrain model and persist """
train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
+
if __name__ == "__main__":
train_model()