From 61d53e63c14fb6c0513177530f92fdc90453b00a Mon Sep 17 00:00:00 2001 From: Gaute Hope Date: Tue, 28 Jan 2014 16:16:32 +0100 Subject: [PATCH 1/2] rmail: dont convert to ascii, use unicode regexp and add test for header field with weird char --- lib/sup/message.rb | 4 +- lib/sup/util.rb | 17 ------- .../weird-encoding-in-header-field.eml | 7 +++ test/test_messages_dir.rb | 44 +++++++++++++++++++ 4 files changed, 54 insertions(+), 18 deletions(-) create mode 100644 test/messages/weird-encoding-in-header-field.eml diff --git a/lib/sup/message.rb b/lib/sup/message.rb index 7fcb1556e..67b16b849 100644 --- a/lib/sup/message.rb +++ b/lib/sup/message.rb @@ -72,7 +72,9 @@ def decode_header_field v return v unless v.is_a? String return unless v.size < MAX_HEADER_VALUE_SIZE # avoid regex blowup on spam d = v.dup - d = d.transcode($encoding, 'ASCII') + # no need to transcode field when it is not converted to ascii + # in RMail + #d = d.transcode($encoding, 'ASCII') Rfc2047.decode_to $encoding, d end diff --git a/lib/sup/util.rb b/lib/sup/util.rb index f5ddd921f..319f50ff8 100644 --- a/lib/sup/util.rb +++ b/lib/sup/util.rb @@ -128,23 +128,6 @@ def calculate_boundaries(message) end class Header - - # Convert to ASCII before trying to match with regexp - class Field - - class << self - def parse(field) - field = field.dup.to_s - field = field.fix_encoding!.ascii - if field =~ EXTRACT_FIELD_NAME_RE - [ $1, $'.chomp ] - else - [ "", Field.value_strip(field) ] - end - end - end - end - ## Be more cautious about invalid content-type headers ## the original RMail code calls ## value.strip.split(/\s*;\s*/)[0].downcase diff --git a/test/messages/weird-encoding-in-header-field.eml b/test/messages/weird-encoding-in-header-field.eml new file mode 100644 index 000000000..73cc419a3 --- /dev/null +++ b/test/messages/weird-encoding-in-header-field.eml @@ -0,0 +1,7 @@ +From: foo@example.org +To: tesæt@example.org +Subject: here comes a weird char: õ + + +check out: https://github.com/sup-heliotrope/sup/issues/205 + diff --git a/test/test_messages_dir.rb b/test/test_messages_dir.rb index 6341559a4..1556f2972 100644 --- a/test/test_messages_dir.rb +++ b/test/test_messages_dir.rb @@ -1,4 +1,5 @@ #!/usr/bin/ruby +# encoding: utf-8 require 'test_helper' require 'sup' @@ -140,6 +141,49 @@ def test_missing_line assert (badline.display_length > 0), "The length of this line should greater than 0: #{badline}" end + def test_weird_header_encoding + require 'sup/util' + message = '' + File.open 'test/messages/weird-encoding-in-header-field.eml', "r:UTF-8" do |f| + message = f.read + end + + #message.force_encoding 'UTF-8' + + source = DummySource.new("sup-test://test_messages") + source.messages = [ message ] + source_info = 0 + + + sup_message = Message.build_from_source(source, source_info) + sup_message.load_from_source! + + from = sup_message.from + # "from" is just a simple person item + + assert_equal("foo@example.org", from.email) + #assert_equal("Fake Sender", from.name) + + to = sup_message.to[0] + test_to = "tesæt@example.org" + + assert_equal(test_to, to.email) + + subj = sup_message.subj + test_subj = "here comes a weird char: õ" + assert_equal(test_subj, subj) + + chunks = sup_message.load_from_source! + indexable_chunks = sup_message.indexable_chunks + + # there should be only one chunk + #assert_equal(1, chunks.length) + + lines = chunks[0].lines + + # check if body content includes some of the expected text + assert (lines.join.include? "check out: "), "Body message does not match expected value" + end end end From 1a68681ebae0b5684140d8cd675800dbaf02e726 Mon Sep 17 00:00:00 2001 From: Gaute Hope Date: Thu, 30 Jan 2014 11:06:05 +0100 Subject: [PATCH 2/2] test: crappy address field in spam message --- test/messages/bad-address-spam.eml | 160 ++++++++++++++++++ .../weird-encoding-in-header-field.eml | 2 +- test/test_messages_dir.rb | 56 +++++- 3 files changed, 216 insertions(+), 2 deletions(-) create mode 100644 test/messages/bad-address-spam.eml diff --git a/test/messages/bad-address-spam.eml b/test/messages/bad-address-spam.eml new file mode 100644 index 000000000..132a11492 --- /dev/null +++ b/test/messages/bad-address-spam.eml @@ -0,0 +1,160 @@ +Return-Path: +X-Original-To: a@b.c +Delivered-To: a@b.c +Received: by binsbergen.vps.codeventures.net (Postfix, from userid 1001) + id 0FE2122017; Fri, 2 Mar 2012 07:37:55 +0100 (CET) +Received: from localhost by binsbergen.vps.codeventures.net + with SpamAssassin (version 3.3.2); + Fri, 02 Mar 2012 07:37:54 +0100 +From: VIAGRA ® Official Site +To: +Subject: ddddddddd Pf|zer Discount ID48184 +X-Spam-Checker-Version: SpamAssassin 3.3.2 (2011-06-06) on + aa.a.a.a.a.net +X-Spam-Flag: YES +X-Spam-Level: ***************************************** +X-Spam-Status: Yes, score=41.3 required=5.0 tests=BAYES_99,DOS_OE_TO_MX, + DRUGS_ERECTILE,DRUGS_ERECTILE_OBFU,FORGED_MUA_OUTLOOK,FORGED_OUTLOOK_HTML, + FORGED_YAHOO_RCVD,FREEMAIL_FROM,FSL_HELO_NON_FQDN_1,HELO_NO_DOMAIN, + HK_NAME_DRUGS,HTML_IMAGE_ONLY_12,HTML_MESSAGE,HTML_SHORT_LINK_IMG_1, + MIME_HTML_ONLY,MISSING_DATE,MISSING_MID,RAZOR2_CF_RANGE_51_100, + RAZOR2_CF_RANGE_E8_51_100,RAZOR2_CHECK,RCVD_IN_BL_SPAMCOP_NET, + RCVD_IN_BRBL_LASTEXT,RCVD_IN_PBL,RCVD_IN_XBL,RDNS_NONE,T_REMOTE_IMAGE, + T_SURBL_MULTI1,T_SURBL_MULTI2,T_SURBL_MULTI3,T_URIBL_SEM,T_URIBL_SEM_FRESH, + T_URIBL_SEM_RED,URIBL_AB_SURBL,URIBL_DBL_SPAM,URIBL_JP_SURBL,URIBL_RHS_DOB, + URIBL_SBL,URIBL_SC_SURBL,URIBL_WS_SURBL autolearn=spam version=3.3.2 +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="----------=_4F506AC2.EE281DC4" +Message-Id: <20120302063755.0FE2122017@a.a.a.a> +Date: Fri, 2 Mar 2012 07:37:55 +0100 (CET) + +This is a multi-part message in MIME format. + +------------=_4F506AC2.EE281DC4 +Content-Type: text/plain; charset=iso-8859-1 +Content-Disposition: inline +Content-Transfer-Encoding: 8bit + +Spam detection software, running on the system "a.a.a.a.a.", has +identified this incoming email as possible spam. The original message +has been attached to this so you can view it (if it isn't spam) or label +similar future email. If you have any questions, see +@@CONTACT_ADDRESS@@ for details. + +Content preview: Click here! [...] + +Content analysis details: (41.3 points, 5.0 required) + + pts rule name description +---- ---------------------- -------------------------------------------------- + 3.3 RCVD_IN_PBL RBL: Received via a relay in Spamhaus PBL + [197.226.240.40 listed in zen.spamhaus.org] + 0.4 RCVD_IN_XBL RBL: Received via a relay in Spamhaus XBL + 4.5 URIBL_AB_SURBL Contains an URL listed in the AB SURBL blocklist + [URIs: pillsquar.com] + 1.6 URIBL_WS_SURBL Contains an URL listed in the WS SURBL blocklist + [URIs: pillsquar.com] + 1.2 URIBL_JP_SURBL Contains an URL listed in the JP SURBL blocklist + [URIs: pillsquar.com] + 0.6 URIBL_SC_SURBL Contains an URL listed in the SC SURBL blocklist + [URIs: pillsquar.com] + 1.5 URIBL_RHS_DOB Contains an URI of a new domain (Day Old Bread) + [URIs: pillsquar.com] + 1.7 URIBL_DBL_SPAM Contains an URL listed in the DBL blocklist + [URIs: pillsquar.com] + 3.5 BAYES_99 BODY: Bayes spam probability is 99 to 100% + [score: 1.0000] + 0.0 FSL_HELO_NON_FQDN_1 FSL_HELO_NON_FQDN_1 + 0.6 HK_NAME_DRUGS From name contains drugs + 0.0 FREEMAIL_FROM Sender email is commonly abused enduser mail provider + (hbvv[at]yahoo.com) + 1.6 FORGED_YAHOO_RCVD 'From' yahoo.com does not match 'Received' headers + 2.1 HTML_IMAGE_ONLY_12 BODY: HTML: images with 800-1200 bytes of words + 0.0 HTML_MESSAGE BODY: HTML included in message + 0.7 MIME_HTML_ONLY BODY: Message only has text/html MIME parts + 1.9 RAZOR2_CF_RANGE_E8_51_100 Razor2 gives engine 8 confidence level + above 50% + [cf: 100] + 0.5 RAZOR2_CF_RANGE_51_100 Razor2 gives confidence level above 50% + [cf: 100] + 0.9 RAZOR2_CHECK Listed in Razor2 (http://razor.sf.net/) + 1.4 RCVD_IN_BRBL_LASTEXT RBL: RCVD_IN_BRBL_LASTEXT + [197.226.240.40 listed in bb.barracudacentral.org] + 1.3 RCVD_IN_BL_SPAMCOP_NET RBL: Received via a relay in bl.spamcop.net + [Blocked - see ] + 0.0 T_URIBL_SEM_RED Contains a URI listed in urired.spameatingmonkey.net + [URIs: pillsquar.com] + 0.0 T_URIBL_SEM Contains a URI listed in uribl.spameatingmonkey.net + [URIs: pillsquar.com] + 0.0 T_URIBL_SEM_FRESH Contains a domain registered less than 5 days ago + [URIs: pillsquar.com] + 1.6 URIBL_SBL Contains an URL listed in the SBL blocklist + [URIs: pillsquar.com] + 0.5 MISSING_MID Missing Message-Id: header + 1.1 DRUGS_ERECTILE_OBFU Obfuscated reference to an erectile drug + 2.0 DRUGS_ERECTILE Refers to an erectile drug + 0.0 T_SURBL_MULTI2 T_SURBL_MULTI2 + 0.8 RDNS_NONE Delivered to internal network by a host with no rDNS + 0.0 HTML_SHORT_LINK_IMG_1 HTML is very short with a linked image + 0.0 T_SURBL_MULTI3 T_SURBL_MULTI3 + 0.0 FORGED_OUTLOOK_HTML Outlook can't send HTML message only + 0.0 T_SURBL_MULTI1 T_SURBL_MULTI1 + 0.0 HELO_NO_DOMAIN Relay reports its domain incorrectly + 1.4 MISSING_DATE Missing Date: header + 0.0 T_REMOTE_IMAGE Message contains an external image + 1.9 FORGED_MUA_OUTLOOK Forged mail pretending to be from MS Outlook + 2.5 DOS_OE_TO_MX Delivered direct to MX with OE headers + +The original message was not completely plain text, and may be unsafe to +open with some email clients; in particular, it may contain a virus, +or confirm that your address can receive spam. If you wish to view +it, it may be safer to save it to a file and open it with an editor. + + +------------=_4F506AC2.EE281DC4 +Content-Type: message/rfc822; x-spam-type=original +Content-Description: original message before SpamAssassin +Content-Disposition: attachment +Content-Transfer-Encoding: 8bit + +Received: from aspire4315 (unknown [197.226.240.40]) + by binsbergen.vps.codeventures.net (Postfix) with SMTP id EC69521FA4 + for ; Fri, 2 Mar 2012 07:37:51 +0100 (CET) +Subject: a@b.c Pf|zer Discount ID48184 +From: VIAGRA ® Official Site +To: +MIME-Version: 1.0 +Content-Type: multipart/related; + boundary="----=_NextPart_000_0ADE_01CCF860.3D890300" +X-Priority: 3 +X-MSMail-Priority: Normal +X-Mailer: Microsoft Outlook Express 6.00.2900.3138 +X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2900.3138 + +This is a multi-part message in MIME format. + +------=_NextPart_000_0ADE_01CCF860.3D890300 +Content-Type: text/html; + charset="windows-1251" +Content-Transfer-Encoding: 8bit + + + + + + + + + + +
Click here!
+
+
+ + + + + +------------=_4F506AC2.EE281DC4-- + diff --git a/test/messages/weird-encoding-in-header-field.eml b/test/messages/weird-encoding-in-header-field.eml index 73cc419a3..2c5e20e90 100644 --- a/test/messages/weird-encoding-in-header-field.eml +++ b/test/messages/weird-encoding-in-header-field.eml @@ -1,4 +1,4 @@ -From: foo@example.org +From: Hæy There To: tesæt@example.org Subject: here comes a weird char: õ diff --git a/test/test_messages_dir.rb b/test/test_messages_dir.rb index 1556f2972..4151b7d7a 100644 --- a/test/test_messages_dir.rb +++ b/test/test_messages_dir.rb @@ -27,6 +27,18 @@ def File.exists? file module Redwood + # monkey patch the MBox source to work without the index. + require 'sup/mbox' + class MBox + def first_new_message + 0 + end + + def my_ensure_open + @f = File.open @path, 'rb' if @f.nil? + end + end + class TestMessagesDir < ::Minitest::Unit::TestCase def setup @@ -142,7 +154,6 @@ def test_missing_line end def test_weird_header_encoding - require 'sup/util' message = '' File.open 'test/messages/weird-encoding-in-header-field.eml', "r:UTF-8" do |f| message = f.read @@ -184,6 +195,49 @@ def test_weird_header_encoding # check if body content includes some of the expected text assert (lines.join.include? "check out: "), "Body message does not match expected value" end + + + + def test_bad_address_field + + testmsg = File.join(File.dirname(__FILE__), 'messages/bad-address-spam.eml') + source = MBox.new ("mbox:///#{testmsg}") + source.my_ensure_open + source_info = 0 + + sup_message = nil + + source.poll do |sym, args| + assert_equal(sym, :add) + sup_message = Message.build_from_source source, args[:info] + end + + from = sup_message.from + # "from" is just a simple person item + + assert_equal("hbvv@yahoo.com", from.email) + #assert_equal("Fake Sender", from.name) + + to = sup_message.to[0] + test_to = "a@b.c" + + assert_equal(test_to, to.email) + + subj = sup_message.subj + test_subj = "here comes a weird char: õ" + #assert_equal(test_subj, subj) + + chunks = sup_message.load_from_source! + indexable_chunks = sup_message.indexable_chunks + + # there should be only one chunk + #assert_equal(1, chunks.length) + + lines = chunks[0].lines + + # check if body content includes some of the expected text + #assert (lines.join.include? "check out: "), "Body message does not match expected value" + end end end