-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathheaderparser.js
1154 lines (1068 loc) · 46.2 KB
/
headerparser.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
define(function(require) {
/**
* This file implements the structured decoding of message header fields. It is
* part of the same system as found in mimeutils.js, and occasionally makes
* references to globals defined in that file or other dependencies thereof. See
* documentation in that file for more information about external dependencies.
*/
"use strict";
var mimeutils = require('./mimeutils');
/**
* This is the API that we ultimately return.
*
* We define it as a global here, because we need to pass it as a |this|
* argument to a few functions.
*/
var headerparser = {};
/**
* Tokenizes a message header into a stream of tokens as a generator.
*
* The low-level tokens are meant to loosely correspond to the tokens as
* defined in RFC 5322. For reasons of saner error handling, however, the two
* definitions are not exactly equivalent. The tokens we emit are the following:
* 1. Special delimiters: Any char in the delimiters string is emitted as a
* string by itself. Parsing parameter headers, for example, would use ";="
* for the delimiter string.
* 2. Quoted-strings (if opts.qstring is true): A string which is surrounded by
* double quotes. Escapes in the string are omitted when returning.
* 3. Domain Literals (if opts.dliteral is true): A string which matches the
* dliteral construct in RFC 5322. Escapes here are NOT omitted.
* 4. Comments (if opts.comments is true): Comments are handled specially. In
* practice, decoding the comments in To headers appears to be necessary, so
* comments are not stripped in the output value. Instead, they are emitted
* as if they are a special delimiter. However, all delimiters found within a
* comment are returned as if they were a quoted string, so that consumers
* ignore delimiters within comments. If ignoring comment text completely is
* desired, upon seeing a "(" token, consumers should ignore all tokens until
* a matching ")" is found (note that comments can be nested).
* 5. RFC 2047 encoded-words (if opts.rfc2047 is true): These are strings which
* are the decoded contents of RFC 2047's =?UTF-8?Q?blah?=-style words.
* 6. Atoms: Atoms are defined not in the RFC 5322 sense, but rather as the
* longest sequence of characters that is neither whitespace nor any of the
* special characters above.
*
* The intended interpretation of the stream of output tokens is that they are
* the portions of text which can be safely wrapped in whitespace with no ill
* effect. The output tokens are either strings (which represent individual
* delimiter tokens) or instances of a class that has a customized .toString()
* for output (for quoted strings, atoms, domain literals, and encoded-words).
* Checking for a delimiter MUST use the strictly equals operator (===). For
* example, the proper way to call this method is as follows:
*
* for (let token of getHeaderTokens(rest, ";=", opts)) {
* if (token === ';') {
* // This represents a literal ';' in the string
* } else if (token === '=') {
* // This represents a literal '=' in the string
* } else {
* // If a ";" qstring was parsed, we fall through to here!
* token = token.toString();
* }
* }
*
* This method does not properly tokenize 5322 in all corner cases; however,
* this is equivalent in those corner cases to an older header parsing
* algorithm, so the algorithm should be correct for all real-world cases. The
* corner cases are as follows:
* 1. Quoted-strings and domain literals are parsed even if they are within a
* comment block (we effectively treat ctext as containing qstring).
* 2. WSP need not be between a qstring and an atom (a"b" produces two tokens,
* a and b). This is an error case, though.
* 3. Legacy comments as display names: We recognize address fields with
* comments, and (a) either drop them if inside addr-spec or (b) preserve
* them as part of the display-name if not. If the display-name is empty
* while the last comment is not, we assume it's the legacy form above and
* take the comment content as the display-name.
*
* @param {String} value The header value, post charset conversion but
* before RFC 2047 decoding, to be parsed.
* @param {String} delimiters A set of delimiters to include as individual
* tokens.
* @param {Object} opts A set of options selecting what to parse.
* @param {Boolean} [opts.qstring] If true, recognize quoted strings.
* @param {Boolean} [opts.dliteral] If true, recognize domain literals.
* @param {Boolean} [opts.comments] If true, recognize comments.
* @param {Boolean} [opts.rfc2047] If true, parse and decode RFC 2047
* encoded-words.
* @returns {(Token|String)[]} An array of Token objects (which have a toString
* method returning their value) or String objects
* (representing delimiters).
*/
function getHeaderTokens(value, delimiters, opts) {
  // Splits a header value into delimiter strings and Token-like objects; the
  // result is returned as a plain array. This method used to be a generator,
  // but it appears that generators are poorly optimized in current engines, so
  // it was converted to not be one.
  let tokenList = [];

  /// Represents a non-delimiter token
  function Token(token) {
    // Unescape all quoted pairs. Any trailing \ is deleted.
    this.token = token.replace(/\\(.?)/g, "$1");
  }
  Token.prototype.toString = function () { return this.token; };

  // One or more RFC 2047 encoded-words separated only by whitespace. The
  // pattern is anchored with ^ because it is always applied to the text that
  // starts at the current character: without the anchor, a malformed "=?" at
  // the current position could pair up with a well-formed encoded-word later
  // in the string, and the wrong span of text would be decoded and skipped.
  // The regex has no g/y flag, so it is stateless and safe to reuse.
  const encodedWordsRE = /^([ \t\r\n]*=\?[^?]*\?[BbQq]\?[^?]*\?=)+/;

  // The start of the current token (e.g., atoms, strings)
  let tokenStart = undefined;
  // The set of whitespace characters, as defined by RFC 5322
  let wsp = " \t\r\n";
  // If we are a domain literal ([]) or a quoted string ("), this is set to the
  // character to look for at the end.
  let endQuote = undefined;
  // The current depth of comments, since they can be nested. A value 0 means we
  // are not in a comment.
  let commentDepth = 0;

  // Iterate over every character one character at a time.
  let length = value.length;
  for (let i = 0; i < length; i++) {
    let ch = value[i];
    // If we see a \, no matter what context we are in, ignore the next
    // character.
    if (ch == '\\') {
      i++;
      continue;
    }

    // If we are in a qstring or a dliteral, process the character only if it is
    // what we are looking for to end the quote.
    if (endQuote !== undefined) {
      if (ch == endQuote && ch == '"') {
        // Quoted strings don't include their delimiters.
        let text = value.slice(tokenStart + 1, i);

        // If RFC 2047 is enabled, always decode the qstring.
        if (opts.rfc2047)
          text = decodeRFC2047Words(text);

        tokenList.push(new Token(text));
        endQuote = undefined;
        tokenStart = undefined;
      } else if (ch == endQuote && ch == ']') {
        // Domain literals include their delimiters.
        tokenList.push(new Token(value.slice(tokenStart, i + 1)));
        endQuote = undefined;
        tokenStart = undefined;
      }
      // Avoid any further processing.
      continue;
    }

    // If we can match the RFC 2047 encoded-word pattern, we need to decode the
    // entire word or set of words.
    if (opts.rfc2047 && ch == '=' && i + 1 < length && value[i + 1] == '?') {
      // RFC 2047 tokens separated only by whitespace are conceptually part of
      // the same output token, so we need to decode them all at once.
      let result = encodedWordsRE.exec(value.slice(i));
      if (result !== null) {
        // If we were in the middle of a prior token (i.e., something like
        // foobar=?UTF-8?Q?blah?=), yield the previous segment as a token.
        if (tokenStart !== undefined) {
          tokenList.push(new Token(value.slice(tokenStart, i)));
          tokenStart = undefined;
        }

        // Because the pattern is anchored, result[0] is exactly the text that
        // starts at position i.
        let encodedWords = result[0];
        let string = decodeRFC2047Words(encodedWords);
        // Don't make a new Token variable, since we do not want to unescape the
        // decoded string.
        tokenList.push({ toString: function() { return string; }});

        // Skip everything we decoded. The -1 is because we don't want to
        // include the starting character.
        i += encodedWords.length - 1;
        continue;
      }

      // If we are here, then we failed to match the simple 2047 encoded-word
      // regular expression, despite the fact that it matched the =? at the
      // beginning. Fall through and treat the text as if we aren't trying to
      // decode RFC 2047.
    }

    // If we reach this point, we're not inside of quoted strings, domain
    // literals, or RFC 2047 encoded-words. This means that the characters we
    // parse are potential delimiters (unless we're in comments, where
    // everything starts to go really wonky). Several things could happen,
    // depending on the kind of character we read and whether or not we were in
    // the middle of a token. The three values here tell us what we could need
    // to do at this point:
    // tokenIsEnding: The current character is not able to be accumulated to an
    // atom, so we need to flush the atom if there is one.
    // tokenIsStarting: The current character could begin an atom (or
    // anything that requires us to mark the starting point), so we need to save
    // the location.
    // isSpecial: The current character is a delimiter that needs to be output.
    let tokenIsEnding = false, tokenIsStarting = false, isSpecial = false;
    if (wsp.includes(ch)) {
      // Whitespace ends current tokens, doesn't emit anything.
      tokenIsEnding = true;
    } else if (commentDepth == 0 && delimiters.includes(ch)) {
      // Delimiters end the current token, and need to be output. They do not
      // apply within comments.
      tokenIsEnding = true;
      isSpecial = true;
    } else if (opts.qstring && ch == '"') {
      // Quoted strings end the last token and start a new one.
      tokenIsEnding = true;
      tokenIsStarting = true;
      endQuote = ch;
    } else if (opts.dliteral && ch == '[') {
      // Domain literals end the last token and start a new one.
      tokenIsEnding = true;
      tokenIsStarting = true;
      endQuote = ']';
    } else if (opts.comments && ch == '(') {
      // Comments are nested (oh joy). We only really care for the outer
      // delimiter, though, which also ends the prior token and needs to be
      // output if the consumer requests it.
      commentDepth++;
      if (commentDepth == 1) {
        tokenIsEnding = true;
        isSpecial = true;
      } else {
        tokenIsStarting = true;
      }
    } else if (opts.comments && ch == ')') {
      // Comments are nested (oh joy). We only really care for the outer
      // delimiter, though, which also ends the prior token and needs to be
      // output if the consumer requests it.
      if (commentDepth > 0)
        commentDepth--;
      if (commentDepth == 0) {
        tokenIsEnding = true;
        isSpecial = true;
      } else {
        tokenIsStarting = true;
      }
    } else {
      // Not a delimiter, whitespace, comment, domain literal, or quoted string.
      // Must be part of an atom then!
      tokenIsStarting = true;
    }

    // If our analysis concluded that we closed an open token, and there is an
    // open token, then yield that token.
    if (tokenIsEnding && tokenStart !== undefined) {
      tokenList.push(new Token(value.slice(tokenStart, i)));
      tokenStart = undefined;
    }
    // If we need to output a delimiter, do so.
    if (isSpecial)
      tokenList.push(ch);
    // If our analysis concluded that we could open a token, and no token is
    // opened yet, then start the token.
    if (tokenIsStarting && tokenStart === undefined) {
      tokenStart = i;
    }
  }

  // That concludes the loop! If there is a currently open token, close that
  // token now.
  if (tokenStart !== undefined) {
    // Error case: a partially-open quoted string is assumed to have a trailing
    // " character.
    if (endQuote == '"')
      tokenList.push(new Token(value.slice(tokenStart + 1)));
    else
      tokenList.push(new Token(value.slice(tokenStart)));
  }
  return tokenList;
}
/**
* Convert a header value into UTF-16 strings by attempting to decode as UTF-8
* or another legacy charset. If the header is valid UTF-8, it will be decoded
* as UTF-8; if it is not, the fallbackCharset will be attempted instead.
*
* @param {String} headerValue The header (as a binary string) to attempt
* to convert to UTF-16.
* @param {String} [fallbackCharset] The optional charset to try if UTF-8
* doesn't work.
* @returns {String} The UTF-16 representation of the string above.
*/
function convert8BitHeader(headerValue, fallbackCharset) {
  // A header made purely of 7-bit characters is already correct; hand it back
  // untouched.
  if (!/[\x80-\xff]/.test(headerValue))
    return headerValue;

  // TextDecoder works on bytes, so turn the binary string into a typed array.
  const rawBytes = mimeutils.stringToTypedArray(headerValue);

  // A fallback is only worth trying when it is not itself a UTF-* charset:
  // UTF-8 would be redundant, and UTF-16/UTF-32 radically change header
  // interpretation. When a usable fallback exists, the UTF-8 attempt is made
  // fatal so that a failure is noticed; otherwise bad bytes simply turn into
  // substitution characters.
  const tryFallbackOnError = fallbackCharset &&
    !fallbackCharset.toLowerCase().startsWith("utf");
  const strictOrLenientUtf8 = new TextDecoder("utf-8", {fatal: tryFallbackOnError});

  try {
    return strictOrLenientUtf8.decode(rawBytes);
  } catch (e) {
    // Not valid UTF-8: reinterpret the same bytes in the fallback charset.
    const fallbackDecoder = new TextDecoder(fallbackCharset, {fatal: false});
    return fallbackDecoder.decode(rawBytes);
  }
}
/**
* Decodes all RFC 2047 encoded-words in the input string. The string does not
* necessarily have to contain any such words. This is useful, for example, for
* parsing unstructured headers.
*
* @param {String} headerValue The header which may contain RFC 2047 encoded-
* words.
* @returns {String} A full UTF-16 string with all encoded words expanded.
*/
function decodeRFC2047Words(headerValue) {
  // Unfortunately, many implementations of RFC 2047 encoding are actually wrong
  // in that they split over-long encoded words without regard for whether or
  // not the split point is in the middle of a multibyte character. Therefore,
  // we need to be able to handle these situations gracefully. This is done by
  // using the decoder in streaming mode so long as the next token is another
  // 2047 token with the same charset.
  let lastCharset = '', currentDecoder = undefined;

  /**
   * Decode a single RFC 2047 token. This function is inline so that we can
   * easily close over the lastCharset/currentDecoder variables, needed for
   * handling bad RFC 2047 productions properly.
   *
   * @param {String} token A candidate =?charset?enc?text?= token.
   * @returns {String|false} The decoded text, or false if the token cannot be
   *                         decoded (in which case the streaming state is left
   *                         untouched so the caller can flush it).
   */
  function decode2047Token(token) {
    let tokenParts = token.split("?");

    // If it's obviously not a valid token, return false immediately.
    if (tokenParts.length != 5 || tokenParts[4] != '=')
      return false;

    // The charset parameter is defined in RFC 2231 to be charset or
    // charset*language. We only care about the charset here, so ignore any
    // language parameter that gets passed in.
    let charset = tokenParts[1].split('*', 1)[0];
    let encoding = tokenParts[2], text = tokenParts[3];

    let buffer;
    if (encoding == 'B' || encoding == 'b') {
      // Decode base64. If there's any non-base64 data, treat the string as
      // an illegal token.
      if (/[^A-Za-z0-9+\/=]/.exec(text))
        return false;

      // Base64 strings must be a length of multiple 4, but it seems that some
      // mailers accidentally insert one too many `=' chars. Gracefully handle
      // this case; see bug 227290 for more information.
      if (text.length % 4 == 1 && text.charAt(text.length - 1) == '=')
        text = text.slice(0, -1);

      // Decode the string
      buffer = mimeutils.decode_base64(text, false)[0];
    } else if (encoding == 'Q' || encoding == 'q') {
      // Q encoding here looks a lot like quoted-printable text. The differences
      // between quoted-printable and this are that quoted-printable allows you
      // to quote newlines (this doesn't), while this replaces spaces with _.
      // We can reuse the decode_qp code here, since newlines are already
      // stripped from the header. There is one edge case that could trigger a
      // false positive, namely when you have a single = or an = followed by
      // whitespace at the end of the string. Such an input string is already
      // malformed to begin with, so stripping the = and following input in that
      // case should not be an important loss.
      buffer = mimeutils.decode_qp(text.replace(/_/g, ' '), false)[0];
    } else {
      return false;
    }

    // Make the buffer be a typed array for what follows
    buffer = mimeutils.stringToTypedArray(buffer);

    // We can keep streaming into the existing decoder only if there is one and
    // it was created for this very charset. Otherwise a new decoder is needed.
    let output = '';
    if (!currentDecoder || charset != lastCharset) {
      // Build the replacement decoder BEFORE flushing the old one. If the
      // charset is unknown we bail out with the old decoder still in place, so
      // that the caller's flush emits whatever it was holding instead of that
      // text being lost.
      let nextDecoder;
      try {
        nextDecoder = new TextDecoder(charset, {fatal: false});
      } catch (e) {
        // We don't recognize the charset, so give up.
        return false;
      }

      // If we cannot reuse the last decoder, flush out whatever remains.
      if (currentDecoder)
        output = currentDecoder.decode();
      currentDecoder = nextDecoder;
      lastCharset = charset;
    }

    // Convert this token with the buffer. Note the stream parameter--although
    // RFC 2047 tokens aren't supposed to break in the middle of a multibyte
    // character, a lot of software messes up and does so because it's hard not
    // to (see headeremitter.js for exactly how hard!).
    return output + currentDecoder.decode(buffer, {stream: true});
  }

  // The first step of decoding is to split the string into RFC 2047 and
  // non-RFC 2047 tokens. RFC 2047 tokens look like the following:
  // =?charset?c?text?=, where c is one of B, b, Q, and q. The split regex does
  // some amount of semantic checking, so that malformed RFC 2047 tokens will
  // get ignored earlier.
  let components = headerValue.split(/(=\?[^?]*\?[BQbq]\?[^?]*\?=)/);
  for (let i = 0; i < components.length; i++) {
    if (components[i].substring(0, 2) == "=?") {
      let decoded = decode2047Token(components[i]);
      if (decoded !== false) {
        // If 2047 decoding succeeded for this bit, rewrite the original value
        // with the proper decoding.
        components[i] = decoded;

        // We're done processing, so continue to the next link.
        continue;
      }
    } else if (/^[ \t\r\n]*$/.exec(components[i])) {
      // Whitespace-only tokens get squashed into nothing, so 2047 tokens will
      // be concatenated together.
      components[i] = '';
      continue;
    }

    // If there was stuff left over from decoding the last 2047 token, flush it
    // out.
    lastCharset = '';
    if (currentDecoder) {
      components[i] = currentDecoder.decode() + components[i];
      currentDecoder = null;
    }
  }

  // After the for loop, we'll have a set of decoded strings. Concatenate them
  // together to make the return value.
  let decodedValue = components.join('');

  // A decoder that is still open here was last fed by the final encoded-word
  // (only squashed whitespace can follow it). Flush it so that a truncated
  // multibyte sequence at the very end shows up as a replacement character,
  // exactly as it would if more text had followed, instead of vanishing.
  if (currentDecoder)
    decodedValue += currentDecoder.decode();

  return decodedValue;
}
///////////////////////////////
// Structured field decoders //
///////////////////////////////
/**
* Extract a list of addresses from a header which matches the RFC 5322
* address-list production, possibly doing RFC 2047 decoding along the way.
*
* The output of this method is an array of elements corresponding to the
* addresses and the groups in the input header. An address is represented by
* an object of the form:
* {
* name: The display name of the address
* email: The address of the object
* }
* while a group is represented by an object of the form:
* {
* name: The display name of the group
* group: An array of address object for members in the group.
* }
*
* @param {String} header The MIME header text to be parsed
* @param {Boolean} doRFC2047 If true, decode RFC 2047 parameters found in the
* header.
* @returns {(Address|Group)[]} An array of the addresses found in the header,
* where each element is of the form mentioned
* above.
*/
function parseAddressingHeader(header, doRFC2047) {
  // Walks the token stream of an address-list header and assembles mailbox
  // ({name, email}) and group ({name, group}) objects from it.

  // RFC 2047 decoding is on unless the caller explicitly passes a value.
  const decodeWords = doRFC2047 === undefined ? true : doRFC2047;

  // Finished top-level entries (mailboxes and groups).
  let results = [];
  // Mailboxes collected since the last group boundary.
  let addrlist = [];

  // Text accumulated for the mailbox currently being read. Until an '@' or a
  // '<' shows up we cannot tell whether the atoms seen so far form a
  // display-name (comments kept) or the local-part of an addr-spec (comments
  // ignored), so both variants are collected side by side.
  let name = '';
  let localPart = '';
  let address = '';
  let groupName = '';
  // The comment being read right now, and the last complete one. The latter
  // backs the legacy "addr-spec (Display Name)" form from RFC 5322 section 3.4:
  // if a mailbox ends up without a display-name, the last comment's content is
  // used in its place.
  let comment = '';
  let lastComment = '';

  // Parser state flags.
  let inAngle = false;
  let inComment = false;
  let needsSpace = false;
  let preserveSpace = false;
  let commentClosed = false;

  /**
   * Add the parsed mailbox object to the address list.
   * If it's in the legacy form above, correct the display-name.
   * Also reset any faked flags.
   * @param {String} displayName display-name as per RFC 5322
   * @param {String} addrSpec    addr-spec as per RFC 5322
   */
  const addToAddrList = (displayName, addrSpec) => {
    let shownName = displayName;
    if (shownName === '' && lastComment !== '') {
      // Strip the surrounding "(" / " (" and ")" from the stored comment.
      const offset = lastComment[0] === ' ' ? 2 : 1;
      shownName = lastComment.slice(offset, -1);
    }
    if (shownName !== '' || addrSpec !== '')
      addrlist.push({name: shownName, email: addrSpec});

    // Clear pending flags and variables.
    name = localPart = address = lastComment = '';
    inAngle = inComment = needsSpace = false;
  };

  const tokens = getHeaderTokens(header, ":,;<>@",
    {qstring: true, comments: true, dliteral: true, rfc2047: decodeWords});

  // Delimiters arrive as primitive strings, everything else as objects, so a
  // strict-equality switch separates the two cleanly.
  for (const token of tokens) {
    switch (token) {
      case ':':
        // What we took for a display-name was in fact a group name.
        groupName = name;
        name = '';
        localPart = '';
        // Mailboxes seen before the group belong to the top level.
        if (addrlist.length > 0)
          results = results.concat(addrlist);
        addrlist = [];
        break;

      case '<':
        if (!inAngle) {
          inAngle = true;
        } else {
          // A second '<': reinterpret the address read so far as a name.
          if (address.length > 0)
            name = address;
          localPart = address = '';
        }
        break;

      case '>':
        inAngle = false;
        // Forget addr-spec comments.
        lastComment = '';
        break;

      case '(':
        inComment = true;
        // needsSpace is not always set when it should be (e.g. a comment
        // right behind an angle-addr), and it has to be restored afterwards
        // if the comment ends up being ignored.
        preserveSpace = needsSpace;
        if (!needsSpace)
          needsSpace = name !== '' && !name.endsWith(' ');
        comment = needsSpace ? ' (' : '(';
        commentClosed = false;
        break;

      case ')':
        inComment = false;
        comment += ')';
        lastComment = comment;
        // The comment may be part of the name, but not of the local-part.
        // Enforce a space behind the comment only when not ignoring it.
        if (inAngle) {
          needsSpace = preserveSpace;
        } else {
          name += comment;
          needsSpace = true;
        }
        commentClosed = true;
        // Skip the needsSpace reset that follows the switch.
        continue;

      case '@':
        // Outside <>, an '@' reveals that the atoms read so far were a
        // local-part rather than a display-name.
        if (!inAngle) {
          address = localPart;
          name = '';
          localPart = '';
          // The remainder of this mailbox is part of an addr-spec.
          inAngle = true;
        }
        // Keep the local-part quoted if it needs to be.
        if (/[ !()<>\[\]:;@\\,"]/.test(address))
          address = '"' + address.replace(/([\\"])/g, "\\$1") + '"';
        address += '@';
        break;

      case ',':
        // End of a mailbox. Empty productions ("To: , ,") add nothing.
        addToAddrList(name, address);
        break;

      case ';':
        // Add pending name to the list
        addToAddrList(name, address);
        // Without a group name the ';' behaves like a ','; either way the
        // collected mailboxes move into the results.
        if (groupName !== '') {
          results.push({
            name: groupName,
            group: addrlist
          });
        } else {
          results = results.concat(addrlist);
        }
        addrlist = [];
        groupName = '';
        break;

      default: {
        // Comment content, a quoted-string, or some span of dots and atoms.
        const text = token.toString();
        const startsWithDot = text[0] == '.';
        // No separating space in front of a "." continuation.
        const spaced = needsSpace && !startsWithDot ? ' ' + text : text;

        if (inComment) {
          comment += spaced;
        } else if (inAngle) {
          address += spaced;
        } else {
          name += spaced;
          // Never add a space to the local-part, if we just ignored a comment.
          if (commentClosed) {
            localPart += text;
            commentClosed = false;
          } else {
            localPart += spaced;
          }
        }

        // The next token gets a leading space unless this one was a "."
        // delimiter.
        needsSpace = !startsWithDot;
        // Skip the needsSpace reset that follows the switch.
        continue;
      }
    }

    // If we just parsed a delimiter, we don't need any space for the next
    // token.
    needsSpace = false;
  }

  // Commit whatever mailbox is still pending, and treat a missing final ';'
  // of a group as if it had been present.
  addToAddrList(name, address);
  if (groupName !== '') {
    results.push({name: groupName, group: addrlist});
    addrlist = [];
  }

  // Anything left in the working list belongs to the top level.
  return results.concat(addrlist);
}
/**
* Extract parameters from a header which is a series of ;-separated
* attribute=value tokens.
*
* @param {String} headerValue The MIME header value to parse.
* @param {Boolean} doRFC2047 If true, decode RFC 2047 encoded-words.
* @param {Boolean} doRFC2231 If true, decode RFC 2231 encoded parameters.
* @return {Map(String -> String)} A map of parameter names to parameter values.
* The property preSemi is set to the token that
* precedes the first semicolon.
*/
function parseParameterHeader(headerValue, doRFC2047, doRFC2231) {
// The basic syntax of headerValue is token [; token = token-or-qstring]*
// Copying more or less liberally from nsMIMEHeaderParamImpl:
// The first token is the text to the first whitespace or semicolon.
var semi = headerValue.indexOf(";");
if (semi < 0) {
var start = headerValue;
var rest = '';
} else {
var start = headerValue.substring(0, semi);
var rest = headerValue.substring(semi); // Include the semicolon
}
// Strip start to be <WSP><nowsp><WSP>.
start = start.trim().split(/[ \t\r\n]/)[0];
// Decode the the parameter tokens.
let opts = {qstring: true, rfc2047: doRFC2047};
// Name is the name of the parameter, inName is true iff we don't have a name
// yet.
let name = '', inName = true;
// Matches is a list of [name, value] pairs, where we found something that
// looks like name=value in the input string.
let matches = [];
for (let token of getHeaderTokens(rest, ";=", opts)) {
if (token === ';') {
// If we didn't find a name yet (we have ... tokenA; tokenB), push the
// name with an empty token instead.
if (name != '' && inName == false)
matches.push([name, '']);
name = '';
inName = true;
} else if (token === '=') {
inName = false;
} else if (inName && name == '') {
name = token.toString();
} else if (!inName && name != '') {
token = token.toString();
// RFC 2231 doesn't make it clear if %-encoding is supposed to happen
// within a quoted string, but this is very much required in practice. If
// it ends with a '*', then the string is an extended-value, which means
// that its value may be %-encoded.
if (doRFC2231 && name.endsWith('*')) {
token = token.replace(/%([0-9A-Fa-f]{2})/g,
function percent_deencode(match, hexchars) {
return String.fromCharCode(parseInt(hexchars, 16));
});
}
matches.push([name, token]);
// Clear the name, so we ignore anything afterwards.
name = '';
} else if (inName) {
// We have ...; tokenA tokenB ... -> ignore both tokens
name = ''; // Error recovery, ignore this one
}
}
// If we have a leftover ...; tokenA, push the tokenA
if (name != '' && inName == false)
matches.push([name, '']);
// Now matches holds the parameters, so clean up for RFC 2231. There are three
// cases: param=val, param*=us-ascii'en-US'blah, and param*n= variants. The
// order of preference is to pick the middle, then the last, then the first.
// Note that we already unpacked %-encoded values.
// simpleValues is just a straight parameter -> value map.
// charsetValues is the parameter -> value map, although values are stored
// before charset decoding happens.
// continuationValues maps parameter -> array of values, with extra properties
// valid (if we decided we couldn't do anything anymore) and hasCharset (which
// records if we need to decode the charset parameter or not).
var simpleValues = new Map(), charsetValues = new Map(),
continuationValues = new Map();
for (let pair of matches) {
let name = pair[0];
let value = pair[1];
// Get first index, not last index, so we match param*0*= like param*0=.
let star = name.indexOf('*');
if (star == -1) {
// This is the case of param=val. Select the first value here, if there
// are multiple ones.
if (!simpleValues.has(name))
simpleValues.set(name, value);
} else if (star == name.length - 1) {
// This is the case of param*=us-ascii'en-US'blah.
name = name.substring(0, star);
// Again, select only the first value here.
if (!charsetValues.has(name))
charsetValues.set(name, value);
} else {
// This is the case of param*0= or param*0*=.
let param = name.substring(0, star);
let entry = continuationValues.get(param);
// Did we previously find this one to be bungled? Then ignore it.
if (continuationValues.has(param) && !entry.valid)
continue;
// If we haven't seen it yet, set up entry already. Note that entries are
// not straight string values but rather [valid, hasCharset, param0, ... ]
if (!continuationValues.has(param)) {
entry = new Array();
entry.valid = true;
entry.hasCharset = undefined;
continuationValues.set(param, entry);
}
// When the string ends in *, we need to do charset decoding.
// Note that the star is only meaningful for the *0*= case.
let lastStar = name[name.length - 1] == '*';
let number = name.substring(star + 1, name.length - (lastStar ? 1 : 0));
if (number == '0')
entry.hasCharset = lastStar;
// Is the continuation number illegal?
else if ((number[0] == '0' && number != '0') ||
!(/^[0-9]+$/.test(number))) {
entry.valid = false;
continue;
}
// Normalize to an integer
number = parseInt(number, 10);
// Is this a repeat? If so, bail.
if (entry[number] !== undefined) {
entry.valid = false;
continue;
}
// Set the value for this continuation index. JS's magic array setter will
// expand the array if necessary.
entry[number] = value;
}
}
// Build the actual parameter array from the parsed values
var values = new Map();
// Simple values have lowest priority, so just add everything into the result
// now.
for (let pair of simpleValues) {
values.set(pair[0], pair[1]);
}
if (doRFC2231) {
// Continuation values come next
for (let pair of continuationValues) {
let name = pair[0];
let entry = pair[1];
// If we never saw a param*0= or param*0*= value, then we can't do any
// reasoning about what it looks like, so bail out now.
if (entry.hasCharset === undefined) continue;
// Use as many entries in the array as are valid--if we are missing an
// entry, stop there.
let valid = true;
for (var i = 0; valid && i < entry.length; i++)
if (entry[i] === undefined)
valid = false;
// Concatenate as many parameters as are valid. If we need to decode the
// charset, do so now.
var value = entry.slice(0, i).join('');
if (entry.hasCharset) {
try {
value = decode2231Value(value);
} catch (e) {
// Bad charset, don't add anything.
continue;
}
}
// Finally, add this to the output array.
values.set(name, value);
}
// Highest priority is the charset conversion.
for (let pair of charsetValues) {
try {
values.set(pair[0], decode2231Value(pair[1]));
} catch (e) {
// Bad charset, don't add anything.
}
}
}
// Finally, return the values computed above.
values.preSemi = start;
return values;
}
/**
 * Decode an RFC 2231 extended parameter value of the form
 * charset'language'text into a Unicode string. Percent-decoding is expected to
 * have been performed by the caller already.
 *
 * The charset is everything before the first apostrophe (the empty string if
 * there is no apostrophe at all). The language section is skipped, since
 * nothing in this codebase consumes it at present. The text is whatever
 * follows the last of the first two apostrophes.
 *
 * The decoder is created in fatal mode, so an unknown charset label or a byte
 * sequence that is malformed for that charset causes an exception to be thrown
 * rather than a best-effort result being returned.
 *
 * @param {String} value The RFC 2231-encoded string to decode.
 * @return The Unicode version of the string.
 */
function decode2231Value(value) {
  // Find the (at most two) apostrophes that delimit charset and language.
  const firstQuote = value.indexOf("'");
  let secondQuote = -1;
  let charset = "";
  if (firstQuote >= 0) {
    charset = value.substring(0, firstQuote);
    secondQuote = value.indexOf("'", firstQuote + 1);
  }

  // The payload starts right after whichever delimiter comes last; when there
  // are no delimiters at all this is index 0, i.e., the whole string.
  const payloadStart = Math.max(firstQuote, secondQuote) + 1;
  const payload = value.substring(payloadStart);

  // Turn the payload into a typed array so that TextDecoder can consume it.
  const bytes = mimeutils.stringToTypedArray(payload);

  // Constructing the decoder throws if the charset label is not recognized;
  // decoding throws on invalid input because of the fatal flag.
  const decoder = new TextDecoder(charset, {fatal: true});
  return decoder.decode(bytes, {stream: false});
}
// Lookup table from legacy timezone abbreviations to their numeric ±hhmm
// offsets, used as a fallback when a Date header uses an obsolete zone name.
const kKnownTZs = {
  // Zones that RFC 5322 lists explicitly.
  UT: "+0000",
  GMT: "+0000",
  EST: "-0500",
  EDT: "-0400",
  CST: "-0600",
  CDT: "-0500",
  MST: "-0700",
  MDT: "-0600",
  PST: "-0800",
  PDT: "-0700",
  // Zones copied from NSPR's prtime.c.
  AST: "-0400", // Atlantic Standard Time
  NST: "-0330", // Newfoundland Standard Time
  BST: "+0100", // British Summer Time
  MET: "+0100", // Middle Europe Time
  EET: "+0200", // Eastern Europe Time
  JST: "+0900"  // Japan Standard Time
};
/**
 * Parse a header that contains a date-time definition according to RFC 5322.
 * The result is a JS date object with the same timestamp as the header.
 *
 * The dates returned by this parser cannot be reliably converted back into the
 * original header for two reasons. First, JS date objects cannot retain the
 * timezone information they were initialized with, so reserializing a date
 * header would necessarily produce a date in either the current timezone or in
 * UTC. Second, JS dates measure time as milliseconds elapsed from the POSIX
 * epoch excluding leap seconds. Any timestamp containing a leap second is
 * instead converted into one that represents the next second.
 *
 * The token layout that is understood is
 *   [dow ,] dom mon year hh:mm:ss [tzoff]
 * A missing or unrecognized timezone is treated as +0000. Headers that have
 * too few tokens, an unrecognized month name, or non-numeric fields produce an
 * invalid date, detectable by having its .getTime() method return NaN. No
 * Date.parse fallback is attempted.
 *
 * @param {String} header The MIME header value to parse.
 * @returns {Date}        The date contained within the header, as described
 *                        above.
 */
function parseDateHeader(header) {
  // Materialize the tokens as plain strings. (This used to be an array
  // comprehension, a non-standard syntax that engines no longer accept.)
  let tokens = Array.from(getHeaderTokens(header, ",:", {}),
                          token => token.toString());

  // What does a Date header look like? In practice, most date headers devolve
  // into Date: [dow ,] dom mon year hh:mm:ss tzoff [(abbrev)], with the day of
  // week mostly present and the timezone abbreviation mostly absent.

  // First, ignore the day-of-the-week if present. This would be the first two
  // tokens.
  if (tokens.length > 1 && tokens[1] === ',')
    tokens = tokens.slice(2);

  // If there are too few tokens, the date is obviously invalid.
  if (tokens.length < 8)
    return new Date(NaN);

  // Save off the numeric tokens. Always parse as decimal so that something
  // like "0x1F" is not silently accepted as a hexadecimal number.
  const day = parseInt(tokens[0], 10);
  // month is tokens[1]
  let year = parseInt(tokens[2], 10);
  const hours = parseInt(tokens[3], 10);
  // tokens[4] === ':'
  const minutes = parseInt(tokens[5], 10);
  // tokens[6] === ':'
  const seconds = parseInt(tokens[7], 10);

  // Compute the month. Check only the first three characters for equality;
  // this allows us to accept, e.g., "January" in lieu of "Jan."
  let month = mimeutils.kMonthNames.indexOf(tokens[1].slice(0, 3));
  // If the month name is not recognized, make the result illegal.
  if (month < 0)
    month = NaN;

  // Compute the full year if it's only 2 digits. RFC 5322 states that the
  // cutoff is 50 instead of 70.
  if (year < 100) {
    year += year < 50 ? 2000 : 1900;
  }

  // Compute the timezone offset. If it's not in the form ±hhmm, convert it to
  // that form. Only consult the table's own keys, so that a token such as
  // "constructor" is not mistaken for a known abbreviation.
  let tzoffset = tokens[8];
  if (Object.prototype.hasOwnProperty.call(kKnownTZs, tzoffset))
    tzoffset = kKnownTZs[tzoffset];
  let decompose = /^([+-])(\d\d)(\d\d)$/.exec(tzoffset);
  // Unknown? Make it +0000
  if (decompose === null)
    decompose = ['+0000', '+', '00', '00'];
  let tzOffsetInMin = parseInt(decompose[2], 10) * 60 +
                      parseInt(decompose[3], 10);
  if (decompose[1] == '-')
    tzOffsetInMin = -tzOffsetInMin;

  // How do we make the date at this point? Well, the JS date's constructor
  // builds the time in terms of the local timezone. To account for the offset
  // properly, we need to build in UTC and then shift by the header's offset.
  // If any field failed to parse, the arithmetic yields NaN and the resulting
  // date is invalid, which is how callers detect a mangled header.
  return new Date(Date.UTC(year, month, day, hours, minutes, seconds)
                  - tzOffsetInMin * 60 * 1000);
}