-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutf7.go
191 lines (171 loc) · 5.25 KB
/
utf7.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
// Copyright 2013 The Go-IMAP Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the go-imap.LICENSE file.
package imap
import (
"encoding/base64"
"errors"
"unicode/utf16"
"unicode/utf8"
)
const (
uRepl = '\uFFFD' // Unicode replacement code point
u7min = 0x20 // Minimum self-representing UTF-7 value
u7max = 0x7E // Maximum self-representing UTF-7 value
)
// ErrBadUTF7 is returned to indicate invalid modified UTF-7 encoding.
var ErrBadUTF7 = errors.New("imap: bad utf-7 encoding")
// Base64 codec for code points outside of the 0x20-0x7E range.
var u7enc = base64.NewEncoding(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,")
// UTF7Encode converts a string from UTF-8 encoding to modified UTF-7. This
// encoding is used by the Mailbox International Naming Convention (RFC 3501
// section 5.1.3). Invalid UTF-8 byte sequences are replaced by the Unicode
// replacement code point (U+FFFD).
func UTF7Encode(s string) string {
return string(UTF7EncodeBytes([]byte(s)))
}
// UTF7EncodeBytes converts a byte slice from UTF-8 encoding to modified UTF-7.
func UTF7EncodeBytes(s []byte) []byte {
u := make([]byte, 0, len(s)*2)
for i, n := 0, len(s); i < n; {
if c := s[i]; u7min <= c && c <= u7max {
i++
if u = append(u, c); c == '&' {
u = append(u, '-')
}
continue
}
start := i
for i++; i < n && (s[i] < u7min || s[i] > u7max); i++ {
// Find the next printable ASCII code point
}
u = append(u, utf7enc(s[start:i])...)
}
return u
}
// utf7enc converts string s from UTF-8 to UTF-16-BE, encodes the result as
// Base64, removes the padding, and adds UTF-7 shifts.
func utf7enc(s []byte) []byte {
// len(s) is sufficient for UTF-8 to UTF-16 conversion if there are no
// control code points (see table below).
b := make([]byte, 0, len(s)+4)
for len(s) > 0 {
r, size := utf8.DecodeRune(s)
if r > utf8.MaxRune {
r, size = utf8.RuneError, 1 // Bug fix (issue 3785)
}
s = s[size:]
if r1, r2 := utf16.EncodeRune(r); r1 != uRepl {
b = append(b, byte(r1>>8), byte(r1))
r = r2
}
b = append(b, byte(r>>8), byte(r))
}
// Encode as Base64
n := u7enc.EncodedLen(len(b)) + 2
b64 := make([]byte, n)
u7enc.Encode(b64[1:], b)
// Strip padding
n -= 2 - (len(b)+2)%3
b64 = b64[:n]
// Add UTF-7 shifts
b64[0] = '&'
b64[n-1] = '-'
return b64
}
// UTF7Decode converts a string from modified UTF-7 encoding to UTF-8.
func UTF7Decode(u string) (s string, err error) {
b, err := UTF7DecodeBytes([]byte(u))
s = string(b)
return
}
// UTF7DecodeBytes converts a byte slice from modified UTF-7 encoding to UTF-8.
func UTF7DecodeBytes(u []byte) (s []byte, err error) {
s = make([]byte, 0, len(u))
ascii := true
for i, n := 0, len(u); i < n; i++ {
if c := u[i]; c < u7min || c > u7max {
return nil, ErrBadUTF7 // Illegal code point in ASCII mode
} else if c != '&' {
s = append(s, c)
ascii = true
continue
}
start := i + 1
// Find the end of the Base64 or "&-" segment
for i++; i < n && u[i] != '-'; i++ {
if u[i] == cr || u[i] == lf { // base64 package ignores CR and LF
return nil, ErrBadUTF7
}
}
if i == n {
return nil, ErrBadUTF7 // Implicit shift ("&...")
} else if i == start {
s = append(s, '&') // Escape sequence "&-"
ascii = true
} else if b := utf7dec(u[start:i]); ascii && len(b) > 0 {
s = append(s, b...) // Control or non-ASCII code points in Base64
ascii = false
} else {
return nil, ErrBadUTF7 // Null shift ("&...-&...-") or bad encoding
}
}
return
}
// utf7dec extracts UTF-16-BE bytes from Base64 data and converts them to UTF-8.
// A nil slice is returned if the encoding is invalid.
func utf7dec(b64 []byte) []byte {
var b []byte
// Allocate a single block of memory large enough to store the Base64 data
// (if padding is required), UTF-16-BE bytes, and decoded UTF-8 bytes.
// Since a 2-byte UTF-16 sequence may expand into a 3-byte UTF-8 sequence,
// double the space allocation for UTF-8.
if n := len(b64); b64[n-1] == '=' {
return nil
} else if n&3 == 0 {
b = make([]byte, u7enc.DecodedLen(n)*3)
} else {
n += 4 - n&3
b = make([]byte, n+u7enc.DecodedLen(n)*3)
copy(b[copy(b, b64):n], []byte("=="))
b64, b = b[:n], b[n:]
}
// Decode Base64 into the first 1/3rd of b
n, err := u7enc.Decode(b, b64)
if err != nil || n&1 == 1 {
return nil
}
// Decode UTF-16-BE into the remaining 2/3rds of b
b, s := b[:n], b[n:]
j := 0
for i := 0; i < n; i += 2 {
r := rune(b[i])<<8 | rune(b[i+1])
if utf16.IsSurrogate(r) {
if i += 2; i == n {
return nil
}
r2 := rune(b[i])<<8 | rune(b[i+1])
if r = utf16.DecodeRune(r, r2); r == uRepl {
return nil
}
} else if u7min <= r && r <= u7max {
return nil
}
j += utf8.EncodeRune(s[j:], r)
}
return s[:j]
}
/*
The following table shows the number of bytes required to encode each code point
in the specified range using UTF-8 and UTF-16 representations:
+-----------------+-------+--------+
| Code points | UTF-8 | UTF-16 |
+-----------------+-------+--------+
| 000000 - 00007F | 1 | 2 |
| 000080 - 0007FF | 2 | 2 |
| 000800 - 00FFFF | 3 | 2 |
| 010000 - 10FFFF | 4 | 4 |
+-----------------+-------+--------+
Source: http://en.wikipedia.org/wiki/Comparison_of_Unicode_encodings
*/