github.com/zebozhuang/go@v0.0.0-20200207033046-f8a98f6f5c5d/src/mime/encodedword.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package mime 6 7 import ( 8 "bytes" 9 "encoding/base64" 10 "errors" 11 "fmt" 12 "io" 13 "strings" 14 "sync" 15 "unicode" 16 "unicode/utf8" 17 ) 18 19 // A WordEncoder is an RFC 2047 encoded-word encoder. 20 type WordEncoder byte 21 22 const ( 23 // BEncoding represents Base64 encoding scheme as defined by RFC 2045. 24 BEncoding = WordEncoder('b') 25 // QEncoding represents the Q-encoding scheme as defined by RFC 2047. 26 QEncoding = WordEncoder('q') 27 ) 28 29 var ( 30 errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word") 31 ) 32 33 // Encode returns the encoded-word form of s. If s is ASCII without special 34 // characters, it is returned unchanged. The provided charset is the IANA 35 // charset name of s. It is case insensitive. 36 func (e WordEncoder) Encode(charset, s string) string { 37 if !needsEncoding(s) { 38 return s 39 } 40 return e.encodeWord(charset, s) 41 } 42 43 func needsEncoding(s string) bool { 44 for _, b := range s { 45 if (b < ' ' || b > '~') && b != '\t' { 46 return true 47 } 48 } 49 return false 50 } 51 52 // encodeWord encodes a string into an encoded-word. 53 func (e WordEncoder) encodeWord(charset, s string) string { 54 buf := getBuffer() 55 defer putBuffer(buf) 56 57 e.openWord(buf, charset) 58 if e == BEncoding { 59 e.bEncode(buf, charset, s) 60 } else { 61 e.qEncode(buf, charset, s) 62 } 63 closeWord(buf) 64 65 return buf.String() 66 } 67 68 const ( 69 // The maximum length of an encoded-word is 75 characters. 70 // See RFC 2047, section 2. 71 maxEncodedWordLen = 75 72 // maxContentLen is how much content can be encoded, ignoring the header and 73 // 2-byte footer. 74 maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=") 75 ) 76 77 var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen) 78 79 // bEncode encodes s using base64 encoding and writes it to buf. 80 func (e WordEncoder) bEncode(buf *bytes.Buffer, charset, s string) { 81 w := base64.NewEncoder(base64.StdEncoding, buf) 82 // If the charset is not UTF-8 or if the content is short, do not bother 83 // splitting the encoded-word. 84 if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen { 85 io.WriteString(w, s) 86 w.Close() 87 return 88 } 89 90 var currentLen, last, runeLen int 91 for i := 0; i < len(s); i += runeLen { 92 // Multi-byte characters must not be split across encoded-words. 93 // See RFC 2047, section 5.3. 94 _, runeLen = utf8.DecodeRuneInString(s[i:]) 95 96 if currentLen+runeLen <= maxBase64Len { 97 currentLen += runeLen 98 } else { 99 io.WriteString(w, s[last:i]) 100 w.Close() 101 e.splitWord(buf, charset) 102 last = i 103 currentLen = runeLen 104 } 105 } 106 io.WriteString(w, s[last:]) 107 w.Close() 108 } 109 110 // qEncode encodes s using Q encoding and writes it to buf. It splits the 111 // encoded-words when necessary. 112 func (e WordEncoder) qEncode(buf *bytes.Buffer, charset, s string) { 113 // We only split encoded-words when the charset is UTF-8. 114 if !isUTF8(charset) { 115 writeQString(buf, s) 116 return 117 } 118 119 var currentLen, runeLen int 120 for i := 0; i < len(s); i += runeLen { 121 b := s[i] 122 // Multi-byte characters must not be split across encoded-words. 123 // See RFC 2047, section 5.3. 124 var encLen int 125 if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' { 126 runeLen, encLen = 1, 1 127 } else { 128 _, runeLen = utf8.DecodeRuneInString(s[i:]) 129 encLen = 3 * runeLen 130 } 131 132 if currentLen+encLen > maxContentLen { 133 e.splitWord(buf, charset) 134 currentLen = 0 135 } 136 writeQString(buf, s[i:i+runeLen]) 137 currentLen += encLen 138 } 139 } 140 141 // writeQString encodes s using Q encoding and writes it to buf. 142 func writeQString(buf *bytes.Buffer, s string) { 143 for i := 0; i < len(s); i++ { 144 switch b := s[i]; { 145 case b == ' ': 146 buf.WriteByte('_') 147 case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_': 148 buf.WriteByte(b) 149 default: 150 buf.WriteByte('=') 151 buf.WriteByte(upperhex[b>>4]) 152 buf.WriteByte(upperhex[b&0x0f]) 153 } 154 } 155 } 156 157 // openWord writes the beginning of an encoded-word into buf. 158 func (e WordEncoder) openWord(buf *bytes.Buffer, charset string) { 159 buf.WriteString("=?") 160 buf.WriteString(charset) 161 buf.WriteByte('?') 162 buf.WriteByte(byte(e)) 163 buf.WriteByte('?') 164 } 165 166 // closeWord writes the end of an encoded-word into buf. 167 func closeWord(buf *bytes.Buffer) { 168 buf.WriteString("?=") 169 } 170 171 // splitWord closes the current encoded-word and opens a new one. 172 func (e WordEncoder) splitWord(buf *bytes.Buffer, charset string) { 173 closeWord(buf) 174 buf.WriteByte(' ') 175 e.openWord(buf, charset) 176 } 177 178 func isUTF8(charset string) bool { 179 return strings.EqualFold(charset, "UTF-8") 180 } 181 182 const upperhex = "0123456789ABCDEF" 183 184 // A WordDecoder decodes MIME headers containing RFC 2047 encoded-words. 185 type WordDecoder struct { 186 // CharsetReader, if non-nil, defines a function to generate 187 // charset-conversion readers, converting from the provided 188 // charset into UTF-8. 189 // Charsets are always lower-case. utf-8, iso-8859-1 and us-ascii charsets 190 // are handled by default. 191 // One of the CharsetReader's result values must be non-nil. 192 CharsetReader func(charset string, input io.Reader) (io.Reader, error) 193 } 194 195 // Decode decodes an RFC 2047 encoded-word. 196 func (d *WordDecoder) Decode(word string) (string, error) { 197 // See https://tools.ietf.org/html/rfc2047#section-2 for details. 198 // Our decoder is permissive, we accept empty encoded-text. 199 if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 { 200 return "", errInvalidWord 201 } 202 word = word[2 : len(word)-2] 203 204 // split delimits the first 2 fields 205 split := strings.IndexByte(word, '?') 206 207 // split word "UTF-8?q?ascii" into "UTF-8", 'q', and "ascii" 208 charset := word[:split] 209 if len(charset) == 0 { 210 return "", errInvalidWord 211 } 212 if len(word) < split+3 { 213 return "", errInvalidWord 214 } 215 encoding := word[split+1] 216 // the field after split must only be one byte 217 if word[split+2] != '?' { 218 return "", errInvalidWord 219 } 220 text := word[split+3:] 221 222 content, err := decode(encoding, text) 223 if err != nil { 224 return "", err 225 } 226 227 buf := getBuffer() 228 defer putBuffer(buf) 229 230 if err := d.convert(buf, charset, content); err != nil { 231 return "", err 232 } 233 234 return buf.String(), nil 235 } 236 237 // DecodeHeader decodes all encoded-words of the given string. It returns an 238 // error if and only if CharsetReader of d returns an error. 239 func (d *WordDecoder) DecodeHeader(header string) (string, error) { 240 // If there is no encoded-word, returns before creating a buffer. 241 i := strings.Index(header, "=?") 242 if i == -1 { 243 return header, nil 244 } 245 246 buf := getBuffer() 247 defer putBuffer(buf) 248 249 buf.WriteString(header[:i]) 250 header = header[i:] 251 252 betweenWords := false 253 for { 254 start := strings.Index(header, "=?") 255 if start == -1 { 256 break 257 } 258 cur := start + len("=?") 259 260 i := strings.Index(header[cur:], "?") 261 if i == -1 { 262 break 263 } 264 charset := header[cur : cur+i] 265 cur += i + len("?") 266 267 if len(header) < cur+len("Q??=") { 268 break 269 } 270 encoding := header[cur] 271 cur++ 272 273 if header[cur] != '?' { 274 break 275 } 276 cur++ 277 278 j := strings.Index(header[cur:], "?=") 279 if j == -1 { 280 break 281 } 282 text := header[cur : cur+j] 283 end := cur + j + len("?=") 284 285 content, err := decode(encoding, text) 286 if err != nil { 287 betweenWords = false 288 buf.WriteString(header[:start+2]) 289 header = header[start+2:] 290 continue 291 } 292 293 // Write characters before the encoded-word. White-space and newline 294 // characters separating two encoded-words must be deleted. 295 if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) { 296 buf.WriteString(header[:start]) 297 } 298 299 if err := d.convert(buf, charset, content); err != nil { 300 return "", err 301 } 302 303 header = header[end:] 304 betweenWords = true 305 } 306 307 if len(header) > 0 { 308 buf.WriteString(header) 309 } 310 311 return buf.String(), nil 312 } 313 314 func decode(encoding byte, text string) ([]byte, error) { 315 switch encoding { 316 case 'B', 'b': 317 return base64.StdEncoding.DecodeString(text) 318 case 'Q', 'q': 319 return qDecode(text) 320 default: 321 return nil, errInvalidWord 322 } 323 } 324 325 func (d *WordDecoder) convert(buf *bytes.Buffer, charset string, content []byte) error { 326 switch { 327 case strings.EqualFold("utf-8", charset): 328 buf.Write(content) 329 case strings.EqualFold("iso-8859-1", charset): 330 for _, c := range content { 331 buf.WriteRune(rune(c)) 332 } 333 case strings.EqualFold("us-ascii", charset): 334 for _, c := range content { 335 if c >= utf8.RuneSelf { 336 buf.WriteRune(unicode.ReplacementChar) 337 } else { 338 buf.WriteByte(c) 339 } 340 } 341 default: 342 if d.CharsetReader == nil { 343 return fmt.Errorf("mime: unhandled charset %q", charset) 344 } 345 r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content)) 346 if err != nil { 347 return err 348 } 349 if _, err = buf.ReadFrom(r); err != nil { 350 return err 351 } 352 } 353 return nil 354 } 355 356 // hasNonWhitespace reports whether s (assumed to be ASCII) contains at least 357 // one byte of non-whitespace. 358 func hasNonWhitespace(s string) bool { 359 for _, b := range s { 360 switch b { 361 // Encoded-words can only be separated by linear white spaces which does 362 // not include vertical tabs (\v). 363 case ' ', '\t', '\n', '\r': 364 default: 365 return true 366 } 367 } 368 return false 369 } 370 371 // qDecode decodes a Q encoded string. 372 func qDecode(s string) ([]byte, error) { 373 dec := make([]byte, len(s)) 374 n := 0 375 for i := 0; i < len(s); i++ { 376 switch c := s[i]; { 377 case c == '_': 378 dec[n] = ' ' 379 case c == '=': 380 if i+2 >= len(s) { 381 return nil, errInvalidWord 382 } 383 b, err := readHexByte(s[i+1], s[i+2]) 384 if err != nil { 385 return nil, err 386 } 387 dec[n] = b 388 i += 2 389 case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t': 390 dec[n] = c 391 default: 392 return nil, errInvalidWord 393 } 394 n++ 395 } 396 397 return dec[:n], nil 398 } 399 400 // readHexByte returns the byte from its quoted-printable representation. 401 func readHexByte(a, b byte) (byte, error) { 402 var hb, lb byte 403 var err error 404 if hb, err = fromHex(a); err != nil { 405 return 0, err 406 } 407 if lb, err = fromHex(b); err != nil { 408 return 0, err 409 } 410 return hb<<4 | lb, nil 411 } 412 413 func fromHex(b byte) (byte, error) { 414 switch { 415 case b >= '0' && b <= '9': 416 return b - '0', nil 417 case b >= 'A' && b <= 'F': 418 return b - 'A' + 10, nil 419 // Accept badly encoded bytes. 420 case b >= 'a' && b <= 'f': 421 return b - 'a' + 10, nil 422 } 423 return 0, fmt.Errorf("mime: invalid hex byte %#02x", b) 424 } 425 426 var bufPool = sync.Pool{ 427 New: func() interface{} { 428 return new(bytes.Buffer) 429 }, 430 } 431 432 func getBuffer() *bytes.Buffer { 433 return bufPool.Get().(*bytes.Buffer) 434 } 435 436 func putBuffer(buf *bytes.Buffer) { 437 if buf.Len() > 1024 { 438 return 439 } 440 buf.Reset() 441 bufPool.Put(buf) 442 }