github.com/q45/go@v0.0.0-20151101211701-a4fb8c13db3f/src/mime/encodedword.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package mime
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/base64"
    10  	"errors"
    11  	"fmt"
    12  	"io"
    13  	"strings"
    14  	"sync"
    15  	"unicode"
    16  	"unicode/utf8"
    17  )
    18  
    19  // A WordEncoder is a RFC 2047 encoded-word encoder.
    20  type WordEncoder byte
    21  
    22  const (
    23  	// BEncoding represents Base64 encoding scheme as defined by RFC 2045.
    24  	BEncoding = WordEncoder('b')
    25  	// QEncoding represents the Q-encoding scheme as defined by RFC 2047.
    26  	QEncoding = WordEncoder('q')
    27  )
    28  
    29  var (
    30  	errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
    31  )
    32  
    33  // Encode returns the encoded-word form of s. If s is ASCII without special
    34  // characters, it is returned unchanged. The provided charset is the IANA
    35  // charset name of s. It is case insensitive.
    36  func (e WordEncoder) Encode(charset, s string) string {
    37  	if !needsEncoding(s) {
    38  		return s
    39  	}
    40  	return e.encodeWord(charset, s)
    41  }
    42  
    43  func needsEncoding(s string) bool {
    44  	for _, b := range s {
    45  		if (b < ' ' || b > '~') && b != '\t' {
    46  			return true
    47  		}
    48  	}
    49  	return false
    50  }
    51  
    52  // encodeWord encodes a string into an encoded-word.
    53  func (e WordEncoder) encodeWord(charset, s string) string {
    54  	buf := getBuffer()
    55  	defer putBuffer(buf)
    56  
    57  	e.openWord(buf, charset)
    58  	if e == BEncoding {
    59  		e.bEncode(buf, charset, s)
    60  	} else {
    61  		e.qEncode(buf, charset, s)
    62  	}
    63  	closeWord(buf)
    64  
    65  	return buf.String()
    66  }
    67  
    68  const (
    69  	// The maximum length of an encoded-word is 75 characters.
    70  	// See RFC 2047, section 2.
    71  	maxEncodedWordLen = 75
    72  	// maxContentLen is how much content can be encoded, ignoring the header and
    73  	// 2-byte footer.
    74  	maxContentLen = maxEncodedWordLen - len("=?UTF-8?") - len("?=")
    75  )
    76  
    77  var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
    78  
    79  // bEncode encodes s using base64 encoding and writes it to buf.
    80  func (e WordEncoder) bEncode(buf *bytes.Buffer, charset, s string) {
    81  	w := base64.NewEncoder(base64.StdEncoding, buf)
    82  	// If the charset is not UTF-8 or if the content is short, do not bother
    83  	// splitting the encoded-word.
    84  	if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
    85  		io.WriteString(w, s)
    86  		w.Close()
    87  		return
    88  	}
    89  
    90  	var currentLen, last, runeLen int
    91  	for i := 0; i < len(s); i += runeLen {
    92  		// Multi-byte characters must not be split accross encoded-words.
    93  		// See RFC 2047, section 5.3.
    94  		_, runeLen = utf8.DecodeRuneInString(s[i:])
    95  
    96  		if currentLen+runeLen <= maxBase64Len {
    97  			currentLen += runeLen
    98  		} else {
    99  			io.WriteString(w, s[last:i])
   100  			w.Close()
   101  			e.splitWord(buf, charset)
   102  			last = i
   103  			currentLen = runeLen
   104  		}
   105  	}
   106  	io.WriteString(w, s[last:])
   107  	w.Close()
   108  }
   109  
   110  // qEncode encodes s using Q encoding and writes it to buf. It splits the
   111  // encoded-words when necessary.
   112  func (e WordEncoder) qEncode(buf *bytes.Buffer, charset, s string) {
   113  	// We only split encoded-words when the charset is UTF-8.
   114  	if !isUTF8(charset) {
   115  		writeQString(buf, s)
   116  		return
   117  	}
   118  
   119  	var currentLen, runeLen int
   120  	for i := 0; i < len(s); i += runeLen {
   121  		b := s[i]
   122  		// Multi-byte characters must not be split accross encoded-words.
   123  		// See RFC 2047, section 5.3.
   124  		var encLen int
   125  		if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
   126  			runeLen, encLen = 1, 1
   127  		} else {
   128  			_, runeLen = utf8.DecodeRuneInString(s[i:])
   129  			encLen = 3 * runeLen
   130  		}
   131  
   132  		if currentLen+encLen > maxContentLen {
   133  			e.splitWord(buf, charset)
   134  			currentLen = 0
   135  		}
   136  		writeQString(buf, s[i:i+runeLen])
   137  		currentLen += encLen
   138  	}
   139  }
   140  
   141  // writeQString encodes s using Q encoding and writes it to buf.
   142  func writeQString(buf *bytes.Buffer, s string) {
   143  	for i := 0; i < len(s); i++ {
   144  		switch b := s[i]; {
   145  		case b == ' ':
   146  			buf.WriteByte('_')
   147  		case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
   148  			buf.WriteByte(b)
   149  		default:
   150  			buf.WriteByte('=')
   151  			buf.WriteByte(upperhex[b>>4])
   152  			buf.WriteByte(upperhex[b&0x0f])
   153  		}
   154  	}
   155  }
   156  
   157  // openWord writes the beginning of an encoded-word into buf.
   158  func (e WordEncoder) openWord(buf *bytes.Buffer, charset string) {
   159  	buf.WriteString("=?")
   160  	buf.WriteString(charset)
   161  	buf.WriteByte('?')
   162  	buf.WriteByte(byte(e))
   163  	buf.WriteByte('?')
   164  }
   165  
   166  // closeWord writes the end of an encoded-word into buf.
   167  func closeWord(buf *bytes.Buffer) {
   168  	buf.WriteString("?=")
   169  }
   170  
   171  // splitWord closes the current encoded-word and opens a new one.
   172  func (e WordEncoder) splitWord(buf *bytes.Buffer, charset string) {
   173  	closeWord(buf)
   174  	buf.WriteByte(' ')
   175  	e.openWord(buf, charset)
   176  }
   177  
   178  func isUTF8(charset string) bool {
   179  	return strings.EqualFold(charset, "UTF-8")
   180  }
   181  
// upperhex maps a 4-bit value to its upper-case hexadecimal digit. It is
// indexed by writeQString when a byte is escaped as "=XX".
const upperhex = "0123456789ABCDEF"
   183  
// A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
type WordDecoder struct {
	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// charset into UTF-8.
	// Charsets are always lower-case. utf-8, iso-8859-1 and us-ascii charsets
	// are handled by default.
	// One of the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}
   194  
   195  // Decode decodes an RFC 2047 encoded-word.
   196  func (d *WordDecoder) Decode(word string) (string, error) {
   197  	if !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 {
   198  		return "", errInvalidWord
   199  	}
   200  	word = word[2 : len(word)-2]
   201  
   202  	// split delimits the first 2 fields
   203  	split := strings.IndexByte(word, '?')
   204  	// the field after split must only be one byte
   205  	if word[split+2] != '?' {
   206  		return "", errInvalidWord
   207  	}
   208  
   209  	// split word "UTF-8?q?ascii" into "UTF-8", 'q', and "ascii"
   210  	charset := word[:split]
   211  	encoding := word[split+1]
   212  	text := word[split+3:]
   213  
   214  	content, err := decode(encoding, text)
   215  	if err != nil {
   216  		return "", err
   217  	}
   218  
   219  	buf := getBuffer()
   220  	defer putBuffer(buf)
   221  
   222  	if err := d.convert(buf, charset, content); err != nil {
   223  		return "", err
   224  	}
   225  
   226  	return buf.String(), nil
   227  }
   228  
// DecodeHeader decodes all encoded-words of the given string. Text that is
// not part of a well-formed, decodable encoded-word is copied to the result
// unchanged.
//
// It returns an error only when the content of an encoded-word cannot be
// converted to UTF-8: CharsetReader of d (or reading from the reader it
// returned) fails, or the charset is not one handled by default and
// CharsetReader is nil.
func (d *WordDecoder) DecodeHeader(header string) (string, error) {
	// If there is no encoded-word, returns before creating a buffer.
	i := strings.Index(header, "=?")
	if i == -1 {
		return header, nil
	}

	buf := getBuffer()
	defer putBuffer(buf)

	// Everything before the first "=?" is plain text.
	buf.WriteString(header[:i])
	header = header[i:]

	// betweenWords is true when the previous thing written was a decoded
	// encoded-word, so that white space leading up to the next one is dropped.
	betweenWords := false
	for {
		// Each iteration tries to parse "=?charset?encoding?text?=" from the
		// unconsumed part of header. Any structural mismatch stops the scan;
		// what is left of header is then written verbatim after the loop.
		start := strings.Index(header, "=?")
		if start == -1 {
			break
		}
		cur := start + len("=?")

		i := strings.Index(header[cur:], "?")
		if i == -1 {
			break
		}
		charset := header[cur : cur+i]
		cur += i + len("?")

		// At least the encoding byte, its '?' and the closing "?=" must
		// remain, which also makes the two index expressions below safe.
		if len(header) < cur+len("Q??=") {
			break
		}
		encoding := header[cur]
		cur++

		if header[cur] != '?' {
			break
		}
		cur++

		j := strings.Index(header[cur:], "?=")
		if j == -1 {
			break
		}
		text := header[cur : cur+j]
		end := cur + j + len("?=")

		content, err := decode(encoding, text)
		if err != nil {
			// Not a decodable word: emit everything up to and including this
			// "=?" as plain text and resume scanning right after it.
			betweenWords = false
			buf.WriteString(header[:start+2])
			header = header[start+2:]
			continue
		}

		// Write characters before the encoded-word. White-space and newline
		// characters separating two encoded-words must be deleted.
		if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
			buf.WriteString(header[:start])
		}

		if err := d.convert(buf, charset, content); err != nil {
			return "", err
		}

		header = header[end:]
		betweenWords = true
	}

	// Trailing text after the last decoded word, or the unparsed remainder.
	if len(header) > 0 {
		buf.WriteString(header)
	}

	return buf.String(), nil
}
   305  
   306  func decode(encoding byte, text string) ([]byte, error) {
   307  	switch encoding {
   308  	case 'B', 'b':
   309  		return base64.StdEncoding.DecodeString(text)
   310  	case 'Q', 'q':
   311  		return qDecode(text)
   312  	default:
   313  		return nil, errInvalidWord
   314  	}
   315  }
   316  
   317  func (d *WordDecoder) convert(buf *bytes.Buffer, charset string, content []byte) error {
   318  	switch {
   319  	case strings.EqualFold("utf-8", charset):
   320  		buf.Write(content)
   321  	case strings.EqualFold("iso-8859-1", charset):
   322  		for _, c := range content {
   323  			buf.WriteRune(rune(c))
   324  		}
   325  	case strings.EqualFold("us-ascii", charset):
   326  		for _, c := range content {
   327  			if c >= utf8.RuneSelf {
   328  				buf.WriteRune(unicode.ReplacementChar)
   329  			} else {
   330  				buf.WriteByte(c)
   331  			}
   332  		}
   333  	default:
   334  		if d.CharsetReader == nil {
   335  			return fmt.Errorf("mime: unhandled charset %q", charset)
   336  		}
   337  		r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content))
   338  		if err != nil {
   339  			return err
   340  		}
   341  		if _, err = buf.ReadFrom(r); err != nil {
   342  			return err
   343  		}
   344  	}
   345  	return nil
   346  }
   347  
   348  // hasNonWhitespace reports whether s (assumed to be ASCII) contains at least
   349  // one byte of non-whitespace.
   350  func hasNonWhitespace(s string) bool {
   351  	for _, b := range s {
   352  		switch b {
   353  		// Encoded-words can only be separated by linear white spaces which does
   354  		// not include vertical tabs (\v).
   355  		case ' ', '\t', '\n', '\r':
   356  		default:
   357  			return true
   358  		}
   359  	}
   360  	return false
   361  }
   362  
   363  // qDecode decodes a Q encoded string.
   364  func qDecode(s string) ([]byte, error) {
   365  	dec := make([]byte, len(s))
   366  	n := 0
   367  	for i := 0; i < len(s); i++ {
   368  		switch c := s[i]; {
   369  		case c == '_':
   370  			dec[n] = ' '
   371  		case c == '=':
   372  			if i+2 >= len(s) {
   373  				return nil, errInvalidWord
   374  			}
   375  			b, err := readHexByte(s[i+1], s[i+2])
   376  			if err != nil {
   377  				return nil, err
   378  			}
   379  			dec[n] = b
   380  			i += 2
   381  		case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
   382  			dec[n] = c
   383  		default:
   384  			return nil, errInvalidWord
   385  		}
   386  		n++
   387  	}
   388  
   389  	return dec[:n], nil
   390  }
   391  
   392  // readHexByte returns the byte from its quoted-printable representation.
   393  func readHexByte(a, b byte) (byte, error) {
   394  	var hb, lb byte
   395  	var err error
   396  	if hb, err = fromHex(a); err != nil {
   397  		return 0, err
   398  	}
   399  	if lb, err = fromHex(b); err != nil {
   400  		return 0, err
   401  	}
   402  	return hb<<4 | lb, nil
   403  }
   404  
   405  func fromHex(b byte) (byte, error) {
   406  	switch {
   407  	case b >= '0' && b <= '9':
   408  		return b - '0', nil
   409  	case b >= 'A' && b <= 'F':
   410  		return b - 'A' + 10, nil
   411  	// Accept badly encoded bytes.
   412  	case b >= 'a' && b <= 'f':
   413  		return b - 'a' + 10, nil
   414  	}
   415  	return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
   416  }
   417  
   418  var bufPool = sync.Pool{
   419  	New: func() interface{} {
   420  		return new(bytes.Buffer)
   421  	},
   422  }
   423  
   424  func getBuffer() *bytes.Buffer {
   425  	return bufPool.Get().(*bytes.Buffer)
   426  }
   427  
   428  func putBuffer(buf *bytes.Buffer) {
   429  	if buf.Len() > 1024 {
   430  		return
   431  	}
   432  	buf.Reset()
   433  	bufPool.Put(buf)
   434  }