// github.com/go-xe2/third@v1.0.3/golang.org/x/text/encoding/encoding.go

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package encoding defines an interface for character encodings, such as Shift
// JIS and Windows 1252, that can convert to and from UTF-8.
//
// Encoding implementations are provided in other packages, such as
// github.com/go-xe2/third/golang.org/x/text/encoding/charmap and
// github.com/go-xe2/third/golang.org/x/text/encoding/japanese.
package encoding // import "github.com/go-xe2/third/golang.org/x/text/encoding"

import (
	"errors"
	"io"
	"strconv"
	"unicode/utf8"

	"github.com/go-xe2/third/golang.org/x/text/encoding/internal/identifier"
	"github.com/go-xe2/third/golang.org/x/text/transform"
)

// TODO:
// - There seems to be some inconsistency in when decoders return errors
//   and when not. Also documentation seems to suggest they shouldn't return
//   errors at all (except for UTF-16).
// - Encoders seem to rely on or at least benefit from the input being in NFC
//   normal form. Perhaps add an example how users could prepare their output.

// Encoding is a character set encoding that can be transformed to and from
// UTF-8.
type Encoding interface {
	// NewDecoder returns a Decoder, which converts bytes of this encoding
	// to UTF-8.
	NewDecoder() *Decoder

	// NewEncoder returns an Encoder, which converts UTF-8 to bytes of this
	// encoding.
	NewEncoder() *Encoder
}

// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
//
// Transforming source bytes that are not of that encoding will not result in an
// error per se. Each byte that cannot be transcoded will be represented in the
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
45 type Decoder struct { 46 transform.Transformer 47 48 // This forces external creators of Decoders to use names in struct 49 // initializers, allowing for future extendibility without having to break 50 // code. 51 _ struct{} 52 } 53 54 // Bytes converts the given encoded bytes to UTF-8. It returns the converted 55 // bytes or nil, err if any error occurred. 56 func (d *Decoder) Bytes(b []byte) ([]byte, error) { 57 b, _, err := transform.Bytes(d, b) 58 if err != nil { 59 return nil, err 60 } 61 return b, nil 62 } 63 64 // String converts the given encoded string to UTF-8. It returns the converted 65 // string or "", err if any error occurred. 66 func (d *Decoder) String(s string) (string, error) { 67 s, _, err := transform.String(d, s) 68 if err != nil { 69 return "", err 70 } 71 return s, nil 72 } 73 74 // Reader wraps another Reader to decode its bytes. 75 // 76 // The Decoder may not be used for any other operation as long as the returned 77 // Reader is in use. 78 func (d *Decoder) Reader(r io.Reader) io.Reader { 79 return transform.NewReader(r, d) 80 } 81 82 // An Encoder converts bytes from UTF-8. It implements transform.Transformer. 83 // 84 // Each rune that cannot be transcoded will result in an error. In this case, 85 // the transform will consume all source byte up to, not including the offending 86 // rune. Transforming source bytes that are not valid UTF-8 will be replaced by 87 // `\uFFFD`. To return early with an error instead, use transform.Chain to 88 // preprocess the data with a UTF8Validator. 89 type Encoder struct { 90 transform.Transformer 91 92 // This forces external creators of Encoders to use names in struct 93 // initializers, allowing for future extendibility without having to break 94 // code. 95 _ struct{} 96 } 97 98 // Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if 99 // any error occurred. 
100 func (e *Encoder) Bytes(b []byte) ([]byte, error) { 101 b, _, err := transform.Bytes(e, b) 102 if err != nil { 103 return nil, err 104 } 105 return b, nil 106 } 107 108 // String converts a string from UTF-8. It returns the converted string or 109 // "", err if any error occurred. 110 func (e *Encoder) String(s string) (string, error) { 111 s, _, err := transform.String(e, s) 112 if err != nil { 113 return "", err 114 } 115 return s, nil 116 } 117 118 // Writer wraps another Writer to encode its UTF-8 output. 119 // 120 // The Encoder may not be used for any other operation as long as the returned 121 // Writer is in use. 122 func (e *Encoder) Writer(w io.Writer) io.Writer { 123 return transform.NewWriter(w, e) 124 } 125 126 // ASCIISub is the ASCII substitute character, as recommended by 127 // http://unicode.org/reports/tr36/#Text_Comparison 128 const ASCIISub = '\x1a' 129 130 // Nop is the nop encoding. Its transformed bytes are the same as the source 131 // bytes; it does not replace invalid UTF-8 sequences. 132 var Nop Encoding = nop{} 133 134 type nop struct{} 135 136 func (nop) NewDecoder() *Decoder { 137 return &Decoder{Transformer: transform.Nop} 138 } 139 func (nop) NewEncoder() *Encoder { 140 return &Encoder{Transformer: transform.Nop} 141 } 142 143 // Replacement is the replacement encoding. Decoding from the replacement 144 // encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to 145 // the replacement encoding yields the same as the source bytes except that 146 // invalid UTF-8 is converted to '\uFFFD'. 
147 // 148 // It is defined at http://encoding.spec.whatwg.org/#replacement 149 var Replacement Encoding = replacement{} 150 151 type replacement struct{} 152 153 func (replacement) NewDecoder() *Decoder { 154 return &Decoder{Transformer: replacementDecoder{}} 155 } 156 157 func (replacement) NewEncoder() *Encoder { 158 return &Encoder{Transformer: replacementEncoder{}} 159 } 160 161 func (replacement) ID() (mib identifier.MIB, other string) { 162 return identifier.Replacement, "" 163 } 164 165 type replacementDecoder struct{ transform.NopResetter } 166 167 func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 168 if len(dst) < 3 { 169 return 0, 0, transform.ErrShortDst 170 } 171 if atEOF { 172 const fffd = "\ufffd" 173 dst[0] = fffd[0] 174 dst[1] = fffd[1] 175 dst[2] = fffd[2] 176 nDst = 3 177 } 178 return nDst, len(src), nil 179 } 180 181 type replacementEncoder struct{ transform.NopResetter } 182 183 func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 184 r, size := rune(0), 0 185 186 for ; nSrc < len(src); nSrc += size { 187 r = rune(src[nSrc]) 188 189 // Decode a 1-byte rune. 190 if r < utf8.RuneSelf { 191 size = 1 192 193 } else { 194 // Decode a multi-byte rune. 195 r, size = utf8.DecodeRune(src[nSrc:]) 196 if size == 1 { 197 // All valid runes of size 1 (those below utf8.RuneSelf) were 198 // handled above. We have invalid UTF-8 or we haven't seen the 199 // full character yet. 200 if !atEOF && !utf8.FullRune(src[nSrc:]) { 201 err = transform.ErrShortSrc 202 break 203 } 204 r = '\ufffd' 205 } 206 } 207 208 if nDst+utf8.RuneLen(r) > len(dst) { 209 err = transform.ErrShortDst 210 break 211 } 212 nDst += utf8.EncodeRune(dst[nDst:], r) 213 } 214 return nDst, nSrc, err 215 } 216 217 // HTMLEscapeUnsupported wraps encoders to replace source runes outside the 218 // repertoire of the destination encoding with HTML escape sequences. 
219 // 220 // This wrapper exists to comply to URL and HTML forms requiring a 221 // non-terminating legacy encoder. The produced sequences may lead to data 222 // loss as they are indistinguishable from legitimate input. To avoid this 223 // issue, use UTF-8 encodings whenever possible. 224 func HTMLEscapeUnsupported(e *Encoder) *Encoder { 225 return &Encoder{Transformer: &errorHandler{e, errorToHTML}} 226 } 227 228 // ReplaceUnsupported wraps encoders to replace source runes outside the 229 // repertoire of the destination encoding with an encoding-specific 230 // replacement. 231 // 232 // This wrapper is only provided for backwards compatibility and legacy 233 // handling. Its use is strongly discouraged. Use UTF-8 whenever possible. 234 func ReplaceUnsupported(e *Encoder) *Encoder { 235 return &Encoder{Transformer: &errorHandler{e, errorToReplacement}} 236 } 237 238 type errorHandler struct { 239 *Encoder 240 handler func(dst []byte, r rune, err repertoireError) (n int, ok bool) 241 } 242 243 // TODO: consider making this error public in some form. 
244 type repertoireError interface { 245 Replacement() byte 246 } 247 248 func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 249 nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF) 250 for err != nil { 251 rerr, ok := err.(repertoireError) 252 if !ok { 253 return nDst, nSrc, err 254 } 255 r, sz := utf8.DecodeRune(src[nSrc:]) 256 n, ok := h.handler(dst[nDst:], r, rerr) 257 if !ok { 258 return nDst, nSrc, transform.ErrShortDst 259 } 260 err = nil 261 nDst += n 262 if nSrc += sz; nSrc < len(src) { 263 var dn, sn int 264 dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF) 265 nDst += dn 266 nSrc += sn 267 } 268 } 269 return nDst, nSrc, err 270 } 271 272 func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) { 273 buf := [8]byte{} 274 b := strconv.AppendUint(buf[:0], uint64(r), 10) 275 if n = len(b) + len("&#;"); n >= len(dst) { 276 return 0, false 277 } 278 dst[0] = '&' 279 dst[1] = '#' 280 dst[copy(dst[2:], b)+2] = ';' 281 return n, true 282 } 283 284 func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) { 285 if len(dst) == 0 { 286 return 0, false 287 } 288 dst[0] = err.Replacement() 289 return 1, true 290 } 291 292 // ErrInvalidUTF8 means that a transformer encountered invalid UTF-8. 293 var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8") 294 295 // UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first 296 // input byte that is not valid UTF-8. 
297 var UTF8Validator transform.Transformer = utf8Validator{} 298 299 type utf8Validator struct{ transform.NopResetter } 300 301 func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 302 n := len(src) 303 if n > len(dst) { 304 n = len(dst) 305 } 306 for i := 0; i < n; { 307 if c := src[i]; c < utf8.RuneSelf { 308 dst[i] = c 309 i++ 310 continue 311 } 312 _, size := utf8.DecodeRune(src[i:]) 313 if size == 1 { 314 // All valid runes of size 1 (those below utf8.RuneSelf) were 315 // handled above. We have invalid UTF-8 or we haven't seen the 316 // full character yet. 317 err = ErrInvalidUTF8 318 if !atEOF && !utf8.FullRune(src[i:]) { 319 err = transform.ErrShortSrc 320 } 321 return i, i, err 322 } 323 if i+size > len(dst) { 324 return i, i, transform.ErrShortDst 325 } 326 for ; size > 0; size-- { 327 dst[i] = src[i] 328 i++ 329 } 330 } 331 if len(src) > len(dst) { 332 err = transform.ErrShortDst 333 } 334 return n, n, err 335 }