golang.org/x/text@v0.14.0/encoding/unicode/utf32/utf32.go (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package utf32 provides the UTF-32 Unicode encoding. 6 // 7 // Please note that support for UTF-32 is discouraged as it is a rare and 8 // inefficient encoding, unfit for use as an interchange format. For use 9 // on the web, the W3C strongly discourages its use 10 // (https://www.w3.org/TR/html5/document-metadata.html#charset) 11 // while WHATWG directly prohibits supporting it 12 // (https://html.spec.whatwg.org/multipage/syntax.html#character-encodings). 13 package utf32 // import "golang.org/x/text/encoding/unicode/utf32" 14 15 import ( 16 "errors" 17 "unicode/utf8" 18 19 "golang.org/x/text/encoding" 20 "golang.org/x/text/encoding/internal/identifier" 21 "golang.org/x/text/transform" 22 ) 23 24 // All lists a configuration for each IANA-defined UTF-32 variant. 25 var All = []encoding.Encoding{ 26 UTF32(BigEndian, UseBOM), 27 UTF32(BigEndian, IgnoreBOM), 28 UTF32(LittleEndian, IgnoreBOM), 29 } 30 31 // ErrMissingBOM means that decoding UTF-32 input with ExpectBOM did not 32 // find a starting byte order mark. 33 var ErrMissingBOM = errors.New("encoding: missing byte order mark") 34 35 // UTF32 returns a UTF-32 Encoding for the given default endianness and 36 // byte order mark (BOM) policy. 37 // 38 // When decoding from UTF-32 to UTF-8, if the BOMPolicy is IgnoreBOM then 39 // neither BOMs U+FEFF nor ill-formed code units 0xFFFE0000 in the input 40 // stream will affect the endianness used for decoding. Instead BOMs will 41 // be output as their standard UTF-8 encoding "\xef\xbb\xbf" while 42 // 0xFFFE0000 code units will be output as "\xef\xbf\xbd", the standard 43 // UTF-8 encoding for the Unicode replacement character. If the BOMPolicy 44 // is UseBOM or ExpectBOM a starting BOM is not written to the UTF-8 45 // output. Instead, it overrides the default endianness e for the remainder 46 // of the transformation. Any subsequent BOMs U+FEFF or ill-formed code 47 // units 0xFFFE0000 will not affect the endianness used, and will instead 48 // be output as their standard UTF-8 (replacement) encodings. For UseBOM, 49 // if there is no starting BOM, it will proceed with the default 50 // Endianness. For ExpectBOM, in that case, the transformation will return 51 // early with an ErrMissingBOM error. 52 // 53 // When encoding from UTF-8 to UTF-32, a BOM will be inserted at the start 54 // of the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM 55 // will not be inserted. The UTF-8 input does not need to contain a BOM. 56 // 57 // There is no concept of a 'native' endianness. If the UTF-32 data is 58 // produced and consumed in a greater context that implies a certain 59 // endianness, use IgnoreBOM. Otherwise, use ExpectBOM and always produce 60 // and consume a BOM. 61 // 62 // In the language of https://www.unicode.org/faq/utf_bom.html#bom10, 63 // IgnoreBOM corresponds to "Where the precise type of the data stream is 64 // known... the BOM should not be used" and ExpectBOM corresponds to "A 65 // particular protocol... may require use of the BOM". 66 func UTF32(e Endianness, b BOMPolicy) encoding.Encoding { 67 return utf32Encoding{config{e, b}, mibValue[e][b&bomMask]} 68 } 69 70 // mibValue maps Endianness and BOMPolicy settings to MIB constants for UTF-32. 71 // Note that some configurations map to the same MIB identifier. 72 var mibValue = map[Endianness][numBOMValues]identifier.MIB{ 73 BigEndian: [numBOMValues]identifier.MIB{ 74 IgnoreBOM: identifier.UTF32BE, 75 UseBOM: identifier.UTF32, 76 }, 77 LittleEndian: [numBOMValues]identifier.MIB{ 78 IgnoreBOM: identifier.UTF32LE, 79 UseBOM: identifier.UTF32, 80 }, 81 // ExpectBOM is not widely used and has no valid MIB identifier. 82 } 83 84 // BOMPolicy is a UTF-32 encodings's byte order mark policy. 85 type BOMPolicy uint8 86 87 const ( 88 writeBOM BOMPolicy = 0x01 89 acceptBOM BOMPolicy = 0x02 90 requireBOM BOMPolicy = 0x04 91 bomMask BOMPolicy = 0x07 92 93 // HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a 94 // map of an array of length 8 of a type that is also used as a key or value 95 // in another map). See golang.org/issue/11354. 96 // TODO: consider changing this value back to 8 if the use of 1.4.* has 97 // been minimized. 98 numBOMValues = 8 + 1 99 100 // IgnoreBOM means to ignore any byte order marks. 101 IgnoreBOM BOMPolicy = 0 102 // Unicode-compliant interpretation for UTF-32BE/LE. 103 104 // UseBOM means that the UTF-32 form may start with a byte order mark, 105 // which will be used to override the default encoding. 106 UseBOM BOMPolicy = writeBOM | acceptBOM 107 // Unicode-compliant interpretation for UTF-32. 108 109 // ExpectBOM means that the UTF-32 form must start with a byte order mark, 110 // which will be used to override the default encoding. 111 ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM 112 // Consistent with BOMPolicy definition in golang.org/x/text/encoding/unicode 113 ) 114 115 // Endianness is a UTF-32 encoding's default endianness. 116 type Endianness bool 117 118 const ( 119 // BigEndian is UTF-32BE. 120 BigEndian Endianness = false 121 // LittleEndian is UTF-32LE. 122 LittleEndian Endianness = true 123 ) 124 125 type config struct { 126 endianness Endianness 127 bomPolicy BOMPolicy 128 } 129 130 type utf32Encoding struct { 131 config 132 mib identifier.MIB 133 } 134 135 func (u utf32Encoding) NewDecoder() *encoding.Decoder { 136 return &encoding.Decoder{Transformer: &utf32Decoder{ 137 initial: u.config, 138 current: u.config, 139 }} 140 } 141 142 func (u utf32Encoding) NewEncoder() *encoding.Encoder { 143 return &encoding.Encoder{Transformer: &utf32Encoder{ 144 endianness: u.endianness, 145 initialBOMPolicy: u.bomPolicy, 146 currentBOMPolicy: u.bomPolicy, 147 }} 148 } 149 150 func (u utf32Encoding) ID() (mib identifier.MIB, other string) { 151 return u.mib, "" 152 } 153 154 func (u utf32Encoding) String() string { 155 e, b := "B", "" 156 if u.endianness == LittleEndian { 157 e = "L" 158 } 159 switch u.bomPolicy { 160 case ExpectBOM: 161 b = "Expect" 162 case UseBOM: 163 b = "Use" 164 case IgnoreBOM: 165 b = "Ignore" 166 } 167 return "UTF-32" + e + "E (" + b + " BOM)" 168 } 169 170 type utf32Decoder struct { 171 initial config 172 current config 173 } 174 175 func (u *utf32Decoder) Reset() { 176 u.current = u.initial 177 } 178 179 func (u *utf32Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 180 if len(src) == 0 { 181 if atEOF && u.current.bomPolicy&requireBOM != 0 { 182 return 0, 0, ErrMissingBOM 183 } 184 return 0, 0, nil 185 } 186 if u.current.bomPolicy&acceptBOM != 0 { 187 if len(src) < 4 { 188 return 0, 0, transform.ErrShortSrc 189 } 190 switch { 191 case src[0] == 0x00 && src[1] == 0x00 && src[2] == 0xfe && src[3] == 0xff: 192 u.current.endianness = BigEndian 193 nSrc = 4 194 case src[0] == 0xff && src[1] == 0xfe && src[2] == 0x00 && src[3] == 0x00: 195 u.current.endianness = LittleEndian 196 nSrc = 4 197 default: 198 if u.current.bomPolicy&requireBOM != 0 { 199 return 0, 0, ErrMissingBOM 200 } 201 } 202 u.current.bomPolicy = IgnoreBOM 203 } 204 205 var r rune 206 var dSize, sSize int 207 for nSrc < len(src) { 208 if nSrc+3 < len(src) { 209 x := uint32(src[nSrc+0])<<24 | uint32(src[nSrc+1])<<16 | 210 uint32(src[nSrc+2])<<8 | uint32(src[nSrc+3]) 211 if u.current.endianness == LittleEndian { 212 x = x>>24 | (x >> 8 & 0x0000FF00) | (x << 8 & 0x00FF0000) | x<<24 213 } 214 r, sSize = rune(x), 4 215 if dSize = utf8.RuneLen(r); dSize < 0 { 216 r, dSize = utf8.RuneError, 3 217 } 218 } else if atEOF { 219 // 1..3 trailing bytes. 220 r, dSize, sSize = utf8.RuneError, 3, len(src)-nSrc 221 } else { 222 err = transform.ErrShortSrc 223 break 224 } 225 if nDst+dSize > len(dst) { 226 err = transform.ErrShortDst 227 break 228 } 229 nDst += utf8.EncodeRune(dst[nDst:], r) 230 nSrc += sSize 231 } 232 return nDst, nSrc, err 233 } 234 235 type utf32Encoder struct { 236 endianness Endianness 237 initialBOMPolicy BOMPolicy 238 currentBOMPolicy BOMPolicy 239 } 240 241 func (u *utf32Encoder) Reset() { 242 u.currentBOMPolicy = u.initialBOMPolicy 243 } 244 245 func (u *utf32Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 246 if u.currentBOMPolicy&writeBOM != 0 { 247 if len(dst) < 4 { 248 return 0, 0, transform.ErrShortDst 249 } 250 dst[0], dst[1], dst[2], dst[3] = 0x00, 0x00, 0xfe, 0xff 251 u.currentBOMPolicy = IgnoreBOM 252 nDst = 4 253 } 254 255 r, size := rune(0), 0 256 for nSrc < len(src) { 257 r = rune(src[nSrc]) 258 259 // Decode a 1-byte rune. 260 if r < utf8.RuneSelf { 261 size = 1 262 263 } else { 264 // Decode a multi-byte rune. 265 r, size = utf8.DecodeRune(src[nSrc:]) 266 if size == 1 { 267 // All valid runes of size 1 (those below utf8.RuneSelf) were 268 // handled above. We have invalid UTF-8 or we haven't seen the 269 // full character yet. 270 if !atEOF && !utf8.FullRune(src[nSrc:]) { 271 err = transform.ErrShortSrc 272 break 273 } 274 } 275 } 276 277 if nDst+4 > len(dst) { 278 err = transform.ErrShortDst 279 break 280 } 281 282 dst[nDst+0] = uint8(r >> 24) 283 dst[nDst+1] = uint8(r >> 16) 284 dst[nDst+2] = uint8(r >> 8) 285 dst[nDst+3] = uint8(r) 286 nDst += 4 287 nSrc += size 288 } 289 290 if u.endianness == LittleEndian { 291 for i := 0; i < nDst; i += 4 { 292 dst[i], dst[i+1], dst[i+2], dst[i+3] = dst[i+3], dst[i+2], dst[i+1], dst[i] 293 } 294 } 295 return nDst, nSrc, err 296 }