github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/text/encoding/simplifiedchinese/gbk.go (about) 1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package simplifiedchinese 6 7 import ( 8 "errors" 9 "unicode/utf8" 10 11 "golang.org/x/text/encoding" 12 "golang.org/x/text/encoding/internal" 13 "golang.org/x/text/encoding/internal/identifier" 14 "golang.org/x/text/transform" 15 ) 16 17 var ( 18 // GB18030 is the GB18030 encoding. 19 GB18030 encoding.Encoding = &gbk18030 20 // GBK is the GBK encoding. It encodes an extension of the GB2312 character set 21 // and is also known as Code Page 936. 22 GBK encoding.Encoding = &gbk 23 ) 24 25 var gbk = internal.Encoding{ 26 &internal.SimpleEncoding{ 27 gbkDecoder{gb18030: false}, 28 gbkEncoder{gb18030: false}, 29 }, 30 "GBK", 31 identifier.GBK, 32 } 33 34 var gbk18030 = internal.Encoding{ 35 &internal.SimpleEncoding{ 36 gbkDecoder{gb18030: true}, 37 gbkEncoder{gb18030: true}, 38 }, 39 "GB18030", 40 identifier.GB18030, 41 } 42 43 var ( 44 errInvalidGB18030 = errors.New("simplifiedchinese: invalid GB18030 encoding") 45 errInvalidGBK = errors.New("simplifiedchinese: invalid GBK encoding") 46 ) 47 48 type gbkDecoder struct { 49 transform.NopResetter 50 gb18030 bool 51 } 52 53 func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 54 r, size := rune(0), 0 55 loop: 56 for ; nSrc < len(src); nSrc += size { 57 switch c0 := src[nSrc]; { 58 case c0 < utf8.RuneSelf: 59 r, size = rune(c0), 1 60 61 // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC 62 // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk 63 // says to treat "gbk" as Code Page 936. 64 case c0 == 0x80: 65 r, size = '€', 1 66 67 case c0 < 0xff: 68 if nSrc+1 >= len(src) { 69 err = transform.ErrShortSrc 70 break loop 71 } 72 c1 := src[nSrc+1] 73 switch { 74 case 0x40 <= c1 && c1 < 0x7f: 75 c1 -= 0x40 76 case 0x80 <= c1 && c1 < 0xff: 77 c1 -= 0x41 78 case d.gb18030 && 0x30 <= c1 && c1 < 0x40: 79 if nSrc+3 >= len(src) { 80 err = transform.ErrShortSrc 81 break loop 82 } 83 c2 := src[nSrc+2] 84 if c2 < 0x81 || 0xff <= c2 { 85 err = errInvalidGB18030 86 break loop 87 } 88 c3 := src[nSrc+3] 89 if c3 < 0x30 || 0x3a <= c3 { 90 err = errInvalidGB18030 91 break loop 92 } 93 size = 4 94 r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30) 95 if r < 39420 { 96 i, j := 0, len(gb18030) 97 for i < j { 98 h := i + (j-i)/2 99 if r >= rune(gb18030[h][0]) { 100 i = h + 1 101 } else { 102 j = h 103 } 104 } 105 dec := &gb18030[i-1] 106 r += rune(dec[1]) - rune(dec[0]) 107 goto write 108 } 109 r -= 189000 110 if 0 <= r && r < 0x100000 { 111 r += 0x10000 112 goto write 113 } 114 err = errInvalidGB18030 115 break loop 116 default: 117 if d.gb18030 { 118 err = errInvalidGB18030 119 } else { 120 err = errInvalidGBK 121 } 122 break loop 123 } 124 r, size = '\ufffd', 2 125 if i := int(c0-0x81)*190 + int(c1); i < len(decode) { 126 r = rune(decode[i]) 127 if r == 0 { 128 r = '\ufffd' 129 } 130 } 131 132 default: 133 if d.gb18030 { 134 err = errInvalidGB18030 135 } else { 136 err = errInvalidGBK 137 } 138 break loop 139 } 140 141 write: 142 if nDst+utf8.RuneLen(r) > len(dst) { 143 err = transform.ErrShortDst 144 break loop 145 } 146 nDst += utf8.EncodeRune(dst[nDst:], r) 147 } 148 if atEOF && err == transform.ErrShortSrc { 149 if d.gb18030 { 150 err = errInvalidGB18030 151 } else { 152 err = errInvalidGBK 153 } 154 } 155 return nDst, nSrc, err 156 } 157 158 type gbkEncoder struct { 159 transform.NopResetter 160 gb18030 bool 161 } 162 163 func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 164 r, r2, size := rune(0), rune(0), 0 165 for ; nSrc < len(src); nSrc += size { 166 r = rune(src[nSrc]) 167 168 // Decode a 1-byte rune. 169 if r < utf8.RuneSelf { 170 size = 1 171 172 } else { 173 // Decode a multi-byte rune. 174 r, size = utf8.DecodeRune(src[nSrc:]) 175 if size == 1 { 176 // All valid runes of size 1 (those below utf8.RuneSelf) were 177 // handled above. We have invalid UTF-8 or we haven't seen the 178 // full character yet. 179 if !atEOF && !utf8.FullRune(src[nSrc:]) { 180 err = transform.ErrShortSrc 181 break 182 } 183 } 184 185 // func init checks that the switch covers all tables. 186 switch { 187 case encode0Low <= r && r < encode0High: 188 if r2 = rune(encode0[r-encode0Low]); r2 != 0 { 189 goto write2 190 } 191 case encode1Low <= r && r < encode1High: 192 // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC 193 // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk 194 // says to treat "gbk" as Code Page 936. 195 if r == '€' { 196 r = 0x80 197 goto write1 198 } 199 if r2 = rune(encode1[r-encode1Low]); r2 != 0 { 200 goto write2 201 } 202 case encode2Low <= r && r < encode2High: 203 if r2 = rune(encode2[r-encode2Low]); r2 != 0 { 204 goto write2 205 } 206 case encode3Low <= r && r < encode3High: 207 if r2 = rune(encode3[r-encode3Low]); r2 != 0 { 208 goto write2 209 } 210 case encode4Low <= r && r < encode4High: 211 if r2 = rune(encode4[r-encode4Low]); r2 != 0 { 212 goto write2 213 } 214 } 215 216 if e.gb18030 { 217 if r < 0x10000 { 218 i, j := 0, len(gb18030) 219 for i < j { 220 h := i + (j-i)/2 221 if r >= rune(gb18030[h][1]) { 222 i = h + 1 223 } else { 224 j = h 225 } 226 } 227 dec := &gb18030[i-1] 228 r += rune(dec[0]) - rune(dec[1]) 229 goto write4 230 } else if r < 0x110000 { 231 r += 189000 - 0x10000 232 goto write4 233 } 234 } 235 err = internal.ErrASCIIReplacement 236 break 237 } 238 239 write1: 240 if nDst >= len(dst) { 241 err = transform.ErrShortDst 242 break 243 } 244 dst[nDst] = uint8(r) 245 nDst++ 246 continue 247 248 write2: 249 if nDst+2 > len(dst) { 250 err = transform.ErrShortDst 251 break 252 } 253 dst[nDst+0] = uint8(r2 >> 8) 254 dst[nDst+1] = uint8(r2) 255 nDst += 2 256 continue 257 258 write4: 259 if nDst+4 > len(dst) { 260 err = transform.ErrShortDst 261 break 262 } 263 dst[nDst+3] = uint8(r%10 + 0x30) 264 r /= 10 265 dst[nDst+2] = uint8(r%126 + 0x81) 266 r /= 126 267 dst[nDst+1] = uint8(r%10 + 0x30) 268 r /= 10 269 dst[nDst+0] = uint8(r + 0x81) 270 nDst += 4 271 continue 272 } 273 return nDst, nSrc, err 274 } 275 276 func init() { 277 // Check that the hard-coded encode switch covers all tables. 278 if numEncodeTables != 5 { 279 panic("bad numEncodeTables") 280 } 281 }