github.com/go-xe2/third@v1.0.3/golang.org/x/text/encoding/simplifiedchinese/gbk.go (about) 1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package simplifiedchinese 6 7 import ( 8 "unicode/utf8" 9 10 "github.com/go-xe2/third/golang.org/x/text/encoding" 11 "github.com/go-xe2/third/golang.org/x/text/encoding/internal" 12 "github.com/go-xe2/third/golang.org/x/text/encoding/internal/identifier" 13 "github.com/go-xe2/third/golang.org/x/text/transform" 14 ) 15 16 var ( 17 // GB18030 is the GB18030 encoding. 18 GB18030 encoding.Encoding = &gbk18030 19 // GBK is the GBK encoding. It encodes an extension of the GB2312 character set 20 // and is also known as Code Page 936. 21 GBK encoding.Encoding = &gbk 22 ) 23 24 var gbk = internal.Encoding{ 25 &internal.SimpleEncoding{ 26 gbkDecoder{gb18030: false}, 27 gbkEncoder{gb18030: false}, 28 }, 29 "GBK", 30 identifier.GBK, 31 } 32 33 var gbk18030 = internal.Encoding{ 34 &internal.SimpleEncoding{ 35 gbkDecoder{gb18030: true}, 36 gbkEncoder{gb18030: true}, 37 }, 38 "GB18030", 39 identifier.GB18030, 40 } 41 42 type gbkDecoder struct { 43 transform.NopResetter 44 gb18030 bool 45 } 46 47 func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 48 r, size := rune(0), 0 49 loop: 50 for ; nSrc < len(src); nSrc += size { 51 switch c0 := src[nSrc]; { 52 case c0 < utf8.RuneSelf: 53 r, size = rune(c0), 1 54 55 // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC 56 // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk 57 // says to treat "gbk" as Code Page 936. 58 case c0 == 0x80: 59 r, size = '€', 1 60 61 case c0 < 0xff: 62 if nSrc+1 >= len(src) { 63 if !atEOF { 64 err = transform.ErrShortSrc 65 break loop 66 } 67 r, size = utf8.RuneError, 1 68 goto write 69 } 70 c1 := src[nSrc+1] 71 switch { 72 case 0x40 <= c1 && c1 < 0x7f: 73 c1 -= 0x40 74 case 0x80 <= c1 && c1 < 0xff: 75 c1 -= 0x41 76 case d.gb18030 && 0x30 <= c1 && c1 < 0x40: 77 if nSrc+3 >= len(src) { 78 if !atEOF { 79 err = transform.ErrShortSrc 80 break loop 81 } 82 // The second byte here is always ASCII, so we can set size 83 // to 1 in all cases. 84 r, size = utf8.RuneError, 1 85 goto write 86 } 87 c2 := src[nSrc+2] 88 if c2 < 0x81 || 0xff <= c2 { 89 r, size = utf8.RuneError, 1 90 goto write 91 } 92 c3 := src[nSrc+3] 93 if c3 < 0x30 || 0x3a <= c3 { 94 r, size = utf8.RuneError, 1 95 goto write 96 } 97 size = 4 98 r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30) 99 if r < 39420 { 100 i, j := 0, len(gb18030) 101 for i < j { 102 h := i + (j-i)/2 103 if r >= rune(gb18030[h][0]) { 104 i = h + 1 105 } else { 106 j = h 107 } 108 } 109 dec := &gb18030[i-1] 110 r += rune(dec[1]) - rune(dec[0]) 111 goto write 112 } 113 r -= 189000 114 if 0 <= r && r < 0x100000 { 115 r += 0x10000 116 } else { 117 r, size = utf8.RuneError, 1 118 } 119 goto write 120 default: 121 r, size = utf8.RuneError, 1 122 goto write 123 } 124 r, size = '\ufffd', 2 125 if i := int(c0-0x81)*190 + int(c1); i < len(decode) { 126 r = rune(decode[i]) 127 if r == 0 { 128 r = '\ufffd' 129 } 130 } 131 132 default: 133 r, size = utf8.RuneError, 1 134 } 135 136 write: 137 if nDst+utf8.RuneLen(r) > len(dst) { 138 err = transform.ErrShortDst 139 break loop 140 } 141 nDst += utf8.EncodeRune(dst[nDst:], r) 142 } 143 return nDst, nSrc, err 144 } 145 146 type gbkEncoder struct { 147 transform.NopResetter 148 gb18030 bool 149 } 150 151 func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 152 r, r2, size := rune(0), rune(0), 0 153 for ; nSrc < len(src); nSrc += size { 154 r = rune(src[nSrc]) 155 156 // Decode a 1-byte rune. 157 if r < utf8.RuneSelf { 158 size = 1 159 160 } else { 161 // Decode a multi-byte rune. 162 r, size = utf8.DecodeRune(src[nSrc:]) 163 if size == 1 { 164 // All valid runes of size 1 (those below utf8.RuneSelf) were 165 // handled above. We have invalid UTF-8 or we haven't seen the 166 // full character yet. 167 if !atEOF && !utf8.FullRune(src[nSrc:]) { 168 err = transform.ErrShortSrc 169 break 170 } 171 } 172 173 // func init checks that the switch covers all tables. 174 switch { 175 case encode0Low <= r && r < encode0High: 176 if r2 = rune(encode0[r-encode0Low]); r2 != 0 { 177 goto write2 178 } 179 case encode1Low <= r && r < encode1High: 180 // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC 181 // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk 182 // says to treat "gbk" as Code Page 936. 183 if r == '€' { 184 r = 0x80 185 goto write1 186 } 187 if r2 = rune(encode1[r-encode1Low]); r2 != 0 { 188 goto write2 189 } 190 case encode2Low <= r && r < encode2High: 191 if r2 = rune(encode2[r-encode2Low]); r2 != 0 { 192 goto write2 193 } 194 case encode3Low <= r && r < encode3High: 195 if r2 = rune(encode3[r-encode3Low]); r2 != 0 { 196 goto write2 197 } 198 case encode4Low <= r && r < encode4High: 199 if r2 = rune(encode4[r-encode4Low]); r2 != 0 { 200 goto write2 201 } 202 } 203 204 if e.gb18030 { 205 if r < 0x10000 { 206 i, j := 0, len(gb18030) 207 for i < j { 208 h := i + (j-i)/2 209 if r >= rune(gb18030[h][1]) { 210 i = h + 1 211 } else { 212 j = h 213 } 214 } 215 dec := &gb18030[i-1] 216 r += rune(dec[0]) - rune(dec[1]) 217 goto write4 218 } else if r < 0x110000 { 219 r += 189000 - 0x10000 220 goto write4 221 } 222 } 223 err = internal.ErrASCIIReplacement 224 break 225 } 226 227 write1: 228 if nDst >= len(dst) { 229 err = transform.ErrShortDst 230 break 231 } 232 dst[nDst] = uint8(r) 233 nDst++ 234 continue 235 236 write2: 237 if nDst+2 > len(dst) { 238 err = transform.ErrShortDst 239 break 240 } 241 dst[nDst+0] = uint8(r2 >> 8) 242 dst[nDst+1] = uint8(r2) 243 nDst += 2 244 continue 245 246 write4: 247 if nDst+4 > len(dst) { 248 err = transform.ErrShortDst 249 break 250 } 251 dst[nDst+3] = uint8(r%10 + 0x30) 252 r /= 10 253 dst[nDst+2] = uint8(r%126 + 0x81) 254 r /= 126 255 dst[nDst+1] = uint8(r%10 + 0x30) 256 r /= 10 257 dst[nDst+0] = uint8(r + 0x81) 258 nDst += 4 259 continue 260 } 261 return nDst, nSrc, err 262 } 263 264 func init() { 265 // Check that the hard-coded encode switch covers all tables. 266 if numEncodeTables != 5 { 267 panic("bad numEncodeTables") 268 } 269 }