golang.org/x/text@v0.14.0/encoding/simplifiedchinese/gbk.go (about) 1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package simplifiedchinese 6 7 import ( 8 "unicode/utf8" 9 10 "golang.org/x/text/encoding" 11 "golang.org/x/text/encoding/internal" 12 "golang.org/x/text/encoding/internal/identifier" 13 "golang.org/x/text/transform" 14 ) 15 16 var ( 17 // GB18030 is the GB18030 encoding. 18 GB18030 encoding.Encoding = &gbk18030 19 // GBK is the GBK encoding. It encodes an extension of the GB2312 character set 20 // and is also known as Code Page 936. 21 GBK encoding.Encoding = &gbk 22 ) 23 24 var gbk = internal.Encoding{ 25 &internal.SimpleEncoding{ 26 gbkDecoder{gb18030: false}, 27 gbkEncoder{gb18030: false}, 28 }, 29 "GBK", 30 identifier.GBK, 31 } 32 33 var gbk18030 = internal.Encoding{ 34 &internal.SimpleEncoding{ 35 gbkDecoder{gb18030: true}, 36 gbkEncoder{gb18030: true}, 37 }, 38 "GB18030", 39 identifier.GB18030, 40 } 41 42 type gbkDecoder struct { 43 transform.NopResetter 44 gb18030 bool 45 } 46 47 func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 48 r, size := rune(0), 0 49 loop: 50 for ; nSrc < len(src); nSrc += size { 51 switch c0 := src[nSrc]; { 52 case c0 < utf8.RuneSelf: 53 r, size = rune(c0), 1 54 55 // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC 56 // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk 57 // says to treat "gbk" as Code Page 936. 58 // GBK’s decoder is gb18030’s decoder. https://encoding.spec.whatwg.org/#gbk-decoder 59 // If byte is 0x80, return code point U+20AC. https://encoding.spec.whatwg.org/#gb18030-decoder 60 case c0 == 0x80: 61 r, size = '€', 1 62 63 case c0 < 0xff: 64 if nSrc+1 >= len(src) { 65 if !atEOF { 66 err = transform.ErrShortSrc 67 break loop 68 } 69 r, size = utf8.RuneError, 1 70 goto write 71 } 72 c1 := src[nSrc+1] 73 switch { 74 case 0x40 <= c1 && c1 < 0x7f: 75 c1 -= 0x40 76 case 0x80 <= c1 && c1 < 0xff: 77 c1 -= 0x41 78 case d.gb18030 && 0x30 <= c1 && c1 < 0x40: 79 if nSrc+3 >= len(src) { 80 if !atEOF { 81 err = transform.ErrShortSrc 82 break loop 83 } 84 // The second byte here is always ASCII, so we can set size 85 // to 1 in all cases. 86 r, size = utf8.RuneError, 1 87 goto write 88 } 89 c2 := src[nSrc+2] 90 if c2 < 0x81 || 0xff <= c2 { 91 r, size = utf8.RuneError, 1 92 goto write 93 } 94 c3 := src[nSrc+3] 95 if c3 < 0x30 || 0x3a <= c3 { 96 r, size = utf8.RuneError, 1 97 goto write 98 } 99 size = 4 100 r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30) 101 if r < 39420 { 102 i, j := 0, len(gb18030) 103 for i < j { 104 h := i + (j-i)/2 105 if r >= rune(gb18030[h][0]) { 106 i = h + 1 107 } else { 108 j = h 109 } 110 } 111 dec := &gb18030[i-1] 112 r += rune(dec[1]) - rune(dec[0]) 113 goto write 114 } 115 r -= 189000 116 if 0 <= r && r < 0x100000 { 117 r += 0x10000 118 } else { 119 r, size = utf8.RuneError, 1 120 } 121 goto write 122 default: 123 r, size = utf8.RuneError, 1 124 goto write 125 } 126 r, size = '\ufffd', 2 127 if i := int(c0-0x81)*190 + int(c1); i < len(decode) { 128 r = rune(decode[i]) 129 if r == 0 { 130 r = '\ufffd' 131 } 132 } 133 134 default: 135 r, size = utf8.RuneError, 1 136 } 137 138 write: 139 if nDst+utf8.RuneLen(r) > len(dst) { 140 err = transform.ErrShortDst 141 break loop 142 } 143 nDst += utf8.EncodeRune(dst[nDst:], r) 144 } 145 return nDst, nSrc, err 146 } 147 148 type gbkEncoder struct { 149 transform.NopResetter 150 gb18030 bool 151 } 152 153 func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { 154 r, r2, size := rune(0), rune(0), 0 155 for ; nSrc < len(src); nSrc += size { 156 r = rune(src[nSrc]) 157 158 // Decode a 1-byte rune. 159 if r < utf8.RuneSelf { 160 size = 1 161 162 } else { 163 // Decode a multi-byte rune. 164 r, size = utf8.DecodeRune(src[nSrc:]) 165 if size == 1 { 166 // All valid runes of size 1 (those below utf8.RuneSelf) were 167 // handled above. We have invalid UTF-8 or we haven't seen the 168 // full character yet. 169 if !atEOF && !utf8.FullRune(src[nSrc:]) { 170 err = transform.ErrShortSrc 171 break 172 } 173 } 174 175 // func init checks that the switch covers all tables. 176 switch { 177 case encode0Low <= r && r < encode0High: 178 if r2 = rune(encode0[r-encode0Low]); r2 != 0 { 179 goto write2 180 } 181 case encode1Low <= r && r < encode1High: 182 // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC 183 // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk 184 // says to treat "gbk" as Code Page 936. 185 // GBK’s encoder is gb18030’s encoder with its _is GBK_ set to true. https://encoding.spec.whatwg.org/#gbk-encoder 186 // If _is GBK_ is true and code point is U+20AC, return byte 0x80. https://encoding.spec.whatwg.org/#gb18030-encoder 187 if !e.gb18030 && r == '€' { 188 r = 0x80 189 goto write1 190 } 191 if r2 = rune(encode1[r-encode1Low]); r2 != 0 { 192 goto write2 193 } 194 case encode2Low <= r && r < encode2High: 195 if r2 = rune(encode2[r-encode2Low]); r2 != 0 { 196 goto write2 197 } 198 case encode3Low <= r && r < encode3High: 199 if r2 = rune(encode3[r-encode3Low]); r2 != 0 { 200 goto write2 201 } 202 case encode4Low <= r && r < encode4High: 203 if r2 = rune(encode4[r-encode4Low]); r2 != 0 { 204 goto write2 205 } 206 } 207 208 if e.gb18030 { 209 if r < 0x10000 { 210 i, j := 0, len(gb18030) 211 for i < j { 212 h := i + (j-i)/2 213 if r >= rune(gb18030[h][1]) { 214 i = h + 1 215 } else { 216 j = h 217 } 218 } 219 dec := &gb18030[i-1] 220 r += rune(dec[0]) - rune(dec[1]) 221 goto write4 222 } else if r < 0x110000 { 223 r += 189000 - 0x10000 224 goto write4 225 } 226 } 227 err = internal.ErrASCIIReplacement 228 break 229 } 230 231 write1: 232 if nDst >= len(dst) { 233 err = transform.ErrShortDst 234 break 235 } 236 dst[nDst] = uint8(r) 237 nDst++ 238 continue 239 240 write2: 241 if nDst+2 > len(dst) { 242 err = transform.ErrShortDst 243 break 244 } 245 dst[nDst+0] = uint8(r2 >> 8) 246 dst[nDst+1] = uint8(r2) 247 nDst += 2 248 continue 249 250 write4: 251 if nDst+4 > len(dst) { 252 err = transform.ErrShortDst 253 break 254 } 255 dst[nDst+3] = uint8(r%10 + 0x30) 256 r /= 10 257 dst[nDst+2] = uint8(r%126 + 0x81) 258 r /= 126 259 dst[nDst+1] = uint8(r%10 + 0x30) 260 r /= 10 261 dst[nDst+0] = uint8(r + 0x81) 262 nDst += 4 263 continue 264 } 265 return nDst, nSrc, err 266 } 267 268 func init() { 269 // Check that the hard-coded encode switch covers all tables. 270 if numEncodeTables != 5 { 271 panic("bad numEncodeTables") 272 } 273 }