vitess.io/vitess@v0.16.2/go/mysql/collations/internal/charset/simplifiedchinese/gb18030.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package simplifiedchinese 18 19 import ( 20 "unicode/utf8" 21 _ "unsafe" 22 23 "vitess.io/vitess/go/mysql/collations/internal/charset/types" 24 ) 25 26 type Charset_gb18030 struct{} 27 28 func (Charset_gb18030) Name() string { 29 return "gb18030" 30 } 31 32 func (Charset_gb18030) IsSuperset(other types.Charset) bool { 33 switch other.(type) { 34 case Charset_gb18030: 35 return true 36 default: 37 return false 38 } 39 } 40 41 const isgb18030 = true 42 43 func (Charset_gb18030) EncodeRune(dst []byte, r rune) int { 44 _ = dst[3] 45 46 var r2 rune 47 switch { 48 case r < utf8.RuneSelf: 49 goto write1 50 case encode0Low <= r && r < encode0High: 51 if r2 = rune(encode0[r-encode0Low]); r2 != 0 { 52 goto write2 53 } 54 case encode1Low <= r && r < encode1High: 55 // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC 56 // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk 57 // says to treat "gbk" as Code Page 936. 58 if r == '€' { 59 r = 0x80 60 goto write1 61 } 62 if r2 = rune(encode1[r-encode1Low]); r2 != 0 { 63 goto write2 64 } 65 case encode2Low <= r && r < encode2High: 66 if r2 = rune(encode2[r-encode2Low]); r2 != 0 { 67 goto write2 68 } 69 case encode3Low <= r && r < encode3High: 70 if r2 = rune(encode3[r-encode3Low]); r2 != 0 { 71 goto write2 72 } 73 case encode4Low <= r && r < encode4High: 74 if r2 = rune(encode4[r-encode4Low]); r2 != 0 { 75 goto write2 76 } 77 } 78 79 if isgb18030 { 80 if r < 0x10000 { 81 i, j := 0, len(gb18030) 82 for i < j { 83 h := i + (j-i)/2 84 if r >= rune(gb18030[h][1]) { 85 i = h + 1 86 } else { 87 j = h 88 } 89 } 90 dec := &gb18030[i-1] 91 r += rune(dec[0]) - rune(dec[1]) 92 goto write4 93 } else if r < 0x110000 { 94 r += 189000 - 0x10000 95 goto write4 96 } 97 } 98 return -1 99 100 write1: 101 dst[0] = uint8(r) 102 return 1 103 104 write2: 105 dst[0] = uint8(r2 >> 8) 106 dst[1] = uint8(r2) 107 return 2 108 109 write4: 110 dst[3] = uint8(r%10 + 0x30) 111 r /= 10 112 dst[2] = uint8(r%126 + 0x81) 113 r /= 126 114 dst[1] = uint8(r%10 + 0x30) 115 r /= 10 116 dst[0] = uint8(r + 0x81) 117 return 4 118 } 119 120 func (Charset_gb18030) DecodeRune(src []byte) (rune, int) { 121 if len(src) < 1 { 122 return utf8.RuneError, 0 123 } 124 125 switch c0 := src[0]; { 126 case c0 < utf8.RuneSelf: 127 return rune(c0), 1 128 129 // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC 130 // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk 131 // says to treat "gbk" as Code Page 936. 132 case c0 == 0x80: 133 return '€', 1 134 135 case c0 < 0xff: 136 if len(src) < 2 { 137 return utf8.RuneError, 1 138 } 139 140 c1 := src[1] 141 switch { 142 case 0x40 <= c1 && c1 < 0x7f: 143 c1 -= 0x40 144 case 0x80 <= c1 && c1 < 0xff: 145 c1 -= 0x41 146 case isgb18030 && 0x30 <= c1 && c1 < 0x40: 147 if len(src) < 4 { 148 // The second byte here is always ASCII, so we can set size 149 // to 1 in all cases. 150 return utf8.RuneError, 1 151 } 152 c2 := src[2] 153 if c2 < 0x81 || 0xff <= c2 { 154 return utf8.RuneError, 1 155 } 156 c3 := src[3] 157 if c3 < 0x30 || 0x3a <= c3 { 158 return utf8.RuneError, 1 159 } 160 var r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30) 161 if r < 39420 { 162 i, j := 0, len(gb18030) 163 for i < j { 164 h := i + (j-i)/2 165 if r >= rune(gb18030[h][0]) { 166 i = h + 1 167 } else { 168 j = h 169 } 170 } 171 dec := &gb18030[i-1] 172 r += rune(dec[1]) - rune(dec[0]) 173 return r, 4 174 } 175 r -= 189000 176 if 0 <= r && r < 0x100000 { 177 r += 0x10000 178 } else { 179 return utf8.RuneError, 1 180 } 181 return r, 4 182 default: 183 return utf8.RuneError, 1 184 } 185 r := utf8.RuneError 186 if i := int(c0-0x81)*190 + int(c1); i < len(decode) { 187 r = rune(decode[i]) 188 if r == 0 { 189 r = utf8.RuneError 190 } 191 } 192 return r, 2 193 194 default: 195 return utf8.RuneError, 1 196 } 197 } 198 199 func (c Charset_gb18030) SupportsSupplementaryChars() bool { 200 return false 201 }