vitess.io/vitess@v0.16.2/go/mysql/collations/internal/charset/simplifiedchinese/gb18030.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package simplifiedchinese
    18  
    19  import (
    20  	"unicode/utf8"
    21  	_ "unsafe"
    22  
    23  	"vitess.io/vitess/go/mysql/collations/internal/charset/types"
    24  )
    25  
    26  type Charset_gb18030 struct{}
    27  
    28  func (Charset_gb18030) Name() string {
    29  	return "gb18030"
    30  }
    31  
    32  func (Charset_gb18030) IsSuperset(other types.Charset) bool {
    33  	switch other.(type) {
    34  	case Charset_gb18030:
    35  		return true
    36  	default:
    37  		return false
    38  	}
    39  }
    40  
// isgb18030 gates the four-byte code paths in EncodeRune and DecodeRune.
// With it set to true, both functions handle four-byte sequences in
// addition to the one- and two-byte forms.
const isgb18030 = true
    42  
    43  func (Charset_gb18030) EncodeRune(dst []byte, r rune) int {
    44  	_ = dst[3]
    45  
    46  	var r2 rune
    47  	switch {
    48  	case r < utf8.RuneSelf:
    49  		goto write1
    50  	case encode0Low <= r && r < encode0High:
    51  		if r2 = rune(encode0[r-encode0Low]); r2 != 0 {
    52  			goto write2
    53  		}
    54  	case encode1Low <= r && r < encode1High:
    55  		// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
    56  		// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
    57  		// says to treat "gbk" as Code Page 936.
    58  		if r == '€' {
    59  			r = 0x80
    60  			goto write1
    61  		}
    62  		if r2 = rune(encode1[r-encode1Low]); r2 != 0 {
    63  			goto write2
    64  		}
    65  	case encode2Low <= r && r < encode2High:
    66  		if r2 = rune(encode2[r-encode2Low]); r2 != 0 {
    67  			goto write2
    68  		}
    69  	case encode3Low <= r && r < encode3High:
    70  		if r2 = rune(encode3[r-encode3Low]); r2 != 0 {
    71  			goto write2
    72  		}
    73  	case encode4Low <= r && r < encode4High:
    74  		if r2 = rune(encode4[r-encode4Low]); r2 != 0 {
    75  			goto write2
    76  		}
    77  	}
    78  
    79  	if isgb18030 {
    80  		if r < 0x10000 {
    81  			i, j := 0, len(gb18030)
    82  			for i < j {
    83  				h := i + (j-i)/2
    84  				if r >= rune(gb18030[h][1]) {
    85  					i = h + 1
    86  				} else {
    87  					j = h
    88  				}
    89  			}
    90  			dec := &gb18030[i-1]
    91  			r += rune(dec[0]) - rune(dec[1])
    92  			goto write4
    93  		} else if r < 0x110000 {
    94  			r += 189000 - 0x10000
    95  			goto write4
    96  		}
    97  	}
    98  	return -1
    99  
   100  write1:
   101  	dst[0] = uint8(r)
   102  	return 1
   103  
   104  write2:
   105  	dst[0] = uint8(r2 >> 8)
   106  	dst[1] = uint8(r2)
   107  	return 2
   108  
   109  write4:
   110  	dst[3] = uint8(r%10 + 0x30)
   111  	r /= 10
   112  	dst[2] = uint8(r%126 + 0x81)
   113  	r /= 126
   114  	dst[1] = uint8(r%10 + 0x30)
   115  	r /= 10
   116  	dst[0] = uint8(r + 0x81)
   117  	return 4
   118  }
   119  
   120  func (Charset_gb18030) DecodeRune(src []byte) (rune, int) {
   121  	if len(src) < 1 {
   122  		return utf8.RuneError, 0
   123  	}
   124  
   125  	switch c0 := src[0]; {
   126  	case c0 < utf8.RuneSelf:
   127  		return rune(c0), 1
   128  
   129  	// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
   130  	// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
   131  	// says to treat "gbk" as Code Page 936.
   132  	case c0 == 0x80:
   133  		return '€', 1
   134  
   135  	case c0 < 0xff:
   136  		if len(src) < 2 {
   137  			return utf8.RuneError, 1
   138  		}
   139  
   140  		c1 := src[1]
   141  		switch {
   142  		case 0x40 <= c1 && c1 < 0x7f:
   143  			c1 -= 0x40
   144  		case 0x80 <= c1 && c1 < 0xff:
   145  			c1 -= 0x41
   146  		case isgb18030 && 0x30 <= c1 && c1 < 0x40:
   147  			if len(src) < 4 {
   148  				// The second byte here is always ASCII, so we can set size
   149  				// to 1 in all cases.
   150  				return utf8.RuneError, 1
   151  			}
   152  			c2 := src[2]
   153  			if c2 < 0x81 || 0xff <= c2 {
   154  				return utf8.RuneError, 1
   155  			}
   156  			c3 := src[3]
   157  			if c3 < 0x30 || 0x3a <= c3 {
   158  				return utf8.RuneError, 1
   159  			}
   160  			var r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30)
   161  			if r < 39420 {
   162  				i, j := 0, len(gb18030)
   163  				for i < j {
   164  					h := i + (j-i)/2
   165  					if r >= rune(gb18030[h][0]) {
   166  						i = h + 1
   167  					} else {
   168  						j = h
   169  					}
   170  				}
   171  				dec := &gb18030[i-1]
   172  				r += rune(dec[1]) - rune(dec[0])
   173  				return r, 4
   174  			}
   175  			r -= 189000
   176  			if 0 <= r && r < 0x100000 {
   177  				r += 0x10000
   178  			} else {
   179  				return utf8.RuneError, 1
   180  			}
   181  			return r, 4
   182  		default:
   183  			return utf8.RuneError, 1
   184  		}
   185  		r := utf8.RuneError
   186  		if i := int(c0-0x81)*190 + int(c1); i < len(decode) {
   187  			r = rune(decode[i])
   188  			if r == 0 {
   189  				r = utf8.RuneError
   190  			}
   191  		}
   192  		return r, 2
   193  
   194  	default:
   195  		return utf8.RuneError, 1
   196  	}
   197  }
   198  
   199  func (c Charset_gb18030) SupportsSupplementaryChars() bool {
   200  	return false
   201  }