vitess.io/vitess@v0.16.2/go/mysql/collations/internal/charset/unicode/utf8.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package unicode
    18  
    19  import (
    20  	"unicode/utf8"
    21  
    22  	"vitess.io/vitess/go/mysql/collations/internal/charset/types"
    23  )
    24  
// surrogateMin and surrogateMax delimit the surrogate range U+D800-U+DFFF.
// Code points in the surrogate range are not valid for UTF-8;
// Charset_utf8mb3.EncodeRune returns -1 for any rune inside these bounds.
const (
	surrogateMin = 0xD800
	surrogateMax = 0xDFFF
)
    30  
const (
	// Tag bits that mark the role of a byte inside a multi-byte sequence.
	// The commented-out entries are kept for reference only; nothing in this
	// file uses them.
	// t1 = 0b00000000
	tx = 0b10000000 // continuation byte: 10xxxxxx
	t2 = 0b11000000 // lead byte of a two-byte sequence: 110xxxxx
	t3 = 0b11100000 // lead byte of a three-byte sequence: 1110xxxx
	// t4 = 0b11110000
	// t5 = 0b11111000

	// Masks that isolate the payload bits of a continuation byte (maskx), of
	// a two-byte lead byte (mask2) and of a three-byte lead byte (mask3).
	maskx = 0b00111111
	mask2 = 0b00011111
	mask3 = 0b00001111

	// Largest code point that can be encoded in one, two and three bytes.
	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
	rune3Max = 1<<16 - 1

	// The default lowest and highest continuation byte.
	locb = 0b10000000
	hicb = 0b10111111

	// The names of these constants are chosen to give nice alignment in the
	// table below. The first nibble is an index into acceptRanges, or F for
	// the special one-byte cases. The second nibble is the rune length in
	// bytes, or the status for the special one-byte cases (bit 0 set means
	// invalid, clear means ASCII).
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
)
    62  
// first is information about the first byte in a UTF-8 sequence. It is
// indexed by the byte value and holds one of the xx/as/s1-s4 codes: the high
// nibble selects an entry of acceptRanges and the low nibble gives the length
// of the sequence in bytes.
//
// Every lead byte in 0xF0-0xFF is marked invalid (xx), so four-byte sequences
// are never accepted by code that consults this table. 0xC0 and 0xC1 are
// invalid as well, since they could only begin overlong two-byte encodings.
var first = [256]uint8{
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
    84  
// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence. Both bounds are inclusive: DecodeRune rejects a second byte b
// when b < lo or hi < b.
type acceptRange struct {
	lo uint8 // lowest value for second byte.
	hi uint8 // highest value for second byte.
}
    91  
// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
// It is indexed by the high nibble of an entry of the first table.
//
// Only entries 0-2 are reachable through the s1-s4 codes defined in this file:
// entry 1 (0xA0 and up) rules out overlong three-byte forms after 0xE0, and
// entry 2 (up to 0x9F) rules out surrogates after 0xED. Entries 3 and 4 are
// not selected by any value stored in first.
var acceptRanges = [16]acceptRange{
	0: {locb, hicb},
	1: {0xA0, hicb},
	2: {locb, 0x9F},
	3: {0x90, hicb},
	4: {locb, 0x8F},
}
   100  
// Charset_utf8mb3 is the "utf8mb3" character set: UTF-8 limited to sequences
// of at most three bytes. Its EncodeRune rejects runes above U+FFFF and its
// DecodeRune rejects every four-byte lead byte. The type carries no state.
type Charset_utf8mb3 struct{}
   102  
   103  func (u Charset_utf8mb3) Name() string {
   104  	return "utf8mb3"
   105  }
   106  
   107  func (u Charset_utf8mb3) IsSuperset(other types.Charset) bool {
   108  	switch other.(type) {
   109  	case Charset_utf8mb3:
   110  		return true
   111  	default:
   112  		return false
   113  	}
   114  }
   115  
   116  func (Charset_utf8mb3) EncodeRune(p []byte, r rune) int {
   117  	// Negative values are erroneous. Making it unsigned addresses the problem.
   118  	switch i := uint32(r); {
   119  	case i <= rune1Max:
   120  		p[0] = byte(r)
   121  		return 1
   122  	case i <= rune2Max:
   123  		_ = p[1] // eliminate bounds checks
   124  		p[0] = t2 | byte(r>>6)
   125  		p[1] = tx | byte(r)&maskx
   126  		return 2
   127  	case i > utf8.MaxRune, surrogateMin <= i && i <= surrogateMax:
   128  		return -1
   129  	case i <= rune3Max:
   130  		_ = p[2] // eliminate bounds checks
   131  		p[0] = t3 | byte(r>>12)
   132  		p[1] = tx | byte(r>>6)&maskx
   133  		p[2] = tx | byte(r)&maskx
   134  		return 3
   135  	default:
   136  		return -1
   137  	}
   138  }
   139  
   140  func (Charset_utf8mb3) DecodeRune(p []byte) (rune, int) {
   141  	n := len(p)
   142  	if n < 1 {
   143  		return utf8.RuneError, 0
   144  	}
   145  	p0 := p[0]
   146  	x := first[p0]
   147  	if x >= as {
   148  		// The following code simulates an additional check for x == xx and
   149  		// handling the ASCII and invalid cases accordingly. This mask-and-or
   150  		// approach prevents an additional branch.
   151  		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
   152  		return rune(p[0])&^mask | utf8.RuneError&mask, 1
   153  	}
   154  	sz := int(x & 7)
   155  	accept := acceptRanges[x>>4]
   156  	if n < sz {
   157  		return utf8.RuneError, 1
   158  	}
   159  	b1 := p[1]
   160  	if b1 < accept.lo || accept.hi < b1 {
   161  		return utf8.RuneError, 1
   162  	}
   163  	if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
   164  		return rune(p0&mask2)<<6 | rune(b1&maskx), 2
   165  	}
   166  	b2 := p[2]
   167  	if b2 < locb || hicb < b2 {
   168  		return utf8.RuneError, 1
   169  	}
   170  	if sz <= 3 {
   171  		return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
   172  	}
   173  	return utf8.RuneError, 1
   174  }
   175  
// SupportsSupplementaryChars always returns false: this charset has no
// encoding for code points above U+FFFF (EncodeRune returns -1 for them).
func (Charset_utf8mb3) SupportsSupplementaryChars() bool {
	return false
}
   179  
// Length returns the number of runes in src as counted by utf8.RuneCount,
// which treats every erroneous or short encoding as a single rune of width
// one byte.
//
// NOTE(review): utf8.RuneCount counts a well-formed four-byte sequence as one
// rune, whereas DecodeRune on this charset reports each byte of such a
// sequence as a separate one-byte utf8.RuneError. For input that contains
// four-byte sequences the two therefore disagree; confirm whether callers
// rely on Length matching the number of DecodeRune steps.
func (Charset_utf8mb3) Length(src []byte) int {
	return utf8.RuneCount(src)
}
   183  
// Charset_utf8mb4 is the "utf8mb4" character set. Its encoding, decoding,
// validation and length methods delegate to the standard unicode/utf8
// package. The type carries no state.
type Charset_utf8mb4 struct{}
   185  
// Name returns the identifier of this character set, "utf8mb4".
func (Charset_utf8mb4) Name() string {
	return "utf8mb4"
}
   189  
   190  func (Charset_utf8mb4) IsSuperset(other types.Charset) bool {
   191  	switch other.(type) {
   192  	case Charset_utf8mb4, Charset_utf8mb3:
   193  		return true
   194  	default:
   195  		return false
   196  	}
   197  }
   198  
// EncodeRune writes the UTF-8 encoding of r into p and returns the number of
// bytes written. It delegates to utf8.EncodeRune, which panics if p is too
// small.
//
// NOTE(review): per the unicode/utf8 documentation, an out-of-range rune is
// written as the encoding of utf8.RuneError instead of being rejected. Unlike
// Charset_utf8mb3.EncodeRune, this method therefore never returns -1; confirm
// that callers do not depend on -1 to detect unencodable runes.
func (Charset_utf8mb4) EncodeRune(p []byte, r rune) int {
	return utf8.EncodeRune(p, r)
}
   202  
// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
// its width in bytes. It delegates to utf8.DecodeRune, so an empty p yields
// (utf8.RuneError, 0) and an invalid encoding yields (utf8.RuneError, 1).
func (Charset_utf8mb4) DecodeRune(p []byte) (rune, int) {
	return utf8.DecodeRune(p)
}
   206  
// SupportsSupplementaryChars always returns true for utf8mb4, in contrast to
// Charset_utf8mb3, which returns false.
func (Charset_utf8mb4) SupportsSupplementaryChars() bool {
	return true
}
   210  
// Validate reports whether p consists entirely of valid UTF-8-encoded runes,
// as determined by utf8.Valid.
func (Charset_utf8mb4) Validate(p []byte) bool {
	return utf8.Valid(p)
}
   214  
// Length returns the number of runes in src as counted by utf8.RuneCount,
// which treats every erroneous or short encoding as a single rune of width
// one byte.
func (Charset_utf8mb4) Length(src []byte) int {
	return utf8.RuneCount(src)
}