vitess.io/vitess@v0.16.2/go/mysql/collations/internal/charset/unicode/utf8.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package unicode 18 19 import ( 20 "unicode/utf8" 21 22 "vitess.io/vitess/go/mysql/collations/internal/charset/types" 23 ) 24 25 // Code points in the surrogate range are not valid for UTF-8. 26 const ( 27 surrogateMin = 0xD800 28 surrogateMax = 0xDFFF 29 ) 30 31 const ( 32 // t1 = 0b00000000 33 tx = 0b10000000 34 t2 = 0b11000000 35 t3 = 0b11100000 36 // t4 = 0b11110000 37 // t5 = 0b11111000 38 39 maskx = 0b00111111 40 mask2 = 0b00011111 41 mask3 = 0b00001111 42 43 rune1Max = 1<<7 - 1 44 rune2Max = 1<<11 - 1 45 rune3Max = 1<<16 - 1 46 47 // The default lowest and highest continuation byte. 48 locb = 0b10000000 49 hicb = 0b10111111 50 51 // These names of these constants are chosen to give nice alignment in the 52 // table below. The first nibble is an index into acceptRanges or F for 53 // special one-byte cases. The second nibble is the Rune length or the 54 // Status for the special one-byte case. 55 xx = 0xF1 // invalid: size 1 56 as = 0xF0 // ASCII: size 1 57 s1 = 0x02 // accept 0, size 2 58 s2 = 0x13 // accept 1, size 3 59 s3 = 0x03 // accept 0, size 3 60 s4 = 0x23 // accept 2, size 3 61 ) 62 63 // first is information about the first byte in a UTF-8 sequence. 64 var first = [256]uint8{ 65 // 1 2 3 4 5 6 7 8 9 A B C D E F 66 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F 67 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F 68 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F 69 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F 70 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F 71 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F 72 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F 73 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F 74 // 1 2 3 4 5 6 7 8 9 A B C D E F 75 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F 76 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F 77 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF 78 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF 79 xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF 80 s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF 81 s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF 82 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF 83 } 84 85 // acceptRange gives the range of valid values for the second byte in a UTF-8 86 // sequence. 87 type acceptRange struct { 88 lo uint8 // lowest value for second byte. 89 hi uint8 // highest value for second byte. 90 } 91 92 // acceptRanges has size 16 to avoid bounds checks in the code that uses it. 93 var acceptRanges = [16]acceptRange{ 94 0: {locb, hicb}, 95 1: {0xA0, hicb}, 96 2: {locb, 0x9F}, 97 3: {0x90, hicb}, 98 4: {locb, 0x8F}, 99 } 100 101 type Charset_utf8mb3 struct{} 102 103 func (u Charset_utf8mb3) Name() string { 104 return "utf8mb3" 105 } 106 107 func (u Charset_utf8mb3) IsSuperset(other types.Charset) bool { 108 switch other.(type) { 109 case Charset_utf8mb3: 110 return true 111 default: 112 return false 113 } 114 } 115 116 func (Charset_utf8mb3) EncodeRune(p []byte, r rune) int { 117 // Negative values are erroneous. Making it unsigned addresses the problem. 118 switch i := uint32(r); { 119 case i <= rune1Max: 120 p[0] = byte(r) 121 return 1 122 case i <= rune2Max: 123 _ = p[1] // eliminate bounds checks 124 p[0] = t2 | byte(r>>6) 125 p[1] = tx | byte(r)&maskx 126 return 2 127 case i > utf8.MaxRune, surrogateMin <= i && i <= surrogateMax: 128 return -1 129 case i <= rune3Max: 130 _ = p[2] // eliminate bounds checks 131 p[0] = t3 | byte(r>>12) 132 p[1] = tx | byte(r>>6)&maskx 133 p[2] = tx | byte(r)&maskx 134 return 3 135 default: 136 return -1 137 } 138 } 139 140 func (Charset_utf8mb3) DecodeRune(p []byte) (rune, int) { 141 n := len(p) 142 if n < 1 { 143 return utf8.RuneError, 0 144 } 145 p0 := p[0] 146 x := first[p0] 147 if x >= as { 148 // The following code simulates an additional check for x == xx and 149 // handling the ASCII and invalid cases accordingly. This mask-and-or 150 // approach prevents an additional branch. 151 mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF. 152 return rune(p[0])&^mask | utf8.RuneError&mask, 1 153 } 154 sz := int(x & 7) 155 accept := acceptRanges[x>>4] 156 if n < sz { 157 return utf8.RuneError, 1 158 } 159 b1 := p[1] 160 if b1 < accept.lo || accept.hi < b1 { 161 return utf8.RuneError, 1 162 } 163 if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks 164 return rune(p0&mask2)<<6 | rune(b1&maskx), 2 165 } 166 b2 := p[2] 167 if b2 < locb || hicb < b2 { 168 return utf8.RuneError, 1 169 } 170 if sz <= 3 { 171 return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3 172 } 173 return utf8.RuneError, 1 174 } 175 176 func (Charset_utf8mb3) SupportsSupplementaryChars() bool { 177 return false 178 } 179 180 func (Charset_utf8mb3) Length(src []byte) int { 181 return utf8.RuneCount(src) 182 } 183 184 type Charset_utf8mb4 struct{} 185 186 func (Charset_utf8mb4) Name() string { 187 return "utf8mb4" 188 } 189 190 func (Charset_utf8mb4) IsSuperset(other types.Charset) bool { 191 switch other.(type) { 192 case Charset_utf8mb4, Charset_utf8mb3: 193 return true 194 default: 195 return false 196 } 197 } 198 199 func (Charset_utf8mb4) EncodeRune(p []byte, r rune) int { 200 return utf8.EncodeRune(p, r) 201 } 202 203 func (Charset_utf8mb4) DecodeRune(p []byte) (rune, int) { 204 return utf8.DecodeRune(p) 205 } 206 207 func (Charset_utf8mb4) SupportsSupplementaryChars() bool { 208 return true 209 } 210 211 func (Charset_utf8mb4) Validate(p []byte) bool { 212 return utf8.Valid(p) 213 } 214 215 func (Charset_utf8mb4) Length(src []byte) int { 216 return utf8.RuneCount(src) 217 }