github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/soliton/collate/unicode_ci.go (about) 1 // Copyright 2020 WHTCORPS INC, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package defCauslate 15 16 import ( 17 "github.com/whtcorpsinc/milevadb/soliton/stringutil" 18 ) 19 20 const ( 21 // magic number indicate weight has 2 uint64, should get from `longRuneMap` 22 longRune uint64 = 0xFFFD 23 // first byte of a 2-byte encoding starts 110 and carries 5 bits of data 24 b2Mask = 0x1F // 0001 1111 25 26 // first byte of a 3-byte encoding starts 1110 and carries 4 bits of data 27 b3Mask = 0x0F // 0000 1111 28 29 // first byte of a 4-byte encoding starts 11110 and carries 3 bits of data 30 b4Mask = 0x07 // 0000 0111 31 32 // non-first bytes start 10 and carry 6 bits of data 33 mbMask = 0x3F // 0011 1111 34 ) 35 36 // decode rune by hand 37 func decodeRune(s string, si int) (r rune, newIndex int) { 38 switch b := s[si]; { 39 case b < 0x80: 40 r = rune(b) 41 newIndex = si + 1 42 case b < 0xE0: 43 r = rune(b&b2Mask)<<6 | 44 rune(s[1+si]&mbMask) 45 newIndex = si + 2 46 case b < 0xF0: 47 r = rune(b&b3Mask)<<12 | 48 rune(s[si+1]&mbMask)<<6 | 49 rune(s[si+2]&mbMask) 50 newIndex = si + 3 51 default: 52 r = rune(b&b4Mask)<<18 | 53 rune(s[si+1]&mbMask)<<12 | 54 rune(s[si+2]&mbMask)<<6 | 55 rune(s[si+3]&mbMask) 56 newIndex = si + 4 57 } 58 return 59 } 60 61 // unicodeCIDefCauslator implements UCA. see http://unicode.org/reports/tr10/ 62 type unicodeCIDefCauslator struct { 63 } 64 65 // Compare implements DefCauslator interface. 66 func (uc *unicodeCIDefCauslator) Compare(a, b string) int { 67 a = truncateTailingSpace(a) 68 b = truncateTailingSpace(b) 69 // weight of a, b. weight in unicode_ci may has 8 uint16s. xn indicate first 4 u16s, xs indicate last 4 u16s 70 an, bn := uint64(0), uint64(0) 71 as, bs := uint64(0), uint64(0) 72 // rune of a, b 73 ar, br := rune(0), rune(0) 74 // decode index of a, b 75 ai, bi := 0, 0 76 for { 77 if an == 0 { 78 if as == 0 { 79 for an == 0 && ai < len(a) { 80 ar, ai = decodeRune(a, ai) 81 an, as = convertUnicode(ar) 82 } 83 } else { 84 an = as 85 as = 0 86 } 87 } 88 89 if bn == 0 { 90 if bs == 0 { 91 for bn == 0 && bi < len(b) { 92 br, bi = decodeRune(b, bi) 93 bn, bs = convertUnicode(br) 94 } 95 } else { 96 bn = bs 97 bs = 0 98 } 99 } 100 101 if an == 0 || bn == 0 { 102 return sign(int(an) - int(bn)) 103 } 104 105 if an == bn { 106 an, bn = 0, 0 107 continue 108 } 109 110 for an != 0 && bn != 0 { 111 if (an^bn)&0xFFFF == 0 { 112 an >>= 16 113 bn >>= 16 114 } else { 115 return sign(int(an&0xFFFF) - int(bn&0xFFFF)) 116 } 117 } 118 } 119 } 120 121 // Key implements DefCauslator interface. 122 func (uc *unicodeCIDefCauslator) Key(str string) []byte { 123 str = truncateTailingSpace(str) 124 buf := make([]byte, 0, len(str)*2) 125 r := rune(0) 126 si := 0 // decode index of s 127 sn, ss := uint64(0), uint64(0) // weight of str. weight in unicode_ci may has 8 uint16s. sn indicate first 4 u16s, ss indicate last 4 u16s 128 129 for si < len(str) { 130 r, si = decodeRune(str, si) 131 sn, ss = convertUnicode(r) 132 for sn != 0 { 133 buf = append(buf, byte((sn&0xFF00)>>8), byte(sn)) 134 sn >>= 16 135 } 136 for ss != 0 { 137 buf = append(buf, byte((ss&0xFF00)>>8), byte(ss)) 138 ss >>= 16 139 } 140 } 141 return buf 142 } 143 144 // convert rune to weights. 145 // `first` represent first 4 uint16 weights of rune 146 // `second` represent last 4 uint16 weights of rune if exist, 0 if not 147 func convertUnicode(r rune) (first, second uint64) { 148 if r > 0xFFFF { 149 return 0xFFFD, 0 150 } 151 if mapBlock[r] == longRune { 152 return longRuneMap[r][0], longRuneMap[r][1] 153 } 154 return mapBlock[r], 0 155 } 156 157 // Pattern implements DefCauslator interface. 158 func (uc *unicodeCIDefCauslator) Pattern() WildcardPattern { 159 return &unicodePattern{} 160 } 161 162 type unicodePattern struct { 163 patChars []rune 164 patTypes []byte 165 } 166 167 // Compile implements WildcardPattern interface. 168 func (p *unicodePattern) Compile(patternStr string, escape byte) { 169 p.patChars, p.patTypes = compilePatternUnicodeCI(patternStr, escape) 170 } 171 172 // DoMatch implements WildcardPattern interface. 173 func (p *unicodePattern) DoMatch(str string) bool { 174 return doMatchUnicodeCI(str, p.patChars, p.patTypes) 175 } 176 177 // compilePatternUnicodeCI handles escapes and wild cards, generate pattern weights and types. 178 // This function is modified from stringutil.CompilePattern. 179 func compilePatternUnicodeCI(pattern string, escape byte) (patWeights []rune, patTypes []byte) { 180 runes := []rune(pattern) 181 escapeRune := rune(escape) 182 lenRunes := len(runes) 183 patWeights = make([]rune, lenRunes) 184 patTypes = make([]byte, lenRunes) 185 patLen := 0 186 for i := 0; i < lenRunes; i++ { 187 var tp byte 188 var r = runes[i] 189 switch r { 190 case escapeRune: 191 tp = stringutil.PatMatch 192 if i < lenRunes-1 { 193 i++ 194 r = runes[i] 195 if r == escapeRune || r == '_' || r == '%' { 196 // Valid escape. 197 } else { 198 // Invalid escape, fall back to escape byte. 199 // allegrosql will treat escape character as the origin value even 200 // the escape sequence is invalid in Go or C. 201 // e.g., \m is invalid in Go, but in MyALLEGROSQL we will get "m" for select '\m'. 202 // Following case is correct just for escape \, not for others like +. 203 // TODO: Add more checks for other escapes. 204 i-- 205 r = escapeRune 206 } 207 } 208 case '_': 209 // %_ => _% 210 if patLen > 0 && patTypes[patLen-1] == stringutil.PatAny { 211 tp = stringutil.PatAny 212 r = '%' 213 patWeights[patLen-1], patTypes[patLen-1] = '_', stringutil.PatOne 214 } else { 215 tp = stringutil.PatOne 216 } 217 case '%': 218 // %% => % 219 if patLen > 0 && patTypes[patLen-1] == stringutil.PatAny { 220 continue 221 } 222 tp = stringutil.PatAny 223 default: 224 tp = stringutil.PatMatch 225 } 226 patWeights[patLen] = r 227 patTypes[patLen] = tp 228 patLen++ 229 } 230 patWeights = patWeights[:patLen] 231 patTypes = patTypes[:patLen] 232 return 233 } 234 235 // doMatchUnicodeCI matches the string with patWeights and patTypes. 236 // The algorithm has linear time complexity. 237 // https://research.swtch.com/glob 238 // This function is modified from stringutil.DoMatch. 239 func doMatchUnicodeCI(str string, patWeights []rune, patTypes []byte) bool { 240 runes := []rune(str) 241 lenRunes := len(runes) 242 var rIdx, pIdx, nextRIdx, nextPIdx int 243 for pIdx < len(patWeights) || rIdx < lenRunes { 244 if pIdx < len(patWeights) { 245 switch patTypes[pIdx] { 246 case stringutil.PatMatch: 247 if rIdx < lenRunes && runeEqual(runes[rIdx], patWeights[pIdx]) { 248 pIdx++ 249 rIdx++ 250 continue 251 } 252 case stringutil.PatOne: 253 if rIdx < lenRunes { 254 pIdx++ 255 rIdx++ 256 continue 257 } 258 case stringutil.PatAny: 259 // Try to match at sIdx. 260 // If that doesn't work out, 261 // restart at sIdx+1 next. 262 nextPIdx = pIdx 263 nextRIdx = rIdx + 1 264 pIdx++ 265 continue 266 } 267 } 268 // Mismatch. Maybe restart. 269 if 0 < nextRIdx && nextRIdx <= lenRunes { 270 pIdx = nextPIdx 271 rIdx = nextRIdx 272 continue 273 } 274 return false 275 } 276 // Matched all of pattern to all of name. Success. 277 return true 278 } 279 280 // runeEqual compare rune is equal with unicode_ci defCauslation 281 func runeEqual(a, b rune) bool { 282 if a > 0xFFFF || b > 0xFFFF { 283 return a == b 284 } 285 286 ar, br := mapBlock[a], mapBlock[b] 287 if ar != br { 288 return false 289 } 290 291 if ar == longRune { 292 return a == b 293 } 294 295 return true 296 }