// Source: vitess.io/vitess@v0.16.2/go/mysql/collations/wildcard.go

/*
Copyright 2021 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// The wildcard matching code in Vitess uses two different implementations for wildcard algorithms,
// as seen on https://en.wikipedia.org/wiki/Matching_wildcards
//
// The main implementation is based on the logic in INN (https://inn.eyrie.org/trac/browser/trunk/lib/uwildmat.c),
// and is originally MIT licensed. This is a recursive matching algorithm with important optimizations, as explained
// on the Wikipedia page: it is a traditional recursion algorithm with 3 return values for match, no match, and
// impossible match, which greatly limits the depth of the recursion tree. It also only tries to target the ending
// codepoint at the end of a 'star' match, which again cuts the recursion depth.
//
// In practice, this results in a very efficient algorithm which performs well in real-world cases; however,
// as just explained, it DOES recurse, which may be an issue when the input pattern is complex enough to cause
// deep recursion.
//
// To prevent Vitess instances from crashing because of stack overflows, we've added a stack guard to the algorithm,
// controlled by the wildcardRecursionDepth constant. If the recursion limit is reached, the match will fail --
// potentially leading to wrong results for the algorithm.
33 // 34 // If accuracy is of upmost importance, the wildcardRecursionDepth constant can be set to 0, in which case Vitess 35 // will use an alternative iterative algorithm, based on a public domain algorithm by Alessandro Cantatore 36 // (seen in http://xoomer.virgilio.it/acantato/dev/wildcard/wildmatch.html). This algorithm is much simpler and does 37 // not recurse, however it is significantly slower than our recursive implementation (~25% slower in our benchmarks). 38 // 39 // Because of this, we intend to enable the recursive algorithm by default. 40 41 package collations 42 43 import ( 44 "unicode/utf8" 45 46 "vitess.io/vitess/go/mysql/collations/internal/charset" 47 ) 48 49 type match byte 50 51 const ( 52 matchOK match = iota 53 matchFail 54 matchOver 55 ) 56 57 // wildcardRecursionDepth is the maximum amount of recursive calls that can be performed when 58 // matching a wildcard. If set to 0, the default wildcard matcher will use an alternative algorithm 59 // that does not use recursion. 60 const wildcardRecursionDepth = 32 61 62 // patternMatchOne is a special value for compiled patterns which matches a single char (it usually replaces '_' or '?') 63 const patternMatchOne = -128 64 65 // patternMatchMany is a special value for compiled pattern that matches any amount of chars (it usually replaces '%' or '*') 66 const patternMatchMany = -256 67 68 // nopMatcher is an implementation of WildcardPattern that never matches anything. 69 // It is returned when we detect that a provided wildcard pattern cannot match anything 70 type nopMatcher struct{} 71 72 func (nopMatcher) Match(_ []byte) bool { 73 return false 74 } 75 76 // emptyMatcher is an implementation of WildcardPattern that only matches the empty string 77 type emptyMatcher struct{} 78 79 func (emptyMatcher) Match(in []byte) bool { 80 return len(in) == 0 81 } 82 83 // fastMatcher is an implementation of WildcardPattern that uses a collation's Collate method 84 // to perform wildcard matching. 
85 // It is returned: 86 // - when the wildcard pattern has no wildcard characters at all 87 // - when the wildcard pattern has a single '%' (patternMatchMany) and it is the very last 88 // character of the pattern (in this case, we set isPrefix to true to use prefix-match collation) 89 type fastMatcher struct { 90 collate func(left, right []byte, isPrefix bool) int 91 pattern []byte 92 isPrefix bool 93 } 94 95 func (cm *fastMatcher) Match(in []byte) bool { 96 return cm.collate(in, cm.pattern, cm.isPrefix) == 0 97 } 98 99 // unicodeWildcard is an implementation of WildcardPattern for multibyte charsets; 100 // it is used for all UCA collations, multibyte collations and all Unicode-based collations 101 type unicodeWildcard struct { 102 equals func(a, b rune) bool 103 charset charset.Charset 104 pattern []rune 105 } 106 107 func newUnicodeWildcardMatcher( 108 cs charset.Charset, 109 equals func(a rune, b rune) bool, 110 collate func(left []byte, right []byte, isPrefix bool) int, 111 pat []byte, chOne, chMany, chEsc rune, 112 ) WildcardPattern { 113 var escape bool 114 var chOneCount, chManyCount, chEscCount int 115 var parsedPattern = make([]rune, 0, len(pat)) 116 var patOriginal = pat 117 118 if chOne == 0 { 119 chOne = '_' 120 } 121 if chMany == 0 { 122 chMany = '%' 123 } 124 if chEsc == 0 { 125 chEsc = '\\' 126 } 127 128 for len(pat) > 0 { 129 cp, width := cs.DecodeRune(pat) 130 if cp == charset.RuneError && width < 3 { 131 return nopMatcher{} 132 } 133 pat = pat[width:] 134 135 if escape { 136 parsedPattern = append(parsedPattern, cp) 137 escape = false 138 continue 139 } 140 141 switch cp { 142 case chOne: 143 chOneCount++ 144 parsedPattern = append(parsedPattern, patternMatchOne) 145 case chMany: 146 if len(parsedPattern) > 0 && parsedPattern[len(parsedPattern)-1] == patternMatchMany { 147 continue 148 } 149 chManyCount++ 150 parsedPattern = append(parsedPattern, patternMatchMany) 151 case chEsc: 152 chEscCount++ 153 escape = true 154 default: 155 parsedPattern 
= append(parsedPattern, cp) 156 } 157 } 158 if escape { 159 parsedPattern = append(parsedPattern, chEsc) 160 } 161 162 // if we have a collation callback, we can detect some common cases for patterns 163 // here and optimize them away without having to return a full WildcardPattern 164 if collate != nil { 165 if len(parsedPattern) == 0 { 166 return emptyMatcher{} 167 } 168 if chOneCount == 0 && chEscCount == 0 { 169 if chManyCount == 0 { 170 return &fastMatcher{ 171 collate: collate, 172 pattern: patOriginal, 173 isPrefix: false, 174 } 175 } 176 if chManyCount == 1 && chMany < utf8.RuneSelf && parsedPattern[len(parsedPattern)-1] == chMany { 177 return &fastMatcher{ 178 collate: collate, 179 pattern: patOriginal[:len(patOriginal)-1], 180 isPrefix: true, 181 } 182 } 183 } 184 } 185 186 return &unicodeWildcard{ 187 equals: equals, 188 charset: cs, 189 pattern: parsedPattern, 190 } 191 } 192 193 func (wc *unicodeWildcard) matchIter(str []byte, pat []rune) bool { 194 var s []byte 195 var p []rune 196 var star = false 197 var cs = wc.charset 198 199 retry: 200 s = str 201 p = pat 202 for len(s) > 0 { 203 var p0 rune 204 if len(p) > 0 { 205 p0 = p[0] 206 } 207 208 switch p0 { 209 case patternMatchOne: 210 c0, width := cs.DecodeRune(s) 211 if c0 == charset.RuneError && width < 3 { 212 return false 213 } 214 s = s[width:] 215 case patternMatchMany: 216 star = true 217 str = s 218 pat = p[1:] 219 if len(pat) == 0 { 220 return true 221 } 222 goto retry 223 default: 224 c0, width := cs.DecodeRune(s) 225 if c0 == charset.RuneError && width < 3 { 226 return false 227 } 228 if !wc.equals(c0, p0) { 229 goto starCheck 230 } 231 s = s[width:] 232 } 233 p = p[1:] 234 } 235 return len(p) == 0 || (len(p) == 1 && p[0] == patternMatchMany) 236 237 starCheck: 238 if !star { 239 return false 240 } 241 if len(str) > 0 { 242 c0, width := cs.DecodeRune(str) 243 if c0 == charset.RuneError && width < 3 { 244 return false 245 } 246 str = str[width:] 247 } 248 goto retry 249 } 250 251 func (wc 
*unicodeWildcard) Match(in []byte) bool { 252 if wildcardRecursionDepth == 0 { 253 return wc.matchIter(in, wc.pattern) 254 } 255 return wc.matchRecursive(in, wc.pattern, 0) == matchOK 256 } 257 258 func (wc *unicodeWildcard) matchMany(in []byte, pat []rune, depth int) match { 259 var cs = wc.charset 260 var p0 rune 261 262 many: 263 if len(pat) == 0 { 264 return matchOK 265 } 266 p0 = pat[0] 267 pat = pat[1:] 268 269 switch p0 { 270 case patternMatchMany: 271 goto many 272 case patternMatchOne: 273 cpIn, width := cs.DecodeRune(in) 274 if cpIn == charset.RuneError && width < 3 { 275 return matchFail 276 } 277 in = in[width:] 278 goto many 279 } 280 281 if len(in) == 0 { 282 return matchOver 283 } 284 285 retry: 286 var width int 287 for len(in) > 0 { 288 var cpIn rune 289 cpIn, width = cs.DecodeRune(in) 290 if cpIn == charset.RuneError && width < 3 { 291 return matchFail 292 } 293 if wc.equals(cpIn, p0) { 294 break 295 } 296 in = in[width:] 297 } 298 299 if len(in) == 0 { 300 return matchOver 301 } 302 in = in[width:] 303 304 m := wc.matchRecursive(in, pat, depth+1) 305 if m == matchFail { 306 goto retry 307 } 308 return m 309 } 310 311 func (wc *unicodeWildcard) matchRecursive(in []byte, pat []rune, depth int) match { 312 if depth >= wildcardRecursionDepth { 313 return matchFail 314 } 315 316 var cs = wc.charset 317 for len(pat) > 0 { 318 if pat[0] == patternMatchMany { 319 return wc.matchMany(in, pat[1:], depth) 320 } 321 322 cpIn, width := cs.DecodeRune(in) 323 if cpIn == charset.RuneError && width < 3 { 324 return matchFail 325 } 326 327 switch { 328 case pat[0] == patternMatchOne: 329 case wc.equals(pat[0], cpIn): 330 default: 331 return matchFail 332 } 333 334 in = in[width:] 335 pat = pat[1:] 336 } 337 338 if len(in) == 0 { 339 return matchOK 340 } 341 return matchFail 342 } 343 344 // eightbitWildcard is an implementation of WildcardPattern used for 8-bit charsets. 345 // It is used for all 8-bit encodings. 
346 type eightbitWildcard struct { 347 sort *[256]byte 348 pattern []int16 349 } 350 351 func newEightbitWildcardMatcher( 352 sort *[256]byte, 353 collate func(left []byte, right []byte, isPrefix bool) int, 354 pat []byte, chOneRune, chManyRune, chEscRune rune, 355 ) WildcardPattern { 356 var escape bool 357 var parsedPattern = make([]int16, 0, len(pat)) 358 var chOne, chMany, chEsc byte = '_', '%', '\\' 359 var chOneCount, chManyCount, chEscCount int 360 361 if chOneRune > 255 || chManyRune > 255 || chEscRune > 255 { 362 return nopMatcher{} 363 } 364 if chOneRune != 0 { 365 chOne = byte(chOneRune) 366 } 367 if chManyRune != 0 { 368 chMany = byte(chManyRune) 369 } 370 if chEscRune != 0 { 371 chEsc = byte(chEscRune) 372 } 373 374 for _, ch := range pat { 375 if escape { 376 parsedPattern = append(parsedPattern, int16(ch)) 377 escape = false 378 continue 379 } 380 381 switch ch { 382 case chOne: 383 chOneCount++ 384 parsedPattern = append(parsedPattern, patternMatchOne) 385 case chMany: 386 if len(parsedPattern) > 0 && parsedPattern[len(parsedPattern)-1] == patternMatchMany { 387 continue 388 } 389 chManyCount++ 390 parsedPattern = append(parsedPattern, patternMatchMany) 391 case chEsc: 392 chEscCount++ 393 escape = true 394 default: 395 parsedPattern = append(parsedPattern, int16(ch)) 396 } 397 } 398 if escape { 399 parsedPattern = append(parsedPattern, int16(chEsc)) 400 } 401 402 // if we have a collation callback, we can detect some common cases for patterns 403 // here and optimize them away without having to return a full WildcardPattern 404 if collate != nil { 405 if len(parsedPattern) == 0 { 406 return emptyMatcher{} 407 } 408 if chOneCount == 0 && chEscCount == 0 { 409 if chManyCount == 0 { 410 return &fastMatcher{ 411 collate: collate, 412 pattern: pat, 413 isPrefix: false, 414 } 415 } 416 if chManyCount == 1 && pat[len(pat)-1] == chMany { 417 return &fastMatcher{ 418 collate: collate, 419 pattern: pat[:len(pat)-1], 420 isPrefix: true, 421 } 422 } 423 } 424 
} 425 426 return &eightbitWildcard{ 427 sort: sort, 428 pattern: parsedPattern, 429 } 430 } 431 432 func (wc *eightbitWildcard) Match(in []byte) bool { 433 if wildcardRecursionDepth == 0 { 434 return wc.matchIter(in, wc.pattern) 435 } 436 return wc.matchRecursive(in, wc.pattern, 0) == matchOK 437 } 438 439 func (wc *eightbitWildcard) matchMany(in []byte, pat []int16, depth int) match { 440 var p0 int16 441 442 many: 443 if len(pat) == 0 { 444 return matchOK 445 } 446 447 p0 = pat[0] 448 pat = pat[1:] 449 450 switch p0 { 451 case patternMatchMany: 452 goto many 453 case patternMatchOne: 454 if len(in) == 0 { 455 return matchFail 456 } 457 in = in[1:] 458 goto many 459 } 460 461 if len(in) == 0 { 462 return matchOver 463 } 464 465 retry: 466 for len(in) > 0 { 467 if wc.sort[in[0]] == wc.sort[byte(p0)] { 468 break 469 } 470 in = in[1:] 471 } 472 if len(in) == 0 { 473 return matchOver 474 } 475 in = in[1:] 476 477 m := wc.matchRecursive(in, pat, depth+1) 478 if m == matchFail { 479 goto retry 480 } 481 return m 482 } 483 484 func (wc *eightbitWildcard) matchRecursive(in []byte, pat []int16, depth int) match { 485 if depth >= wildcardRecursionDepth { 486 return matchFail 487 } 488 for len(pat) > 0 { 489 if pat[0] == patternMatchMany { 490 return wc.matchMany(in, pat[1:], depth) 491 } 492 493 if len(in) == 0 { 494 return matchFail 495 } 496 497 switch { 498 case pat[0] == patternMatchOne: 499 case wc.sort[byte(pat[0])] == wc.sort[in[0]]: 500 default: 501 return matchFail 502 } 503 504 in = in[1:] 505 pat = pat[1:] 506 } 507 508 if len(in) == 0 { 509 return matchOK 510 } 511 return matchFail 512 } 513 514 func (wc *eightbitWildcard) matchIter(str []byte, pat []int16) bool { 515 var s []byte 516 var p []int16 517 var star = false 518 519 retry: 520 s = str 521 p = pat 522 for len(s) > 0 { 523 var p0 int16 524 if len(p) > 0 { 525 p0 = p[0] 526 } 527 528 switch p0 { 529 case patternMatchOne: 530 break 531 case patternMatchMany: 532 star = true 533 str = s 534 pat = p[1:] 535 
if len(pat) == 0 { 536 return true 537 } 538 goto retry 539 default: 540 if wc.sort[byte(p0)] != wc.sort[s[0]] { 541 goto starCheck 542 } 543 } 544 s = s[1:] 545 p = p[1:] 546 } 547 return len(p) == 0 || (len(p) == 1 && p[0] == patternMatchMany) 548 549 starCheck: 550 if !star { 551 return false 552 } 553 str = str[1:] 554 goto retry 555 }