vitess.io/vitess@v0.16.2/go/mysql/collations/wildcard.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // The wildcard matching code in Vitess uses two different implementations for wildcard algorithms,
    18  // as seen on https://en.wikipedia.org/wiki/Matching_wildcards
    19  //
    20  // The main implementation is based on the logic in INN (https://inn.eyrie.org/trac/browser/trunk/lib/uwildmat.c),
    21  // and is originally MIT licensed. This is a recursive matching algorithm with important optimizations, as explained
    22  // on the Wikipedia page: it is a traditional recursion algorithm with 3 return values for match, no match, and
    23  // impossible match, which greatly limits the depth of the recursion tree. It also only tries to target the ending
    24  // codepoint at the end of a 'star' match, which again cuts the recursion depth.
    25  //
    26  // In practice, this results in a very efficient algorithm which performs great in real world cases, however,
    27  // as just explained, it DOES recurse, which may be an issue when the input pattern is complex enough to cause
    28  // deep recursion.
    29  //
    30  // To prevent Vitess instances from crashing because of stack overflows, we've added a stack guard to the algorithm,
    31  // controlled by the wildcardRecursionDepth constant. If the recursion limit is reached, the match will fail --
    32  // potentially leading to wrong results for the algorithm.
    33  //
    34  // If accuracy is of utmost importance, the wildcardRecursionDepth constant can be set to 0, in which case Vitess
    35  // will use an alternative iterative algorithm, based on a public domain algorithm by Alessandro Cantatore
    36  // (seen in http://xoomer.virgilio.it/acantato/dev/wildcard/wildmatch.html). This algorithm is much simpler and does
    37  // not recurse, however it is significantly slower than our recursive implementation (~25% slower in our benchmarks).
    38  //
    39  // Because of this, we intend to enable the recursive algorithm by default.
    40  
    41  package collations
    42  
    43  import (
    44  	"unicode/utf8"
    45  
    46  	"vitess.io/vitess/go/mysql/collations/internal/charset"
    47  )
    48  
    49  type match byte
    50  
    51  const (
    52  	matchOK match = iota
    53  	matchFail
    54  	matchOver
    55  )
    56  
    57  // wildcardRecursionDepth is the maximum amount of recursive calls that can be performed when
    58  // matching a wildcard. If set to 0, the default wildcard matcher will use an alternative algorithm
    59  // that does not use recursion.
    60  const wildcardRecursionDepth = 32
    61  
    62  // patternMatchOne is a special value for compiled patterns which matches a single char (it usually replaces '_' or '?')
    63  const patternMatchOne = -128
    64  
    65  // patternMatchMany is a special value for compiled pattern that matches any amount of chars (it usually replaces '%' or '*')
    66  const patternMatchMany = -256
    67  
    68  // nopMatcher is an implementation of WildcardPattern that never matches anything.
    69  // It is returned when we detect that a provided wildcard pattern cannot match anything
    70  type nopMatcher struct{}
    71  
    72  func (nopMatcher) Match(_ []byte) bool {
    73  	return false
    74  }
    75  
    76  // emptyMatcher is an implementation of WildcardPattern that only matches the empty string
    77  type emptyMatcher struct{}
    78  
    79  func (emptyMatcher) Match(in []byte) bool {
    80  	return len(in) == 0
    81  }
    82  
    83  // fastMatcher is an implementation of WildcardPattern that uses a collation's Collate method
    84  // to perform wildcard matching.
    85  // It is returned:
    86  //   - when the wildcard pattern has no wildcard characters at all
    87  //   - when the wildcard pattern has a single '%' (patternMatchMany) and it is the very last
    88  //     character of the pattern (in this case, we set isPrefix to true to use prefix-match collation)
    89  type fastMatcher struct {
    90  	collate  func(left, right []byte, isPrefix bool) int
    91  	pattern  []byte
    92  	isPrefix bool
    93  }
    94  
    95  func (cm *fastMatcher) Match(in []byte) bool {
    96  	return cm.collate(in, cm.pattern, cm.isPrefix) == 0
    97  }
    98  
    99  // unicodeWildcard is an implementation of WildcardPattern for multibyte charsets;
   100  // it is used for all UCA collations, multibyte collations and all Unicode-based collations
   101  type unicodeWildcard struct {
   102  	equals  func(a, b rune) bool
   103  	charset charset.Charset
   104  	pattern []rune
   105  }
   106  
   107  func newUnicodeWildcardMatcher(
   108  	cs charset.Charset,
   109  	equals func(a rune, b rune) bool,
   110  	collate func(left []byte, right []byte, isPrefix bool) int,
   111  	pat []byte, chOne, chMany, chEsc rune,
   112  ) WildcardPattern {
   113  	var escape bool
   114  	var chOneCount, chManyCount, chEscCount int
   115  	var parsedPattern = make([]rune, 0, len(pat))
   116  	var patOriginal = pat
   117  
   118  	if chOne == 0 {
   119  		chOne = '_'
   120  	}
   121  	if chMany == 0 {
   122  		chMany = '%'
   123  	}
   124  	if chEsc == 0 {
   125  		chEsc = '\\'
   126  	}
   127  
   128  	for len(pat) > 0 {
   129  		cp, width := cs.DecodeRune(pat)
   130  		if cp == charset.RuneError && width < 3 {
   131  			return nopMatcher{}
   132  		}
   133  		pat = pat[width:]
   134  
   135  		if escape {
   136  			parsedPattern = append(parsedPattern, cp)
   137  			escape = false
   138  			continue
   139  		}
   140  
   141  		switch cp {
   142  		case chOne:
   143  			chOneCount++
   144  			parsedPattern = append(parsedPattern, patternMatchOne)
   145  		case chMany:
   146  			if len(parsedPattern) > 0 && parsedPattern[len(parsedPattern)-1] == patternMatchMany {
   147  				continue
   148  			}
   149  			chManyCount++
   150  			parsedPattern = append(parsedPattern, patternMatchMany)
   151  		case chEsc:
   152  			chEscCount++
   153  			escape = true
   154  		default:
   155  			parsedPattern = append(parsedPattern, cp)
   156  		}
   157  	}
   158  	if escape {
   159  		parsedPattern = append(parsedPattern, chEsc)
   160  	}
   161  
   162  	// if we have a collation callback, we can detect some common cases for patterns
   163  	// here and optimize them away without having to return a full WildcardPattern
   164  	if collate != nil {
   165  		if len(parsedPattern) == 0 {
   166  			return emptyMatcher{}
   167  		}
   168  		if chOneCount == 0 && chEscCount == 0 {
   169  			if chManyCount == 0 {
   170  				return &fastMatcher{
   171  					collate:  collate,
   172  					pattern:  patOriginal,
   173  					isPrefix: false,
   174  				}
   175  			}
   176  			if chManyCount == 1 && chMany < utf8.RuneSelf && parsedPattern[len(parsedPattern)-1] == chMany {
   177  				return &fastMatcher{
   178  					collate:  collate,
   179  					pattern:  patOriginal[:len(patOriginal)-1],
   180  					isPrefix: true,
   181  				}
   182  			}
   183  		}
   184  	}
   185  
   186  	return &unicodeWildcard{
   187  		equals:  equals,
   188  		charset: cs,
   189  		pattern: parsedPattern,
   190  	}
   191  }
   192  
   193  func (wc *unicodeWildcard) matchIter(str []byte, pat []rune) bool {
   194  	var s []byte
   195  	var p []rune
   196  	var star = false
   197  	var cs = wc.charset
   198  
   199  retry:
   200  	s = str
   201  	p = pat
   202  	for len(s) > 0 {
   203  		var p0 rune
   204  		if len(p) > 0 {
   205  			p0 = p[0]
   206  		}
   207  
   208  		switch p0 {
   209  		case patternMatchOne:
   210  			c0, width := cs.DecodeRune(s)
   211  			if c0 == charset.RuneError && width < 3 {
   212  				return false
   213  			}
   214  			s = s[width:]
   215  		case patternMatchMany:
   216  			star = true
   217  			str = s
   218  			pat = p[1:]
   219  			if len(pat) == 0 {
   220  				return true
   221  			}
   222  			goto retry
   223  		default:
   224  			c0, width := cs.DecodeRune(s)
   225  			if c0 == charset.RuneError && width < 3 {
   226  				return false
   227  			}
   228  			if !wc.equals(c0, p0) {
   229  				goto starCheck
   230  			}
   231  			s = s[width:]
   232  		}
   233  		p = p[1:]
   234  	}
   235  	return len(p) == 0 || (len(p) == 1 && p[0] == patternMatchMany)
   236  
   237  starCheck:
   238  	if !star {
   239  		return false
   240  	}
   241  	if len(str) > 0 {
   242  		c0, width := cs.DecodeRune(str)
   243  		if c0 == charset.RuneError && width < 3 {
   244  			return false
   245  		}
   246  		str = str[width:]
   247  	}
   248  	goto retry
   249  }
   250  
   251  func (wc *unicodeWildcard) Match(in []byte) bool {
   252  	if wildcardRecursionDepth == 0 {
   253  		return wc.matchIter(in, wc.pattern)
   254  	}
   255  	return wc.matchRecursive(in, wc.pattern, 0) == matchOK
   256  }
   257  
   258  func (wc *unicodeWildcard) matchMany(in []byte, pat []rune, depth int) match {
   259  	var cs = wc.charset
   260  	var p0 rune
   261  
   262  many:
   263  	if len(pat) == 0 {
   264  		return matchOK
   265  	}
   266  	p0 = pat[0]
   267  	pat = pat[1:]
   268  
   269  	switch p0 {
   270  	case patternMatchMany:
   271  		goto many
   272  	case patternMatchOne:
   273  		cpIn, width := cs.DecodeRune(in)
   274  		if cpIn == charset.RuneError && width < 3 {
   275  			return matchFail
   276  		}
   277  		in = in[width:]
   278  		goto many
   279  	}
   280  
   281  	if len(in) == 0 {
   282  		return matchOver
   283  	}
   284  
   285  retry:
   286  	var width int
   287  	for len(in) > 0 {
   288  		var cpIn rune
   289  		cpIn, width = cs.DecodeRune(in)
   290  		if cpIn == charset.RuneError && width < 3 {
   291  			return matchFail
   292  		}
   293  		if wc.equals(cpIn, p0) {
   294  			break
   295  		}
   296  		in = in[width:]
   297  	}
   298  
   299  	if len(in) == 0 {
   300  		return matchOver
   301  	}
   302  	in = in[width:]
   303  
   304  	m := wc.matchRecursive(in, pat, depth+1)
   305  	if m == matchFail {
   306  		goto retry
   307  	}
   308  	return m
   309  }
   310  
   311  func (wc *unicodeWildcard) matchRecursive(in []byte, pat []rune, depth int) match {
   312  	if depth >= wildcardRecursionDepth {
   313  		return matchFail
   314  	}
   315  
   316  	var cs = wc.charset
   317  	for len(pat) > 0 {
   318  		if pat[0] == patternMatchMany {
   319  			return wc.matchMany(in, pat[1:], depth)
   320  		}
   321  
   322  		cpIn, width := cs.DecodeRune(in)
   323  		if cpIn == charset.RuneError && width < 3 {
   324  			return matchFail
   325  		}
   326  
   327  		switch {
   328  		case pat[0] == patternMatchOne:
   329  		case wc.equals(pat[0], cpIn):
   330  		default:
   331  			return matchFail
   332  		}
   333  
   334  		in = in[width:]
   335  		pat = pat[1:]
   336  	}
   337  
   338  	if len(in) == 0 {
   339  		return matchOK
   340  	}
   341  	return matchFail
   342  }
   343  
   344  // eightbitWildcard is an implementation of WildcardPattern used for 8-bit charsets.
   345  // It is used for all 8-bit encodings.
   346  type eightbitWildcard struct {
   347  	sort    *[256]byte
   348  	pattern []int16
   349  }
   350  
   351  func newEightbitWildcardMatcher(
   352  	sort *[256]byte,
   353  	collate func(left []byte, right []byte, isPrefix bool) int,
   354  	pat []byte, chOneRune, chManyRune, chEscRune rune,
   355  ) WildcardPattern {
   356  	var escape bool
   357  	var parsedPattern = make([]int16, 0, len(pat))
   358  	var chOne, chMany, chEsc byte = '_', '%', '\\'
   359  	var chOneCount, chManyCount, chEscCount int
   360  
   361  	if chOneRune > 255 || chManyRune > 255 || chEscRune > 255 {
   362  		return nopMatcher{}
   363  	}
   364  	if chOneRune != 0 {
   365  		chOne = byte(chOneRune)
   366  	}
   367  	if chManyRune != 0 {
   368  		chMany = byte(chManyRune)
   369  	}
   370  	if chEscRune != 0 {
   371  		chEsc = byte(chEscRune)
   372  	}
   373  
   374  	for _, ch := range pat {
   375  		if escape {
   376  			parsedPattern = append(parsedPattern, int16(ch))
   377  			escape = false
   378  			continue
   379  		}
   380  
   381  		switch ch {
   382  		case chOne:
   383  			chOneCount++
   384  			parsedPattern = append(parsedPattern, patternMatchOne)
   385  		case chMany:
   386  			if len(parsedPattern) > 0 && parsedPattern[len(parsedPattern)-1] == patternMatchMany {
   387  				continue
   388  			}
   389  			chManyCount++
   390  			parsedPattern = append(parsedPattern, patternMatchMany)
   391  		case chEsc:
   392  			chEscCount++
   393  			escape = true
   394  		default:
   395  			parsedPattern = append(parsedPattern, int16(ch))
   396  		}
   397  	}
   398  	if escape {
   399  		parsedPattern = append(parsedPattern, int16(chEsc))
   400  	}
   401  
   402  	// if we have a collation callback, we can detect some common cases for patterns
   403  	// here and optimize them away without having to return a full WildcardPattern
   404  	if collate != nil {
   405  		if len(parsedPattern) == 0 {
   406  			return emptyMatcher{}
   407  		}
   408  		if chOneCount == 0 && chEscCount == 0 {
   409  			if chManyCount == 0 {
   410  				return &fastMatcher{
   411  					collate:  collate,
   412  					pattern:  pat,
   413  					isPrefix: false,
   414  				}
   415  			}
   416  			if chManyCount == 1 && pat[len(pat)-1] == chMany {
   417  				return &fastMatcher{
   418  					collate:  collate,
   419  					pattern:  pat[:len(pat)-1],
   420  					isPrefix: true,
   421  				}
   422  			}
   423  		}
   424  	}
   425  
   426  	return &eightbitWildcard{
   427  		sort:    sort,
   428  		pattern: parsedPattern,
   429  	}
   430  }
   431  
   432  func (wc *eightbitWildcard) Match(in []byte) bool {
   433  	if wildcardRecursionDepth == 0 {
   434  		return wc.matchIter(in, wc.pattern)
   435  	}
   436  	return wc.matchRecursive(in, wc.pattern, 0) == matchOK
   437  }
   438  
   439  func (wc *eightbitWildcard) matchMany(in []byte, pat []int16, depth int) match {
   440  	var p0 int16
   441  
   442  many:
   443  	if len(pat) == 0 {
   444  		return matchOK
   445  	}
   446  
   447  	p0 = pat[0]
   448  	pat = pat[1:]
   449  
   450  	switch p0 {
   451  	case patternMatchMany:
   452  		goto many
   453  	case patternMatchOne:
   454  		if len(in) == 0 {
   455  			return matchFail
   456  		}
   457  		in = in[1:]
   458  		goto many
   459  	}
   460  
   461  	if len(in) == 0 {
   462  		return matchOver
   463  	}
   464  
   465  retry:
   466  	for len(in) > 0 {
   467  		if wc.sort[in[0]] == wc.sort[byte(p0)] {
   468  			break
   469  		}
   470  		in = in[1:]
   471  	}
   472  	if len(in) == 0 {
   473  		return matchOver
   474  	}
   475  	in = in[1:]
   476  
   477  	m := wc.matchRecursive(in, pat, depth+1)
   478  	if m == matchFail {
   479  		goto retry
   480  	}
   481  	return m
   482  }
   483  
   484  func (wc *eightbitWildcard) matchRecursive(in []byte, pat []int16, depth int) match {
   485  	if depth >= wildcardRecursionDepth {
   486  		return matchFail
   487  	}
   488  	for len(pat) > 0 {
   489  		if pat[0] == patternMatchMany {
   490  			return wc.matchMany(in, pat[1:], depth)
   491  		}
   492  
   493  		if len(in) == 0 {
   494  			return matchFail
   495  		}
   496  
   497  		switch {
   498  		case pat[0] == patternMatchOne:
   499  		case wc.sort[byte(pat[0])] == wc.sort[in[0]]:
   500  		default:
   501  			return matchFail
   502  		}
   503  
   504  		in = in[1:]
   505  		pat = pat[1:]
   506  	}
   507  
   508  	if len(in) == 0 {
   509  		return matchOK
   510  	}
   511  	return matchFail
   512  }
   513  
   514  func (wc *eightbitWildcard) matchIter(str []byte, pat []int16) bool {
   515  	var s []byte
   516  	var p []int16
   517  	var star = false
   518  
   519  retry:
   520  	s = str
   521  	p = pat
   522  	for len(s) > 0 {
   523  		var p0 int16
   524  		if len(p) > 0 {
   525  			p0 = p[0]
   526  		}
   527  
   528  		switch p0 {
   529  		case patternMatchOne:
   530  			break
   531  		case patternMatchMany:
   532  			star = true
   533  			str = s
   534  			pat = p[1:]
   535  			if len(pat) == 0 {
   536  				return true
   537  			}
   538  			goto retry
   539  		default:
   540  			if wc.sort[byte(p0)] != wc.sort[s[0]] {
   541  				goto starCheck
   542  			}
   543  		}
   544  		s = s[1:]
   545  		p = p[1:]
   546  	}
   547  	return len(p) == 0 || (len(p) == 1 && p[0] == patternMatchMany)
   548  
   549  starCheck:
   550  	if !star {
   551  		return false
   552  	}
   553  	str = str[1:]
   554  	goto retry
   555  }