vitess.io/vitess@v0.16.2/go/mysql/collations/wildcard_test.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package collations
    18  
    19  import (
    20  	"testing"
    21  
    22  	"github.com/stretchr/testify/assert"
    23  
    24  	"vitess.io/vitess/go/mysql/collations/internal/charset"
    25  )
    26  
    27  type wildcardtest struct {
    28  	in, pat string
    29  	match   bool
    30  }
    31  
    32  func testWildcardMatches(t *testing.T, collName string, chOne, chMany, chEsc rune, cases []wildcardtest) {
    33  	t.Run(collName, func(t *testing.T) {
    34  		coll := testcollation(t, collName)
    35  		for _, tc := range cases {
    36  			pat := coll.Wildcard([]byte(tc.pat), chOne, chMany, chEsc)
    37  			match := pat.Match([]byte(tc.in))
    38  			assert.Equal(t, tc.match, match, "%q LIKE %q = %v (expected %v)", tc.in, tc.pat, match, tc.match)
    39  
    40  		}
    41  	})
    42  }
    43  
    44  func TestLikeMatches(t *testing.T) {
    45  	testWildcardMatches(t, "utf8mb4_0900_ai_ci", 0, 0, 0, []wildcardtest{
    46  		{"abc", "abc", true},
    47  		{"Abc", "aBc", true},
    48  		{"abc", "_bc", true},
    49  		{"abc", "a_c", true},
    50  		{"abc", "ab_", true},
    51  		{"abc", "%c", true},
    52  		{"abc", "a%c", true},
    53  		{"abc", "a%", true},
    54  		{"abcdef", "a%d_f", true},
    55  		{"abcdefg", "a%d%g", true},
    56  		{"a\\", "a\\", true},
    57  		{"aa\\", "a%\\", true},
    58  		{"Y", "\u00dd", true},
    59  		{"abcd", "abcde", false},
    60  		{"abcde", "abcd", false},
    61  		{"abcde", "a%f", false},
    62  		{"abcdef", "a%%f", true},
    63  		{"abcd", "a__d", true},
    64  		{"abcd", "a\\bcd", true},
    65  		{"a\\bcd", "abcd", false},
    66  		{"abdbcd", "a%cd", true},
    67  		{"abecd", "a%bd", false},
    68  	})
    69  
    70  	testWildcardMatches(t, "utf8mb4_0900_as_cs", 0, 0, 0, []wildcardtest{
    71  		{"abc", "abc", true},
    72  		{"Abc", "aBc", false},
    73  		{"abc", "_bc", true},
    74  		{"abc", "a_c", true},
    75  		{"abc", "ab_", true},
    76  		{"abc", "%c", true},
    77  		{"abc", "a%c", true},
    78  		{"abc", "a%", true},
    79  		{"abcdef", "a%d_f", true},
    80  		{"abcdefg", "a%d%g", true},
    81  		{"a\\", "a\\", true},
    82  		{"aa\\", "a%\\", true},
    83  		{"Y", "\u00dd", false},
    84  		{"abcd", "abcde", false},
    85  		{"abcde", "abcd", false},
    86  		{"abcde", "a%f", false},
    87  		{"abcdef", "a%%f", true},
    88  		{"abcd", "a__d", true},
    89  		{"abcd", "a\\bcd", true},
    90  		{"a\\bcd", "abcd", false},
    91  		{"abdbcd", "a%cd", true},
    92  		{"abecd", "a%bd", false},
    93  	})
    94  
    95  	testWildcardMatches(t, "utf8mb4_0900_as_ci", 0, 0, 0, []wildcardtest{
    96  		{"ǎḄÇ", "Ǎḅç", true},
    97  		{"ÁḆĈ", "Ǎḅç", false},
    98  		{"ǍBc", "_bc", true},
    99  		{"Aḅc", "a_c", true},
   100  		{"Abç", "ab_", true},
   101  		{"Ǎḅç", "%ç", true},
   102  		{"Ǎḅç", "ǎ%Ç", true},
   103  		{"aḅç", "a%", true},
   104  		{"Ǎḅçdef", "ǎ%d_f", true},
   105  		{"Ǎḅçdefg", "ǎ%d%g", true},
   106  		{"ǎ\\", "Ǎ\\", true},
   107  		{"ǎa\\", "Ǎ%\\", true},
   108  		{"Y", "\u00dd", false},
   109  		{"abcd", "Ǎḅçde", false},
   110  		{"abcde", "Ǎḅçd", false},
   111  		{"Ǎḅçde", "a%f", false},
   112  		{"Ǎḅçdef", "ǎ%%f", true},
   113  		{"Ǎḅçd", "ǎ__d", true},
   114  		{"Ǎḅçd", "ǎ\\ḄÇd", true},
   115  		{"a\\bcd", "Ǎḅçd", false},
   116  		{"Ǎḅdbçd", "ǎ%Çd", true},
   117  		{"Ǎḅeçd", "a%bd", false},
   118  	})
   119  }
   120  
// wildcardTestCases is the shared table of (input, pattern, expected match)
// rows used by TestWildcardMatches and BenchmarkWildcardMatching. Those
// callers compile the patterns with '?' as the single-character wildcard,
// '*' as the multi-character wildcard and '\\' as the escape character.
//
// from http://developforperformance.com/MatchingWildcards_AnImprovedAlgorithmForBigData.html
// Copyright 2018 IBM Corporation
// Licensed under the Apache License, Version 2.0
var wildcardTestCases = []wildcardtest{
	// Patterns that primarily exercise the `*` wildcard, alone and combined
	// with `?`, including long inputs with many repeated wildcards.
	{"Hi", "Hi*", true},
	{"abc", "ab*d", false},
	{"abcccd", "*ccd", true},
	{"mississipissippi", "*issip*ss*", true},
	{"xxxx*zzzzzzzzy*f", "xxxx*zzy*fffff", false},
	{"xxxx*zzzzzzzzy*f", "xxx*zzy*f", true},
	{"xxxxzzzzzzzzyf", "xxxx*zzy*fffff", false},
	{"xxxxzzzzzzzzyf", "xxxx*zzy*f", true},
	{"xyxyxyzyxyz", "xy*z*xyz", true},
	{"mississippi", "*sip*", true},
	{"xyxyxyxyz", "xy*xyz", true},
	{"mississippi", "mi*sip*", true},
	{"ababac", "*abac*", true},
	{"ababac", "*abac*", true},
	{"aaazz", "a*zz*", true},
	{"a12b12", "*12*23", false},
	{"a12b12", "a12b", false},
	{"a12b12", "*12*12*", true},
	{"caaab", "*a?b", true},
	{"*", "*", true},
	{"a*abab", "a*b", true},
	{"a*r", "a*", true},
	{"a*ar", "a*aar", false},
	{"XYXYXYZYXYz", "XY*Z*XYz", true},
	{"missisSIPpi", "*SIP*", true},
	{"mississipPI", "*issip*PI", true},
	{"xyxyxyxyz", "xy*xyz", true},
	{"miSsissippi", "mi*sip*", true},
	{"miSsissippi", "mi*Sip*", false},
	{"abAbac", "*Abac*", true},
	{"abAbac", "*Abac*", true},
	{"aAazz", "a*zz*", true},
	{"A12b12", "*12*23", false},
	{"a12B12", "*12*12*", true},
	{"oWn", "*oWn*", true},
	{"bLah", "bLah", true},
	{"bLah", "bLaH", false},
	{"a", "*?", true},
	{"ab", "*?", true},
	{"abc", "*?", true},
	{"a", "??", false},
	{"ab", "?*?", true},
	{"ab", "*?*?*", true},
	{"abc", "?**?*?", true},
	{"abc", "?**?*&?", false},
	{"abcd", "?b*??", true},
	{"abcd", "?a*??", false},
	{"abcd", "?**?c?", true},
	{"abcd", "?**?d?", false},
	{"abcde", "?*b*?*d*?", true},
	{"bLah", "bL?h", true},
	{"bLaaa", "bLa?", false},
	{"bLah", "bLa?", true},
	{"bLaH", "?Lah", false},
	{"bLaH", "?LaH", true},
	{"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab", "a*a*a*a*a*a*aa*aaa*a*a*b", true},
	{"abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", "*a*b*ba*ca*a*aa*aaa*fa*ga*b*", true},
	{"abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", "*a*b*ba*ca*a*x*aaa*fa*ga*b*", false},
	{"abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", "*a*b*ba*ca*aaaa*fa*ga*gggg*b*", false},
	{"abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", "*a*b*ba*ca*aaaa*fa*ga*ggg*b*", true},
	{"aaabbaabbaab", "*aabbaa*a*", true},
	{"a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*", "a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*", true},
	{"aaaaaaaaaaaaaaaaa", "*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*", true},
	{"aaaaaaaaaaaaaaaa", "*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*", false},
	{"abc*abcd*abcde*abcdef*abcdefg*abcdefgh*abcdefghi*abcdefghij*abcdefghijk*abcdefghijkl*abcdefghijklm*abcdefghijklmn", "abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*", false},
	{"abc*abcd*abcde*abcdef*abcdefg*abcdefgh*abcdefghi*abcdefghij*abcdefghijk*abcdefghijkl*abcdefghijklm*abcdefghijklmn", "abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*", true},
	{"abc*abcd*abcd*abc*abcd", "abc*abc*abc*abc*abc", false},
	{"abc*abcd*abcd*abc*abcd*abcd*abc*abcd*abc*abc*abcd", "abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abcd", true},
	{"abc", "********a********b********c********", true},
	{"********a********b********c********", "abc", false},
	{"abc", "********a********b********b********", false},
	{"*abc*", "***a*b*c***", true},
	{"", "?", false},
	{"", "*?", false},
	{"", "", true},
	{"a", "", false},

	// Patterns made mostly of literal text; several use `?` and one uses `*`.
	{"abc", "abd", false},
	{"abcccd", "abcccd", true},
	{"mississipissippi", "mississipissippi", true},
	{"xxxxzzzzzzzzyf", "xxxxzzzzzzzzyfffff", false},
	{"xxxxzzzzzzzzyf", "xxxxzzzzzzzzyf", true},
	{"xxxxzzzzzzzzyf", "xxxxzzy.fffff", false},
	{"xxxxzzzzzzzzyf", "xxxxzzzzzzzzyf", true},
	{"xyxyxyzyxyz", "xyxyxyzyxyz", true},
	{"mississippi", "mississippi", true},
	{"xyxyxyxyz", "xyxyxyxyz", true},
	{"m ississippi", "m ississippi", true},
	{"ababac", "ababac?", false},
	{"dababac", "ababac", false},
	{"aaazz", "aaazz", true},
	{"a12b12", "1212", false},
	{"a12b12", "a12b", false},
	{"a12b12", "a12b12", true},
	{"n", "n", true},
	{"aabab", "aabab", true},
	{"ar", "ar", true},
	{"aar", "aaar", false},
	{"XYXYXYZYXYz", "XYXYXYZYXYz", true},
	{"missisSIPpi", "missisSIPpi", true},
	{"mississipPI", "mississipPI", true},
	{"xyxyxyxyz", "xyxyxyxyz", true},
	{"miSsissippi", "miSsissippi", true},
	{"miSsissippi", "miSsisSippi", false},
	{"abAbac", "abAbac", true},
	{"abAbac", "abAbac", true},
	{"aAazz", "aAazz", true},
	{"A12b12", "A12b123", false},
	{"a12B12", "a12B12", true},
	{"oWn", "oWn", true},
	{"bLah", "bLah", true},
	{"bLah", "bLaH", false},
	{"a", "a", true},
	{"ab", "a?", true},
	{"abc", "ab?", true},
	{"a", "??", false},
	{"ab", "??", true},
	{"abc", "???", true},
	{"abcd", "????", true},
	{"abc", "????", false},
	{"abcd", "?b??", true},
	{"abcd", "?a??", false},
	{"abcd", "??c?", true},
	{"abcd", "??d?", false},
	{"abcde", "?b?d*?", true},
	{"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab", true},
	{"abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", true},
	{"abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", "abababababababababababababababababababaacacacacacacacadaeafagahaiajaxalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", false},
	{"abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaggggagaaaaaaaab", false},
	{"abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaaaaaaaaaaaaffafagaagggagaaaaaaaab", true},
	{"aaabbaabbaab", "aaabbaabbaab", true},
	{"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", true},
	{"aaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaa", true},
	{"aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaa", false},
	{"abcabcdabcdeabcdefabcdefgabcdefghabcdefghiabcdefghijabcdefghijkabcdefghijklabcdefghijklmabcdefghijklmn", "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc", false},
	{"abcabcdabcdeabcdefabcdefgabcdefghabcdefghiabcdefghijabcdefghijkabcdefghijklabcdefghijklmabcdefghijklmn", "abcabcdabcdeabcdefabcdefgabcdefghabcdefghiabcdefghijabcdefghijkabcdefghijklabcdefghijklmabcdefghijklmn", true},
	{"abcabcdabcdabcabcd", "abcabc?abcabcabc", false},
	{"abcabcdabcdabcabcdabcdabcabcdabcabcabcd", "abcabc?abc?abcabc?abc?abc?bc?abc?bc?bcd", true},
	{"?abc?", "?abc?", true},

	// Rows where the input, the pattern, or both are the empty string.
	{"", "abd", false},
	{"", "abcccd", false},
	{"", "mississipissippi", false},
	{"", "xxxxzzzzzzzzyfffff", false},
	{"", "xxxxzzzzzzzzyf", false},
	{"", "xxxxzzy.fffff", false},
	{"", "xxxxzzzzzzzzyf", false},
	{"", "xyxyxyzyxyz", false},
	{"", "mississippi", false},
	{"", "xyxyxyxyz", false},
	{"", "m ississippi", false},
	{"", "ababac*", false},
	{"", "ababac", false},
	{"", "aaazz", false},
	{"", "1212", false},
	{"", "a12b", false},
	{"", "a12b12", false},
	{"", "n", false},
	{"", "aabab", false},
	{"", "ar", false},
	{"", "aaar", false},
	{"", "XYXYXYZYXYz", false},
	{"", "missisSIPpi", false},
	{"", "mississipPI", false},
	{"", "xyxyxyxyz", false},
	{"", "miSsissippi", false},
	{"", "miSsisSippi", false},
	{"", "abAbac", false},
	{"", "abAbac", false},
	{"", "aAazz", false},
	{"", "A12b123", false},
	{"", "a12B12", false},
	{"", "oWn", false},
	{"", "bLah", false},
	{"", "bLaH", false},
	{"", "", true},
	{"abc", "", false},
	{"abcccd", "", false},
	{"mississipissippi", "", false},
	{"xxxxzzzzzzzzyf", "", false},
	{"xxxxzzzzzzzzyf", "", false},
	{"xxxxzzzzzzzzyf", "", false},
	{"xxxxzzzzzzzzyf", "", false},
	{"xyxyxyzyxyz", "", false},
	{"mississippi", "", false},
	{"xyxyxyxyz", "", false},
	{"m ississippi", "", false},
	{"ababac", "", false},
	{"dababac", "", false},
	{"aaazz", "", false},
	{"a12b12", "", false},
	{"a12b12", "", false},
	{"a12b12", "", false},
	{"n", "", false},
	{"aabab", "", false},
	{"ar", "", false},
	{"aar", "", false},
	{"XYXYXYZYXYz", "", false},
	{"missisSIPpi", "", false},
	{"mississipPI", "", false},
	{"xyxyxyxyz", "", false},
	{"miSsissippi", "", false},
	{"miSsissippi", "", false},
	{"abAbac", "", false},
	{"abAbac", "", false},
	{"aAazz", "", false},
	{"A12b12", "", false},
	{"a12B12", "", false},
	{"oWn", "", false},
	{"bLah", "", false},
	{"bLah", "", false},
}
   337  
   338  func identity(a, b rune) bool {
   339  	return a == b
   340  }
   341  
   342  func TestWildcardMatches(t *testing.T) {
   343  	t.Run("UnicodeWildcardMatcher (no optimization)", func(t *testing.T) {
   344  		for _, tc := range wildcardTestCases {
   345  			wildcard := newUnicodeWildcardMatcher(charset.Charset_utf8mb4{}, identity, nil, []byte(tc.pat), '?', '*', '\\')
   346  			match := wildcard.Match([]byte(tc.in))
   347  			assert.Equal(t, tc.match, match, "wildcard(%q, %q) = %v (expected %v)", tc.in, tc.pat, match, tc.match)
   348  
   349  		}
   350  	})
   351  
   352  	t.Run("EightbitWildcardMatcher (no optimization)", func(t *testing.T) {
   353  		for _, tc := range wildcardTestCases {
   354  			wildcard := newEightbitWildcardMatcher(&sortOrderIdentity, nil, []byte(tc.pat), '?', '*', '\\')
   355  			match := wildcard.Match([]byte(tc.in))
   356  			assert.Equal(t, tc.match, match, "wildcard(%q, %q) = %v (expected %v)", tc.in, tc.pat, match, tc.match)
   357  
   358  		}
   359  	})
   360  
   361  	testWildcardMatches(t, "utf8mb4_0900_bin", '?', '*', '\\', wildcardTestCases)
   362  	testWildcardMatches(t, "utf8mb4_0900_as_cs", '?', '*', '\\', wildcardTestCases)
   363  }
   364  
   365  func BenchmarkWildcardMatching(b *testing.B) {
   366  	type bench struct {
   367  		input []byte
   368  		m1    WildcardPattern
   369  		m2    WildcardPattern
   370  	}
   371  
   372  	var patterns []bench
   373  	for _, tc := range wildcardTestCases {
   374  		patterns = append(patterns, bench{
   375  			input: []byte(tc.in),
   376  			m1:    newUnicodeWildcardMatcher(charset.Charset_utf8mb4{}, identity, nil, []byte(tc.pat), '?', '*', '\\'),
   377  			m2:    newEightbitWildcardMatcher(&sortOrderIdentity, nil, []byte(tc.pat), '?', '*', '\\'),
   378  		})
   379  	}
   380  
   381  	b.Run("unicode", func(b *testing.B) {
   382  		b.ReportAllocs()
   383  		b.ResetTimer()
   384  
   385  		for n := 0; n < b.N; n++ {
   386  			for _, bb := range patterns {
   387  				_ = bb.m1.Match(bb.input)
   388  			}
   389  		}
   390  	})
   391  
   392  	b.Run("8bit", func(b *testing.B) {
   393  		b.ReportAllocs()
   394  		b.ResetTimer()
   395  
   396  		for n := 0; n < b.N; n++ {
   397  			for _, bb := range patterns {
   398  				_ = bb.m2.Match(bb.input)
   399  			}
   400  		}
   401  	})
   402  }