github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/m3ninx/index/regexp_test.go (about) 1 // Copyright (c) 2018 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package index 22 23 import ( 24 "fmt" 25 "regexp/syntax" 26 "strings" 27 "testing" 28 "unicode" 29 30 "github.com/m3db/m3/src/x/tallytest" 31 32 "github.com/stretchr/testify/assert" 33 "github.com/stretchr/testify/require" 34 "github.com/uber-go/tally" 35 ) 36 37 func TestEnsureSyntaxPerlTreatsAnchorsAsTextTerminator(t *testing.T) { 38 // Test to ensure future compatibility with changes in `regexp/syntax`. 39 // 40 // We require that '^' and '$' only match input terminating characters (i.e. 41 // text boundaries, not line boundaries within the input). The line of code 42 // below ensures that syntax.Perl does the same. 43 require.NotZero(t, syntax.Perl&syntax.OneLine) 44 45 // ensure our `parseRegexp` internal function uses the right flags too. 46 re, err := parseRegexp(".*") 47 require.NoError(t, err) 48 require.NotZero(t, re.Flags&syntax.OneLine) 49 } 50 51 func TestEnsureRegexpUnachoredee(t *testing.T) { 52 ast, err := parseRegexp("(?:^abc$){0,4}") 53 require.NoError(t, err) 54 pprintAst(ast) 55 println(fmt.Sprintf("%v", dumpRegexp(ast))) 56 } 57 58 func TestEnsureRegexpUnachored(t *testing.T) { 59 testCases := []testCase{ 60 { 61 name: "naked ^", 62 input: "^", 63 expectedOutput: "emp{}", 64 }, 65 { 66 name: "naked $", 67 input: "$", 68 expectedOutput: "emp{}", 69 }, 70 { 71 name: "empty string ^$", 72 input: "^$", 73 expectedOutput: "cat{}", 74 }, 75 { 76 name: "invalid naked concat ^$", 77 input: "$^", 78 expectedOutput: "cat{eot{}bot{}}", 79 }, 80 { 81 name: "simple case of ^", 82 input: "^abc", 83 expectedOutput: "str{abc}", 84 }, 85 { 86 name: "simple case of $", 87 input: "abc$", 88 expectedOutput: "str{abc}", 89 }, 90 { 91 name: "simple case of both ^ & $", 92 input: "^abc$", 93 expectedOutput: "str{abc}", 94 }, 95 { 96 name: "weird case of internal ^", 97 input: "^a^bc$", 98 expectedOutput: "cat{lit{a}bot{}str{bc}}", 99 }, 100 { 101 name: "weird case of internal $", 102 input: "^a$bc$", 103 expectedOutput: "cat{lit{a}eot{}str{bc}}", 104 }, 105 { 106 name: "alternate of sub expressions with only legal ^ and $", 107 input: "(?:^abc$)|(?:^xyz$)", 108 expectedOutput: "alt{str{abc}str{xyz}}", 109 }, 110 { 111 name: "concat of sub expressions with only legal ^ and $", 112 input: "(^abc$)(?:^xyz$)", 113 expectedOutput: "cat{cap{cat{str{abc}eot{}}}bot{}str{xyz}}", 114 }, 115 { 116 name: "alternate of sub expressions with illegal ^ and $", 117 input: "(?:^a$bc$)|(?:^xyz$)", 118 expectedOutput: "alt{cat{lit{a}eot{}str{bc}}str{xyz}}", 119 }, 120 { 121 name: "concat of sub expressions with illegal ^ and $", 122 input: "(?:^a$bc$)(?:^xyz$)", 123 expectedOutput: "cat{lit{a}eot{}str{bc}eot{}bot{}str{xyz}}", 124 }, 125 { 126 name: "question mark case both boundaries success", 127 input: "(?:^abc$)?", 128 expectedOutput: "que{str{abc}}", 129 }, 130 { 131 name: "question mark case only ^", 132 input: "(?:^abc)?", 133 expectedOutput: "que{str{abc}}", 134 }, 135 { 136 name: "question mark case only $", 137 input: "(?:abc$)?", 138 expectedOutput: "que{str{abc}}", 139 }, 140 { 141 name: "question concat case $", 142 input: "abc$?", 143 expectedOutput: "str{abc}", 144 }, 145 { 146 name: "star mark case both boundaries success", 147 input: "(?:^abc$)*", 148 expectedOutput: "cat{que{str{abc}}star{cat{bot{}str{abc}eot{}}}}", 149 }, 150 { 151 name: "star mark case only ^", 152 input: "(?:^abc)*", 153 expectedOutput: "cat{que{str{abc}}star{cat{bot{}str{abc}}}}", 154 }, 155 { 156 name: "star mark case only $", 157 input: "(?:abc$)*", 158 expectedOutput: "cat{que{str{abc}}star{cat{str{abc}eot{}}}}", 159 }, 160 { 161 name: "star concat case $", 162 input: "abc$*", 163 expectedOutput: "cat{str{abc}star{eot{}}}", 164 }, 165 { 166 name: "star concat case ^", 167 input: "^*abc", 168 expectedOutput: "cat{star{bot{}}str{abc}}", 169 }, 170 { 171 name: "plus mark case both boundaries success", 172 input: "(?:^abc$)+", 173 expectedOutput: "cat{str{abc}star{cat{bot{}str{abc}eot{}}}}", 174 }, 175 { 176 name: "plus mark case with capturing group", 177 input: "(^abc$)+", 178 expectedOutput: "cat{cap{str{abc}}star{cap{cat{bot{}str{abc}eot{}}}}}", 179 }, 180 { 181 name: "plus mark case only ^", 182 input: "(?:^abc)+", 183 expectedOutput: "cat{str{abc}star{cat{bot{}str{abc}}}}", 184 }, 185 { 186 name: "plus mark case only $", 187 input: "(?:abc$)+", 188 expectedOutput: "cat{str{abc}star{cat{str{abc}eot{}}}}", 189 }, 190 { 191 name: "plus concat case $", 192 input: "abc$+", 193 expectedOutput: "cat{str{abc}star{eot{}}}", 194 }, 195 { 196 name: "plus concat case ^", 197 input: "^+abc", 198 expectedOutput: "cat{star{bot{}}str{abc}}", 199 }, 200 { 201 name: "repeat case both boundaries success", 202 input: "(?:^abc$){3,4}", 203 expectedOutput: "cat{str{abc}rep{2,3 cat{bot{}str{abc}eot{}}}}", 204 }, 205 { 206 name: "repeat case unbounded max", 207 input: "(?:^abc$){3,}", 208 expectedOutput: "cat{str{abc}rep{2,-1 cat{bot{}str{abc}eot{}}}}", 209 }, 210 { 211 name: "repeat case unbounded max with 1 min", 212 input: "(?:^abc$){1,2}", 213 expectedOutput: "cat{str{abc}rep{0,1 cat{bot{}str{abc}eot{}}}}", 214 }, 215 { 216 name: "repeat case unbounded max with 0 min", 217 input: "(?:^abc$){0,2}", 218 expectedOutput: "rep{0,2 cat{bot{}str{abc}eot{}}}", 219 }, 220 } 221 for _, tc := range testCases { 222 t.Run(tc.name, func(t *testing.T) { 223 re, err := parseRegexp(tc.input) 224 require.NoError(t, err) 225 parsed, err := EnsureRegexpUnanchored(re) 226 require.NoError(t, err) 227 assert.Equal(t, tc.expectedOutput, dumpRegexp(parsed)) 228 }) 229 } 230 } 231 232 func TestEnsureRegexpAnchored(t *testing.T) { 233 testCases := []testCase{ 234 { 235 name: "naked ^", 236 input: "(?:)", 237 expectedOutput: "cat{bot{}eot{\\z}}", 238 }, 239 { 240 name: "invalid naked concat ^$", 241 input: "$^", 242 expectedOutput: "cat{bot{}eot{}bot{}eot{\\z}}", 243 }, 244 { 245 name: "simple case of literal", 246 input: "abc", 247 expectedOutput: "cat{bot{}str{abc}eot{\\z}}", 248 }, 249 { 250 name: "weird case of internal ^", 251 input: "a^bc", 252 expectedOutput: "cat{bot{}lit{a}bot{}str{bc}eot{\\z}}", 253 }, 254 { 255 name: "weird case of internal $", 256 input: "a$bc", 257 expectedOutput: "cat{bot{}lit{a}eot{}str{bc}eot{\\z}}", 258 }, 259 { 260 name: "alternate of sub expressions with only legal ^ and $", 261 input: "abc|xyz", 262 expectedOutput: "cat{bot{}alt{str{abc}str{xyz}}eot{\\z}}", 263 }, 264 { 265 name: "concat of sub expressions with only legal ^ and $", 266 input: "(?:abc)(?:xyz)", 267 expectedOutput: "cat{bot{}str{abcxyz}eot{\\z}}", 268 }, 269 { 270 name: "question mark case both boundaries success", 271 input: "(?:abc)?", 272 expectedOutput: "cat{bot{}que{str{abc}}eot{\\z}}", 273 }, 274 { 275 name: "star mark case both boundaries success", 276 input: "(?:abc)*", 277 expectedOutput: "cat{bot{}star{str{abc}}eot{\\z}}", 278 }, 279 { 280 name: "plus mark case both boundaries success", 281 input: "(?:abc)+", 282 expectedOutput: "cat{bot{}plus{str{abc}}eot{\\z}}", 283 }, 284 { 285 name: "repeat case both boundaries success", 286 input: "(?:abc){3,4}", 287 expectedOutput: "cat{bot{}str{abc}str{abc}str{abc}que{str{abc}}eot{\\z}}", 288 }, 289 } 290 for _, tc := range testCases { 291 t.Run(tc.name, func(t *testing.T) { 292 re, err := parseRegexp(tc.input) 293 require.NoError(t, err) 294 parsed := EnsureRegexpAnchored(re) 295 assert.Equal(t, tc.expectedOutput, dumpRegexp(parsed)) 296 }) 297 } 298 } 299 300 type testCase struct { 301 name string 302 input string 303 expectedOutput string 304 } 305 306 // nolint 307 // only used for debugging 308 func pprintAst(ast *syntax.Regexp) { 309 println(fmt.Sprintf("%+v", *ast)) 310 for i, s := range ast.Sub { 311 println(fmt.Sprintf("%d>", i)) 312 pprintAst(s) 313 } 314 } 315 316 // NB(prateek): adapted from https://golang.org/src/regexp/syntax/parse_test.go#L315 317 var opNames = []string{ 318 syntax.OpNoMatch: "no", 319 syntax.OpEmptyMatch: "emp", 320 syntax.OpLiteral: "lit", 321 syntax.OpCharClass: "cc", 322 syntax.OpAnyCharNotNL: "dnl", 323 syntax.OpAnyChar: "dot", 324 syntax.OpBeginLine: "bol", 325 syntax.OpEndLine: "eol", 326 syntax.OpBeginText: "bot", 327 syntax.OpEndText: "eot", 328 syntax.OpWordBoundary: "wb", 329 syntax.OpNoWordBoundary: "nwb", 330 syntax.OpCapture: "cap", 331 syntax.OpStar: "star", 332 syntax.OpPlus: "plus", 333 syntax.OpQuest: "que", 334 syntax.OpRepeat: "rep", 335 syntax.OpConcat: "cat", 336 syntax.OpAlternate: "alt", 337 } 338 339 // dumpRegexp writes an encoding of the syntax tree for the regexp re to b. 340 // It is used during testing to distinguish between parses that might print 341 // the same using re's String method. 342 func dumpRegexp(re *syntax.Regexp) string { 343 var b strings.Builder 344 dumpRegexpHelper(&b, re) 345 return b.String() 346 } 347 348 func dumpRegexpHelper(b *strings.Builder, re *syntax.Regexp) { 349 if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { 350 fmt.Fprintf(b, "op%d", re.Op) 351 } else { 352 switch re.Op { 353 default: 354 b.WriteString(opNames[re.Op]) 355 case syntax.OpStar, syntax.OpPlus, syntax.OpQuest, syntax.OpRepeat: 356 if re.Flags&syntax.NonGreedy != 0 { 357 b.WriteByte('n') 358 } 359 b.WriteString(opNames[re.Op]) 360 case syntax.OpLiteral: 361 if len(re.Rune) > 1 { 362 b.WriteString("str") 363 } else { 364 b.WriteString("lit") 365 } 366 if re.Flags&syntax.FoldCase != 0 { 367 for _, r := range re.Rune { 368 if unicode.SimpleFold(r) != r { 369 b.WriteString("fold") 370 break 371 } 372 } 373 } 374 } 375 } 376 b.WriteByte('{') 377 switch re.Op { 378 case syntax.OpEndText: 379 if re.Flags&syntax.WasDollar == 0 { 380 b.WriteString(`\z`) 381 } 382 case syntax.OpLiteral: 383 for _, r := range re.Rune { 384 b.WriteRune(r) 385 } 386 case syntax.OpConcat, syntax.OpAlternate: 387 for _, sub := range re.Sub { 388 dumpRegexpHelper(b, sub) 389 } 390 case syntax.OpStar, syntax.OpPlus, syntax.OpQuest: 391 dumpRegexpHelper(b, re.Sub[0]) 392 case syntax.OpRepeat: 393 fmt.Fprintf(b, "%d,%d ", re.Min, re.Max) 394 dumpRegexpHelper(b, re.Sub[0]) 395 case syntax.OpCapture: 396 if re.Name != "" { 397 b.WriteString(re.Name) 398 b.WriteByte(':') 399 } 400 dumpRegexpHelper(b, re.Sub[0]) 401 case syntax.OpCharClass: 402 sep := "" 403 for i := 0; i < len(re.Rune); i += 2 { 404 b.WriteString(sep) 405 sep = " " 406 lo, hi := re.Rune[i], re.Rune[i+1] 407 if lo == hi { 408 fmt.Fprintf(b, "%#x", lo) 409 } else { 410 fmt.Fprintf(b, "%#x-%#x", lo, hi) 411 } 412 } 413 } 414 b.WriteByte('}') 415 } 416 417 func TestRegexpCache(t *testing.T) { 418 scope := tally.NewTestScope("", nil) 419 420 SetRegexpCacheOptions(RegexpCacheOptions{Size: 1, Scope: scope}) 421 defer SetRegexpCacheOptions(RegexpCacheOptions{Size: 0}) 422 423 _, err := CompileRegex([]byte("foo.*bar")) 424 require.NoError(t, err) 425 426 tallytest.AssertCounterValue(t, 1, scope.Snapshot(), "m3ninx.regexp.cache.miss", nil) 427 428 _, err = CompileRegex([]byte("foo.*bar")) 429 require.NoError(t, err) 430 431 tallytest.AssertCounterValue(t, 1, scope.Snapshot(), "m3ninx.regexp.cache.hit", nil) 432 }