github.com/m3db/m3@v1.5.0/src/m3ninx/index/regexp_test.go (about) 1 // Copyright (c) 2018 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package index 22 23 import ( 24 "fmt" 25 "regexp/syntax" 26 "strings" 27 "testing" 28 "unicode" 29 30 "github.com/m3db/m3/src/x/tallytest" 31 32 "github.com/stretchr/testify/assert" 33 "github.com/stretchr/testify/require" 34 "github.com/uber-go/tally" 35 ) 36 37 func TestEnsureSyntaxPerlTreatsAnchorsAsTextTerminator(t *testing.T) { 38 // Test to ensure future compatibility with changes in `regexp/syntax`. 39 // 40 // We require that '^' and '$' only match input terminating characters (i.e. 41 // text boundaries, not line boundaries within the input). The line of code 42 // below ensures that syntax.Perl does the same. 43 require.NotZero(t, syntax.Perl&syntax.OneLine) 44 45 // ensure our `parseRegexp` internal function uses the right flags too. 46 re, err := parseRegexp(".*") 47 require.NoError(t, err) 48 require.NotZero(t, re.Flags&syntax.OneLine) 49 } 50 51 func TestEnsureRegexpUnachoredee(t *testing.T) { 52 ast, err := parseRegexp("(?:^abc$){0,4}") 53 require.NoError(t, err) 54 pprintAst(ast) 55 println(fmt.Sprintf("%v", dumpRegexp(ast))) 56 } 57 58 func TestEnsureRegexpUnachored(t *testing.T) { 59 testCases := []testCase{ 60 testCase{ 61 name: "naked ^", 62 input: "^", 63 expectedOutput: "emp{}", 64 }, 65 testCase{ 66 name: "naked $", 67 input: "$", 68 expectedOutput: "emp{}", 69 }, 70 testCase{ 71 name: "empty string ^$", 72 input: "^$", 73 expectedOutput: "cat{}", 74 }, 75 testCase{ 76 name: "invalid naked concat ^$", 77 input: "$^", 78 expectedOutput: "cat{eot{}bot{}}", 79 }, 80 testCase{ 81 name: "simple case of ^", 82 input: "^abc", 83 expectedOutput: "str{abc}", 84 }, 85 testCase{ 86 name: "simple case of $", 87 input: "abc$", 88 expectedOutput: "str{abc}", 89 }, 90 testCase{ 91 name: "simple case of both ^ & $", 92 input: "^abc$", 93 expectedOutput: "str{abc}", 94 }, 95 testCase{ 96 name: "weird case of internal ^", 97 input: "^a^bc$", 98 expectedOutput: "cat{lit{a}bot{}str{bc}}", 99 }, 100 testCase{ 101 name: "weird case of internal $", 102 input: "^a$bc$", 103 expectedOutput: "cat{lit{a}eot{}str{bc}}", 104 }, 105 testCase{ 106 name: "alternate of sub expressions with only legal ^ and $", 107 input: "(?:^abc$)|(?:^xyz$)", 108 expectedOutput: "alt{str{abc}str{xyz}}", 109 }, 110 testCase{ 111 name: "concat of sub expressions with only legal ^ and $", 112 input: "(^abc$)(?:^xyz$)", 113 expectedOutput: "cat{cap{cat{str{abc}eot{}}}bot{}str{xyz}}", 114 }, 115 testCase{ 116 name: "alternate of sub expressions with illegal ^ and $", 117 input: "(?:^a$bc$)|(?:^xyz$)", 118 expectedOutput: "alt{cat{lit{a}eot{}str{bc}}str{xyz}}", 119 }, 120 testCase{ 121 name: "concat of sub expressions with illegal ^ and $", 122 input: "(?:^a$bc$)(?:^xyz$)", 123 expectedOutput: "cat{lit{a}eot{}str{bc}eot{}bot{}str{xyz}}", 124 }, 125 testCase{ 126 name: "question mark case both boundaries success", 127 input: "(?:^abc$)?", 128 expectedOutput: "que{str{abc}}", 129 }, 130 testCase{ 131 name: "question mark case only ^", 132 input: "(?:^abc)?", 133 expectedOutput: "que{str{abc}}", 134 }, 135 testCase{ 136 name: "question mark case only $", 137 input: "(?:abc$)?", 138 expectedOutput: "que{str{abc}}", 139 }, 140 testCase{ 141 name: "question concat case $", 142 input: "abc$?", 143 expectedOutput: "str{abc}", 144 }, 145 testCase{ 146 name: "star mark case both boundaries success", 147 input: "(?:^abc$)*", 148 expectedOutput: "cat{que{str{abc}}star{cat{bot{}str{abc}eot{}}}}", 149 }, 150 testCase{ 151 name: "star mark case only ^", 152 input: "(?:^abc)*", 153 expectedOutput: "cat{que{str{abc}}star{cat{bot{}str{abc}}}}", 154 }, 155 testCase{ 156 name: "star mark case only $", 157 input: "(?:abc$)*", 158 expectedOutput: "cat{que{str{abc}}star{cat{str{abc}eot{}}}}", 159 }, 160 testCase{ 161 name: "star concat case $", 162 input: "abc$*", 163 expectedOutput: "cat{str{abc}star{eot{}}}", 164 }, 165 testCase{ 166 name: "star concat case ^", 167 input: "^*abc", 168 expectedOutput: "cat{star{bot{}}str{abc}}", 169 }, 170 testCase{ 171 name: "plus mark case both boundaries success", 172 input: "(?:^abc$)+", 173 expectedOutput: "cat{str{abc}star{cat{bot{}str{abc}eot{}}}}", 174 }, 175 testCase{ 176 name: "plus mark case with capturing group", 177 input: "(^abc$)+", 178 expectedOutput: "cat{cap{str{abc}}star{cap{cat{bot{}str{abc}eot{}}}}}", 179 }, 180 testCase{ 181 name: "plus mark case only ^", 182 input: "(?:^abc)+", 183 expectedOutput: "cat{str{abc}star{cat{bot{}str{abc}}}}", 184 }, 185 testCase{ 186 name: "plus mark case only $", 187 input: "(?:abc$)+", 188 expectedOutput: "cat{str{abc}star{cat{str{abc}eot{}}}}", 189 }, 190 testCase{ 191 name: "plus concat case $", 192 input: "abc$+", 193 expectedOutput: "cat{str{abc}star{eot{}}}", 194 }, 195 testCase{ 196 name: "plus concat case ^", 197 input: "^+abc", 198 expectedOutput: "cat{star{bot{}}str{abc}}", 199 }, 200 testCase{ 201 name: "repeat case both boundaries success", 202 input: "(?:^abc$){3,4}", 203 expectedOutput: "cat{str{abc}rep{2,3 cat{bot{}str{abc}eot{}}}}", 204 }, 205 testCase{ 206 name: "repeat case unbounded max", 207 input: "(?:^abc$){3,}", 208 expectedOutput: "cat{str{abc}rep{2,-1 cat{bot{}str{abc}eot{}}}}", 209 }, 210 testCase{ 211 name: "repeat case unbounded max with 1 min", 212 input: "(?:^abc$){1,2}", 213 expectedOutput: "cat{str{abc}rep{0,1 cat{bot{}str{abc}eot{}}}}", 214 }, 215 testCase{ 216 name: "repeat case unbounded max with 0 min", 217 input: "(?:^abc$){0,2}", 218 expectedOutput: "rep{0,2 cat{bot{}str{abc}eot{}}}", 219 }, 220 } 221 for _, tc := range testCases { 222 t.Run(tc.name, func(t *testing.T) { 223 re, err := parseRegexp(tc.input) 224 require.NoError(t, err) 225 parsed, err := ensureRegexpUnanchored(re) 226 require.NoError(t, err) 227 assert.Equal(t, tc.expectedOutput, dumpRegexp(parsed)) 228 }) 229 } 230 } 231 232 func TestEnsureRegexpAnchored(t *testing.T) { 233 testCases := []testCase{ 234 testCase{ 235 name: "naked ^", 236 input: "(?:)", 237 expectedOutput: "cat{bot{}eot{\\z}}", 238 }, 239 testCase{ 240 name: "invalid naked concat ^$", 241 input: "$^", 242 expectedOutput: "cat{bot{}eot{}bot{}eot{\\z}}", 243 }, 244 testCase{ 245 name: "simple case of literal", 246 input: "abc", 247 expectedOutput: "cat{bot{}str{abc}eot{\\z}}", 248 }, 249 testCase{ 250 name: "weird case of internal ^", 251 input: "a^bc", 252 expectedOutput: "cat{bot{}lit{a}bot{}str{bc}eot{\\z}}", 253 }, 254 testCase{ 255 name: "weird case of internal $", 256 input: "a$bc", 257 expectedOutput: "cat{bot{}lit{a}eot{}str{bc}eot{\\z}}", 258 }, 259 testCase{ 260 name: "alternate of sub expressions with only legal ^ and $", 261 input: "abc|xyz", 262 expectedOutput: "cat{bot{}alt{str{abc}str{xyz}}eot{\\z}}", 263 }, 264 testCase{ 265 name: "concat of sub expressions with only legal ^ and $", 266 input: "(?:abc)(?:xyz)", 267 expectedOutput: "cat{bot{}str{abcxyz}eot{\\z}}", 268 }, 269 testCase{ 270 name: "question mark case both boundaries success", 271 input: "(?:abc)?", 272 expectedOutput: "cat{bot{}que{str{abc}}eot{\\z}}", 273 }, 274 testCase{ 275 name: "star mark case both boundaries success", 276 input: "(?:abc)*", 277 expectedOutput: "cat{bot{}star{str{abc}}eot{\\z}}", 278 }, 279 testCase{ 280 name: "plus mark case both boundaries success", 281 input: "(?:abc)+", 282 expectedOutput: "cat{bot{}plus{str{abc}}eot{\\z}}", 283 }, 284 testCase{ 285 name: "repeat case both boundaries success", 286 input: "(?:abc){3,4}", 287 expectedOutput: "cat{bot{}str{abc}str{abc}str{abc}que{str{abc}}eot{\\z}}", 288 }, 289 } 290 for _, tc := range testCases { 291 t.Run(tc.name, func(t *testing.T) { 292 re, err := parseRegexp(tc.input) 293 require.NoError(t, err) 294 parsed, err := ensureRegexpAnchored(re) 295 require.NoError(t, err) 296 assert.Equal(t, tc.expectedOutput, dumpRegexp(parsed)) 297 }) 298 } 299 } 300 301 type testCase struct { 302 name string 303 input string 304 expectedOutput string 305 } 306 307 // nolint 308 // only used for debugging 309 func pprintAst(ast *syntax.Regexp) { 310 println(fmt.Sprintf("%+v", *ast)) 311 for i, s := range ast.Sub { 312 println(fmt.Sprintf("%d>", i)) 313 pprintAst(s) 314 } 315 } 316 317 // NB(prateek): adapted from https://golang.org/src/regexp/syntax/parse_test.go#L315 318 var opNames = []string{ 319 syntax.OpNoMatch: "no", 320 syntax.OpEmptyMatch: "emp", 321 syntax.OpLiteral: "lit", 322 syntax.OpCharClass: "cc", 323 syntax.OpAnyCharNotNL: "dnl", 324 syntax.OpAnyChar: "dot", 325 syntax.OpBeginLine: "bol", 326 syntax.OpEndLine: "eol", 327 syntax.OpBeginText: "bot", 328 syntax.OpEndText: "eot", 329 syntax.OpWordBoundary: "wb", 330 syntax.OpNoWordBoundary: "nwb", 331 syntax.OpCapture: "cap", 332 syntax.OpStar: "star", 333 syntax.OpPlus: "plus", 334 syntax.OpQuest: "que", 335 syntax.OpRepeat: "rep", 336 syntax.OpConcat: "cat", 337 syntax.OpAlternate: "alt", 338 } 339 340 // dumpRegexp writes an encoding of the syntax tree for the regexp re to b. 341 // It is used during testing to distinguish between parses that might print 342 // the same using re's String method. 343 func dumpRegexp(re *syntax.Regexp) string { 344 var b strings.Builder 345 dumpRegexpHelper(&b, re) 346 return b.String() 347 } 348 349 func dumpRegexpHelper(b *strings.Builder, re *syntax.Regexp) { 350 if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { 351 fmt.Fprintf(b, "op%d", re.Op) 352 } else { 353 switch re.Op { 354 default: 355 b.WriteString(opNames[re.Op]) 356 case syntax.OpStar, syntax.OpPlus, syntax.OpQuest, syntax.OpRepeat: 357 if re.Flags&syntax.NonGreedy != 0 { 358 b.WriteByte('n') 359 } 360 b.WriteString(opNames[re.Op]) 361 case syntax.OpLiteral: 362 if len(re.Rune) > 1 { 363 b.WriteString("str") 364 } else { 365 b.WriteString("lit") 366 } 367 if re.Flags&syntax.FoldCase != 0 { 368 for _, r := range re.Rune { 369 if unicode.SimpleFold(r) != r { 370 b.WriteString("fold") 371 break 372 } 373 } 374 } 375 } 376 } 377 b.WriteByte('{') 378 switch re.Op { 379 case syntax.OpEndText: 380 if re.Flags&syntax.WasDollar == 0 { 381 b.WriteString(`\z`) 382 } 383 case syntax.OpLiteral: 384 for _, r := range re.Rune { 385 b.WriteRune(r) 386 } 387 case syntax.OpConcat, syntax.OpAlternate: 388 for _, sub := range re.Sub { 389 dumpRegexpHelper(b, sub) 390 } 391 case syntax.OpStar, syntax.OpPlus, syntax.OpQuest: 392 dumpRegexpHelper(b, re.Sub[0]) 393 case syntax.OpRepeat: 394 fmt.Fprintf(b, "%d,%d ", re.Min, re.Max) 395 dumpRegexpHelper(b, re.Sub[0]) 396 case syntax.OpCapture: 397 if re.Name != "" { 398 b.WriteString(re.Name) 399 b.WriteByte(':') 400 } 401 dumpRegexpHelper(b, re.Sub[0]) 402 case syntax.OpCharClass: 403 sep := "" 404 for i := 0; i < len(re.Rune); i += 2 { 405 b.WriteString(sep) 406 sep = " " 407 lo, hi := re.Rune[i], re.Rune[i+1] 408 if lo == hi { 409 fmt.Fprintf(b, "%#x", lo) 410 } else { 411 fmt.Fprintf(b, "%#x-%#x", lo, hi) 412 } 413 } 414 } 415 b.WriteByte('}') 416 } 417 418 func TestRegexpCache(t *testing.T) { 419 scope := tally.NewTestScope("", nil) 420 421 SetRegexpCacheOptions(RegexpCacheOptions{Size: 1, Scope: scope}) 422 defer SetRegexpCacheOptions(RegexpCacheOptions{Size: 0}) 423 424 _, err := CompileRegex([]byte("foo.*bar")) 425 require.NoError(t, err) 426 427 tallytest.AssertCounterValue(t, 1, scope.Snapshot(), "m3ninx.regexp.cache.miss", nil) 428 429 _, err = CompileRegex([]byte("foo.*bar")) 430 require.NoError(t, err) 431 432 tallytest.AssertCounterValue(t, 1, scope.Snapshot(), "m3ninx.regexp.cache.hit", nil) 433 }