github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/m3ninx/index/regexp.go (about) 1 // Copyright (c) 2018 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package index 22 23 import ( 24 "fmt" 25 re "regexp" 26 "regexp/syntax" 27 "sync" 28 29 fstregexp "github.com/m3db/m3/src/m3ninx/index/segment/fst/regexp" 30 "github.com/m3db/m3/src/x/cache" 31 32 "github.com/uber-go/tally" 33 ) 34 35 var ( 36 // dotStartCompiledRegex is a CompileRegex that matches any input. 37 // NB: It can be accessed through DotStartCompiledRegex(). 
38 dotStarCompiledRegex CompiledRegex 39 ) 40 41 func init() { 42 re, err := CompileRegex([]byte(".*")) 43 if err != nil { 44 panic(err.Error()) 45 } 46 dotStarCompiledRegex = re 47 } 48 49 var ( 50 // cache for regexes, as per Go std lib: 51 // A Regexp is safe for concurrent use by multiple goroutines, except for 52 // configuration methods, such as Longest. 53 // The vellum Regexp is also safe for concurrent use as it is query for 54 // states but does not mutate internal state. 55 regexpCacheLock sync.RWMutex 56 regexpCache *cache.LRU 57 regexpCacheSize int 58 regexpCacheMetrics *cacheMetrics 59 ) 60 61 type cacheMetrics struct { 62 hit tally.Counter 63 miss tally.Counter 64 unwrapSuccess tally.Counter 65 unwrapError tally.Counter 66 } 67 68 // RegexpCacheOptions is a set of regexp cache options. 69 type RegexpCacheOptions struct { 70 Size int 71 Scope tally.Scope 72 } 73 74 // SetRegexpCacheOptions sets the regex cache options, size zero disables cache. 75 func SetRegexpCacheOptions(opts RegexpCacheOptions) { 76 regexpCacheLock.Lock() 77 defer regexpCacheLock.Unlock() 78 79 if opts.Size < 1 { 80 regexpCache = nil 81 regexpCacheMetrics = nil 82 return 83 } 84 85 scope := tally.NoopScope 86 if opts.Scope != nil { 87 scope = opts.Scope 88 } 89 90 scope = scope.SubScope("m3ninx").SubScope("regexp").SubScope("cache") 91 regexpCache = cache.NewLRU(&cache.LRUOptions{ 92 MaxEntries: opts.Size, 93 Metrics: scope.SubScope("lru"), 94 }) 95 regexpCacheMetrics = &cacheMetrics{ 96 hit: scope.Counter("hit"), 97 miss: scope.Counter("miss"), 98 unwrapSuccess: scope.SubScope("unwrap").Counter("success"), 99 unwrapError: scope.SubScope("unwrap").Counter("error"), 100 } 101 } 102 103 // DotStarCompiledRegex returns a regexp which matches ".*". 104 func DotStarCompiledRegex() CompiledRegex { 105 return dotStarCompiledRegex 106 } 107 108 // CompileRegex compiles the provided regexp into an object that can be used to query the various 109 // segment implementations. 
110 func CompileRegex(r []byte) (CompiledRegex, error) { 111 // NB(prateek): We currently use two segment implementations: map-backed, and fst-backed (Vellum). 112 // Due to peculiarities in the implementation of Vellum, we have to make certain modifications 113 // to all incoming regular expressions to ensure compatibility between them. 114 115 reString := string(r) 116 117 // Check cache first. 118 regexpCacheLock.RLock() 119 cacheLRU := regexpCache 120 cacheLRUMetrics := regexpCacheMetrics 121 regexpCacheLock.RUnlock() 122 123 if cacheLRU != nil && cacheLRUMetrics != nil { 124 cached, ok := regexpCache.TryGet(reString) 125 if !ok { 126 cacheLRUMetrics.miss.Inc(1) 127 } else { 128 cacheLRUMetrics.hit.Inc(1) 129 if unwrapped, ok := cached.(*CompiledRegex); ok { 130 cacheLRUMetrics.unwrapSuccess.Inc(1) 131 return *unwrapped, nil 132 } 133 // Unable to unwrap into expected type. 134 cacheLRUMetrics.unwrapError.Inc(1) 135 } 136 } 137 138 // first, we parse the regular expression into the equivalent regex 139 reAst, err := parseRegexp(reString) 140 if err != nil { 141 return CompiledRegex{}, err 142 } 143 144 // Issue (a): Vellum does not allow regexps which use characters '^', or '$'. 145 // To address this issue, we strip these characters from appropriate locations in the parsed syntax.Regexp 146 // for Vellum's RE. 147 vellumRe, err := EnsureRegexpUnanchored(reAst) 148 if err != nil { 149 return CompiledRegex{}, fmt.Errorf("unable to create FST re: %v", err) 150 } 151 152 // Issue (b): Vellum treats every regular expression as anchored, where as the map-backed segment does not. 153 // To address this issue, we ensure that every incoming regular expression is modified to be anchored 154 // when querying the map-backed segment, and isn't anchored when querying Vellum's RE. 
155 simpleRe := EnsureRegexpAnchored(vellumRe) 156 157 simpleRE, err := re.Compile(simpleRe.String()) 158 if err != nil { 159 return CompiledRegex{}, err 160 } 161 compiledRegex := CompiledRegex{ 162 Simple: simpleRE, 163 FSTSyntax: vellumRe, 164 } 165 166 fstRE, start, end, err := fstregexp.ParsedRegexp(vellumRe.String(), vellumRe) 167 if err != nil { 168 return CompiledRegex{}, err 169 } 170 compiledRegex.FST = fstRE 171 compiledRegex.PrefixBegin = start 172 compiledRegex.PrefixEnd = end 173 174 // Update cache if cache existed when we checked. 175 if cacheLRU != nil { 176 // Copy of compiled regex. 177 copied := compiledRegex 178 // No need to lock on Put since cache is locked. 179 cacheLRU.Put(reString, &copied) 180 } 181 182 return compiledRegex, nil 183 } 184 185 func parseRegexp(re string) (*syntax.Regexp, error) { 186 return syntax.Parse(re, syntax.Perl) 187 } 188 189 // EnsureRegexpAnchored adds '^' and '$' characters to appropriate locations in the parsed syntax.Regexp, 190 // to ensure every input regular expression is converted to its equivalent anchored regular expression. 191 // NB: assumes input regexp AST is un-anchored. 192 func EnsureRegexpAnchored(unanchoredRegexp *syntax.Regexp) *syntax.Regexp { 193 ast := &syntax.Regexp{ 194 Op: syntax.OpConcat, 195 Flags: syntax.Perl, 196 Sub: []*syntax.Regexp{ 197 { 198 Op: syntax.OpBeginText, 199 Flags: syntax.Perl, 200 }, 201 unanchoredRegexp, 202 { 203 Op: syntax.OpEndText, 204 Flags: syntax.Perl, 205 }, 206 }, 207 } 208 return simplify(ast.Simplify()) 209 } 210 211 // EnsureRegexpUnanchored strips '^' and '$' characters from appropriate locations in the parsed syntax.Regexp, 212 // to ensure every input regular expression is converted to its equivalent un-anchored regular expression 213 // assuming the entire input is matched. 
func EnsureRegexpUnanchored(parsed *syntax.Regexp) (*syntax.Regexp, error) {
	// The root expression is both the left-most and the right-most position.
	r, _, err := ensureRegexpUnanchoredHelper(parsed, true, true)
	if err != nil {
		return nil, err
	}
	return simplify(r), nil
}

// ensureRegexpUnanchoredHelper replaces begin-text ops in left-most position and end-text ops
// in right-most position with empty matches. leftmost/rightmost say whether parsed sits at the
// left/right edge of the overall expression. It returns the resulting expression, whether
// anything changed, and an error if a begin-line/end-line op is encountered.
// NB: sub-expressions of parsed may be replaced in place.
func ensureRegexpUnanchoredHelper(parsed *syntax.Regexp, leftmost, rightmost bool) (output *syntax.Regexp, changed bool, err error) {
	// short circuit when we know we won't make any changes to the underlying regexp.
	if !leftmost && !rightmost {
		return parsed, false, nil
	}

	switch parsed.Op {
	case syntax.OpBeginLine, syntax.OpEndLine:
		// i.e. the flags provided to syntax.Parse did not include the `OneLine` flag, which
		// should never happen as we're using syntax.Perl which does include it (ensured by a test
		// in this package).
		return nil, false, fmt.Errorf("regular expressions are forced to be single line")
	case syntax.OpBeginText:
		if leftmost {
			return &syntax.Regexp{
				Op:    syntax.OpEmptyMatch,
				Flags: parsed.Flags,
			}, true, nil
		}
	case syntax.OpEndText:
		if rightmost {
			return &syntax.Regexp{
				Op:    syntax.OpEmptyMatch,
				Flags: parsed.Flags,
			}, true, nil
		}
	case syntax.OpCapture, syntax.OpConcat:
		// because golang regexp's don't allow backreferences, we don't care about maintaining capture
		// group namings and can treat captures the same as we do concatenations.
		changed := false
		l := len(parsed.Sub)
		// strip left-most '^'
		if leftmost && l > 0 {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost && l == 1)
			if err != nil {
				return nil, false, err
			}
			if !c && l == 1 {
				// The only child has just been visited with every flag that applies to it and
				// was left untouched, so visiting it again below would be an identical no-op.
				// Returning here keeps nested single-child groups linear instead of doubling
				// the work at every nesting level.
				return parsed, false, nil
			}
			if c {
				parsed.Sub[0] = newRe
				changed = true
			}
		}
		// strip right-most '$'
		if rightmost && l > 0 {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[l-1], leftmost && l == 1, rightmost)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[l-1] = newRe
				changed = true
			}
		}
		return parsed, changed, nil
	case syntax.OpAlternate:
		changed := false
		// strip left-most '^' and right-most '$' in each sub-expression
		for idx := range parsed.Sub {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[idx], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[idx] = newRe
				changed = true
			}
		}
		return parsed, changed, nil
	case syntax.OpQuest:
		if len(parsed.Sub) > 0 {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[0] = newRe
				return parsed, true, nil
			}
		}
	case syntax.OpStar:
		if len(parsed.Sub) > 0 {
			// Copy before recursing: the recursive call may modify parsed's sub-tree in place.
			original := deepCopy(parsed)
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if !c {
				return parsed, false, nil
			}
			// Rewritten as: (stripped sub)? followed by the untouched original x*.
			return &syntax.Regexp{
				Op:    syntax.OpConcat,
				Flags: parsed.Flags,
				Sub: []*syntax.Regexp{
					{
						Op:    syntax.OpQuest,
						Flags: parsed.Flags,
						Sub: []*syntax.Regexp{
							newRe,
						},
					},
					original,
				},
			}, true, nil
		}
	case syntax.OpPlus:
		if len(parsed.Sub) > 0 {
			// Copy before recursing: the recursive call may modify parsed's sub-tree in place.
			original := deepCopy(parsed)
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if !c {
				return parsed, false, nil
			}
			// Rewritten as: stripped sub followed by (untouched original sub)*.
			return &syntax.Regexp{
				Op:    syntax.OpConcat,
				Flags: parsed.Flags,
				Sub: []*syntax.Regexp{
					newRe,
					{
						Op:    syntax.OpStar,
						Flags: parsed.Flags,
						Sub: []*syntax.Regexp{
							original.Sub[0],
						},
					},
				},
			}, true, nil
		}
	case syntax.OpRepeat:
		if len(parsed.Sub) > 0 && parsed.Min > 0 {
			// Copy before recursing: the recursive call may modify parsed's sub-tree in place.
			original := deepCopy(parsed)
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if !c {
				return parsed, false, nil
			}
			// One repetition is emitted as the stripped sub, so the remaining repeat
			// needs one fewer iteration; a Max of -1 is left as is.
			original.Min--
			if original.Max != -1 {
				original.Max--
			}
			return &syntax.Regexp{
				Op:    syntax.OpConcat,
				Flags: parsed.Flags,
				Sub: []*syntax.Regexp{
					newRe,
					original,
				},
			}, true, nil
		}
	}
	return parsed, false, nil
}

// deepCopy returns a copy of ast whose sub-expressions are themselves copied recursively.
// A nil input yields nil.
func deepCopy(ast *syntax.Regexp) *syntax.Regexp {
	if ast == nil {
		return nil
	}
	copied := *ast
	copied.Sub = make([]*syntax.Regexp, 0, len(ast.Sub))
	for _, r := range ast.Sub {
		copied.Sub = append(copied.Sub, deepCopy(r))
	}
	if sub0 := ast.Sub0[0]; sub0 != nil {
		if len(ast.Sub) > 0 && ast.Sub[0] == sub0 {
			// Sub0[0] is the very same node as Sub[0]; reuse the copy made above rather than
			// copying the sub-tree a second time, which would double the work at every
			// nesting level.
			copied.Sub0[0] = copied.Sub[0]
		} else {
			copied.Sub0[0] = deepCopy(sub0)
		}
	}
	// NB(prateek): we don't copy ast.Rune (which could be a heap allocated slice) intentionally,
	// because none of the transformations we apply modify the Rune slice.
	return &copied
}

// emptyStringOps lists the ops matchesEmptyString considers.
var emptyStringOps = []syntax.Op{
	syntax.OpEmptyMatch, syntax.OpQuest, syntax.OpPlus, syntax.OpStar, syntax.OpRepeat,
}

// matchesEmptyString reports whether ast is one of emptyStringOps and, following the first
// sub-expression at each level, ends at such an op that has no sub-expressions.
// A nil input yields false.
func matchesEmptyString(ast *syntax.Regexp) bool {
	if ast == nil {
		return false
	}
	for _, op := range emptyStringOps {
		if ast.Op == op {
			if len(ast.Sub) > 0 {
				return matchesEmptyString(ast.Sub[0])
			}
			return true
		}
	}
	return false
}

// simplify applies simplifyHelper to ast and returns the resulting expression.
func simplify(ast *syntax.Regexp) *syntax.Regexp {
	newAst, _ := simplifyHelper(ast)
	return newAst
}

// simplifyHelper flattens nested concatenations, drops concatenated sub-expressions for which
// matchesEmptyString is true, and replaces a concatenation of one sub-expression with that
// sub-expression. It returns the resulting expression and whether it reported a change.
// NB: ast may be modified in place.
func simplifyHelper(ast *syntax.Regexp) (*syntax.Regexp, bool) {
	if ast == nil {
		return nil, false
	}
	switch ast.Op {
	case syntax.OpConcat:
		// a concatenation of a single sub-expression is the same as the sub-expression itself;
		// simplify that sub-expression too so it is not returned unsimplified.
		if len(ast.Sub) == 1 {
			only, _ := simplifyHelper(ast.Sub[0])
			return only, true
		}

		changed := false
		// check if we have any concats of concats, if so, we can pull the ones below this level up
		subs := make([]*syntax.Regexp, 0, len(ast.Sub))
		for _, sub := range ast.Sub {
			if sub.Op == syntax.OpConcat {
				subs = append(subs, sub.Sub...)
				changed = true
				continue
			}
			// skip any sub expressions that devolve to matching only the empty string
			if matchesEmptyString(sub) {
				changed = true
				continue
			}
			subs = append(subs, sub)
		}

		// now ensure we simplify all sub-expressions
		for idx := range subs {
			s, c := simplifyHelper(subs[idx])
			if c {
				subs[idx] = s
				changed = true
			}
		}

		// if we have made any changes to sub-expressions, need to continue simplification
		// until we are sure there are no more changes.
		if changed {
			ast.Sub = subs
			return simplifyHelper(ast)
		}
	default:
		changed := false
		for idx := range ast.Sub {
			newRe, c := simplifyHelper(ast.Sub[idx])
			if c {
				ast.Sub[idx] = newRe
				changed = true
			}
		}
		return ast, changed
	}
	return ast, false
}