github.com/m3db/m3@v1.5.0/src/m3ninx/index/regexp.go (about) 1 // Copyright (c) 2018 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package index 22 23 import ( 24 "context" 25 "fmt" 26 re "regexp" 27 "regexp/syntax" 28 "sync" 29 30 fstregexp "github.com/m3db/m3/src/m3ninx/index/segment/fst/regexp" 31 "github.com/m3db/m3/src/x/cache" 32 33 "github.com/uber-go/tally" 34 ) 35 36 var ( 37 // dotStartCompiledRegex is a CompileRegex that matches any input. 38 // NB: It can be accessed through DotStartCompiledRegex(). 
39 dotStarCompiledRegex CompiledRegex 40 cacheContext = context.Background() 41 ) 42 43 func init() { 44 re, err := CompileRegex([]byte(".*")) 45 if err != nil { 46 panic(err.Error()) 47 } 48 dotStarCompiledRegex = re 49 } 50 51 var ( 52 // cache for regexes, as per Go std lib: 53 // A Regexp is safe for concurrent use by multiple goroutines, except for 54 // configuration methods, such as Longest. 55 // The vellum Regexp is also safe for concurrent use as it is query for 56 // states but does not mutate internal state. 57 regexpCacheLock sync.RWMutex 58 regexpCache *cache.LRU 59 regexpCacheSize int 60 regexpCacheMetrics *cacheMetrics 61 ) 62 63 type cacheMetrics struct { 64 hit tally.Counter 65 miss tally.Counter 66 unwrapSuccess tally.Counter 67 unwrapError tally.Counter 68 } 69 70 // RegexpCacheOptions is a set of regexp cache options. 71 type RegexpCacheOptions struct { 72 Size int 73 Scope tally.Scope 74 } 75 76 // SetRegexpCacheOptions sets the regex cache options, size zero disables cache. 77 func SetRegexpCacheOptions(opts RegexpCacheOptions) { 78 regexpCacheLock.Lock() 79 defer regexpCacheLock.Unlock() 80 81 if opts.Size < 1 { 82 regexpCache = nil 83 regexpCacheMetrics = nil 84 return 85 } 86 87 scope := tally.NoopScope 88 if opts.Scope != nil { 89 scope = opts.Scope 90 } 91 92 scope = scope.SubScope("m3ninx").SubScope("regexp").SubScope("cache") 93 regexpCache = cache.NewLRU(&cache.LRUOptions{ 94 MaxEntries: opts.Size, 95 Metrics: scope.SubScope("lru"), 96 }) 97 regexpCacheMetrics = &cacheMetrics{ 98 hit: scope.Counter("hit"), 99 miss: scope.Counter("miss"), 100 unwrapSuccess: scope.SubScope("unwrap").Counter("success"), 101 unwrapError: scope.SubScope("unwrap").Counter("error"), 102 } 103 } 104 105 // DotStarCompiledRegex returns a regexp which matches ".*". 
106 func DotStarCompiledRegex() CompiledRegex { 107 return dotStarCompiledRegex 108 } 109 110 // CompileRegex compiles the provided regexp into an object that can be used to query the various 111 // segment implementations. 112 func CompileRegex(r []byte) (CompiledRegex, error) { 113 // NB(prateek): We currently use two segment implementations: map-backed, and fst-backed (Vellum). 114 // Due to peculiarities in the implementation of Vellum, we have to make certain modifications 115 // to all incoming regular expressions to ensure compatibility between them. 116 117 reString := string(r) 118 119 // Check cache first. 120 regexpCacheLock.RLock() 121 cacheLRU := regexpCache 122 cacheLRUMetrics := regexpCacheMetrics 123 regexpCacheLock.RUnlock() 124 125 if cacheLRU != nil && cacheLRUMetrics != nil { 126 cached, ok := regexpCache.TryGet(reString) 127 if !ok { 128 cacheLRUMetrics.miss.Inc(1) 129 } else { 130 cacheLRUMetrics.hit.Inc(1) 131 if unwrapped, ok := cached.(*CompiledRegex); ok { 132 cacheLRUMetrics.unwrapSuccess.Inc(1) 133 return *unwrapped, nil 134 } 135 // Unable to unwrap into expected type. 136 cacheLRUMetrics.unwrapError.Inc(1) 137 } 138 } 139 140 // first, we parse the regular expression into the equivalent regex 141 reAst, err := parseRegexp(reString) 142 if err != nil { 143 return CompiledRegex{}, err 144 } 145 146 // Issue (a): Vellum does not allow regexps which use characters '^', or '$'. 147 // To address this issue, we strip these characters from appropriate locations in the parsed syntax.Regexp 148 // for Vellum's RE. 149 vellumRe, err := ensureRegexpUnanchored(reAst) 150 if err != nil { 151 return CompiledRegex{}, fmt.Errorf("unable to create FST re: %v", err) 152 } 153 154 // Issue (b): Vellum treats every regular expression as anchored, where as the map-backed segment does not. 
155 // To address this issue, we ensure that every incoming regular expression is modified to be anchored 156 // when querying the map-backed segment, and isn't anchored when querying Vellum's RE. 157 simpleRe, err := ensureRegexpAnchored(vellumRe) 158 if err != nil { 159 return CompiledRegex{}, fmt.Errorf("unable to create map re: %v", err) 160 } 161 162 simpleRE, err := re.Compile(simpleRe.String()) 163 if err != nil { 164 return CompiledRegex{}, err 165 } 166 compiledRegex := CompiledRegex{ 167 Simple: simpleRE, 168 FSTSyntax: vellumRe, 169 } 170 171 fstRE, start, end, err := fstregexp.ParsedRegexp(vellumRe.String(), vellumRe) 172 if err != nil { 173 return CompiledRegex{}, err 174 } 175 compiledRegex.FST = fstRE 176 compiledRegex.PrefixBegin = start 177 compiledRegex.PrefixEnd = end 178 179 // Update cache if cache existed when we checked. 180 if cacheLRU != nil { 181 // Copy of compiled regex. 182 copied := compiledRegex 183 // No need to lock on Put since cache is locked. 184 cacheLRU.Put(reString, &copied) 185 } 186 187 return compiledRegex, nil 188 } 189 190 func parseRegexp(re string) (*syntax.Regexp, error) { 191 return syntax.Parse(re, syntax.Perl) 192 } 193 194 // ensureRegexpAnchored adds '^' and '$' characters to appropriate locations in the parsed syntax.Regexp, 195 // to ensure every input regular expression is converted to it's equivalent anchored regular expression. 196 // NB: assumes input regexp AST is un-anchored. 
func ensureRegexpAnchored(unanchoredRegexp *syntax.Regexp) (*syntax.Regexp, error) {
	// Wrap the input as `\A <input> \z` and simplify the result.
	ast := &syntax.Regexp{
		Op:    syntax.OpConcat,
		Flags: syntax.Perl,
		Sub: []*syntax.Regexp{
			{
				Op:    syntax.OpBeginText,
				Flags: syntax.Perl,
			},
			unanchoredRegexp,
			{
				Op:    syntax.OpEndText,
				Flags: syntax.Perl,
			},
		},
	}
	return simplify(ast.Simplify()), nil
}

// ensureRegexpUnanchored strips '^' and '$' characters from appropriate locations in the parsed syntax.Regexp,
// to ensure every input regular expression is converted to its equivalent un-anchored regular expression
// assuming the entire input is matched.
// NB: the provided AST may be modified in place.
func ensureRegexpUnanchored(parsed *syntax.Regexp) (*syntax.Regexp, error) {
	r, _, err := ensureRegexpUnanchoredHelper(parsed, true, true)
	if err != nil {
		return nil, err
	}
	return simplify(r), nil
}

// ensureRegexpUnanchoredHelper recursively replaces begin-text anchors found at the left-most position
// and end-text anchors found at the right-most position of parsed with empty matches.
//
// leftmost and rightmost state whether parsed sits at the left/right edge of the overall expression.
// It returns the (possibly new) node, whether anything was changed, and an error if a line anchor
// (OpBeginLine/OpEndLine) is encountered at an edge. When changed is false, parsed is returned
// untouched; when changed is true, parsed (or its descendants) may have been mutated in place.
func ensureRegexpUnanchoredHelper(parsed *syntax.Regexp, leftmost, rightmost bool) (output *syntax.Regexp, changed bool, err error) {
	// short circuit when we know we won't make any changes to the underlying regexp.
	if !leftmost && !rightmost {
		return parsed, false, nil
	}

	switch parsed.Op {
	case syntax.OpBeginLine, syntax.OpEndLine:
		// i.e. the flags provided to syntax.Parse did not include the `OneLine` flag, which
		// should never happen as we're using syntax.Perl which does include it (ensured by a test
		// in this package).
		return nil, false, fmt.Errorf("regular expressions are forced to be single line")
	case syntax.OpBeginText:
		if leftmost {
			return &syntax.Regexp{
				Op:    syntax.OpEmptyMatch,
				Flags: parsed.Flags,
			}, true, nil
		}
	case syntax.OpEndText:
		if rightmost {
			return &syntax.Regexp{
				Op:    syntax.OpEmptyMatch,
				Flags: parsed.Flags,
			}, true, nil
		}
	case syntax.OpCapture:
		// because golang regexps don't allow backreferences, we don't care about maintaining capture
		// group namings and can treat captures the same as we do concatenations.
		fallthrough
	case syntax.OpConcat:
		var (
			numSubs    = len(parsed.Sub)
			subChanged = false
		)
		// strip left-most '^'
		if leftmost && numSubs > 0 {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost && numSubs == 1)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[0] = newRe
				subChanged = true
			}
		}
		// With a single sub-expression the right-most pass visits the same node with the same
		// flags as the left-most pass. If that pass changed nothing (and hence mutated nothing),
		// repeating it cannot change anything either; skipping it avoids doubling the work at
		// every level of nested groups.
		repeatIsNoop := numSubs == 1 && leftmost && !subChanged
		// strip right-most '$'
		if rightmost && numSubs > 0 && !repeatIsNoop {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[numSubs-1], leftmost && numSubs == 1, rightmost)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[numSubs-1] = newRe
				subChanged = true
			}
		}
		return parsed, subChanged, nil
	case syntax.OpAlternate:
		subChanged := false
		// strip left-most '^' and right-most '$' in each sub-expression
		for idx := range parsed.Sub {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[idx], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[idx] = newRe
				subChanged = true
			}
		}
		return parsed, subChanged, nil
	case syntax.OpQuest:
		if len(parsed.Sub) > 0 {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[0] = newRe
				return parsed, true, nil
			}
		}
	case syntax.OpStar:
		if len(parsed.Sub) > 0 {
			// Copy before recursing: the recursion may mutate parsed's descendants in place.
			original := deepCopy(parsed)
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if !c {
				return parsed, false, nil
			}
			// x* becomes (x')?x*, where x' is x with its edge anchors stripped.
			return &syntax.Regexp{
				Op:    syntax.OpConcat,
				Flags: parsed.Flags,
				Sub: []*syntax.Regexp{
					{
						Op:    syntax.OpQuest,
						Flags: parsed.Flags,
						Sub: []*syntax.Regexp{
							newRe,
						},
					},
					original,
				},
			}, true, nil
		}
	case syntax.OpPlus:
		if len(parsed.Sub) > 0 {
			// Copy before recursing: the recursion may mutate parsed's descendants in place.
			original := deepCopy(parsed)
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if !c {
				return parsed, false, nil
			}
			// x+ becomes x'x*, where x' is x with its edge anchors stripped.
			return &syntax.Regexp{
				Op:    syntax.OpConcat,
				Flags: parsed.Flags,
				Sub: []*syntax.Regexp{
					newRe,
					{
						Op:    syntax.OpStar,
						Flags: parsed.Flags,
						Sub: []*syntax.Regexp{
							original.Sub[0],
						},
					},
				},
			}, true, nil
		}
	case syntax.OpRepeat:
		if len(parsed.Sub) > 0 && parsed.Min > 0 {
			// Copy before recursing: the recursion may mutate parsed's descendants in place.
			original := deepCopy(parsed)
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if !c {
				return parsed, false, nil
			}
			// x{min,max} becomes x'x{min-1,max-1}; a Max of -1 (unbounded) is left as is.
			original.Min--
			if original.Max != -1 {
				original.Max--
			}
			return &syntax.Regexp{
				Op:    syntax.OpConcat,
				Flags: parsed.Flags,
				Sub: []*syntax.Regexp{
					newRe,
					original,
				},
			}, true, nil
		}
	}
	return parsed, false, nil
}

// deepCopy returns a copy of ast whose Sub tree shares no *syntax.Regexp nodes with the input.
// A nil input yields nil.
func deepCopy(ast *syntax.Regexp) *syntax.Regexp {
	if ast == nil {
		return nil
	}
	copied := *ast
	copied.Sub = make([]*syntax.Regexp, 0, len(ast.Sub))
	for _, r := range ast.Sub {
		copied.Sub = append(copied.Sub, deepCopy(r))
	}
	if sub0 := ast.Sub0[0]; sub0 != nil {
		if len(ast.Sub) > 0 && ast.Sub[0] == sub0 {
			// Sub0 is the inline storage for Sub, so Sub0[0] is normally the very same node as
			// Sub[0]. Reuse the copy made above instead of copying the subtree a second time,
			// which would double the work at every nesting level.
			copied.Sub0[0] = copied.Sub[0]
		} else {
			copied.Sub0[0] = deepCopy(sub0)
		}
	}
	// NB(prateek): we don't copy ast.Rune (which could be a heap allocated slice) intentionally,
	// because none of the transformations we apply modify the Rune slice.
	return &copied
}

// emptyStringOps lists the ops inspected by matchesEmptyString.
var emptyStringOps = []syntax.Op{
	syntax.OpEmptyMatch, syntax.OpQuest, syntax.OpPlus, syntax.OpStar, syntax.OpRepeat,
}

// matchesEmptyString reports whether ast can be dropped from a concatenation because it only
// matches the empty string: its op must be one of emptyStringOps, and if it has sub-expressions
// the first of them must (recursively) satisfy the same condition. A nil ast yields false.
func matchesEmptyString(ast *syntax.Regexp) bool {
	if ast == nil {
		return false
	}
	for _, op := range emptyStringOps {
		if ast.Op == op {
			if len(ast.Sub) > 0 {
				return matchesEmptyString(ast.Sub[0])
			}
			return true
		}
	}
	return false
}

// simplify applies simplifyHelper to ast and returns the resulting node.
func simplify(ast *syntax.Regexp) *syntax.Regexp {
	newAst, _ := simplifyHelper(ast)
	return newAst
}

// simplifyHelper flattens nested concatenations, unwraps single-element concatenations and drops
// concatenated sub-expressions for which matchesEmptyString holds. The tree is modified in place.
// It returns the resulting node and a flag telling the caller whether it reported a change.
func simplifyHelper(ast *syntax.Regexp) (*syntax.Regexp, bool) {
	if ast == nil {
		return nil, false
	}
	switch ast.Op {
	case syntax.OpConcat:
		// a concatenation of a single sub-expression is the same as the sub-expression itself
		if len(ast.Sub) == 1 {
			return ast.Sub[0], true
		}

		changed := false
		// check if we have any concats of concats, if so, we can pull the ones below this level up
		subs := make([]*syntax.Regexp, 0, len(ast.Sub))
		for _, sub := range ast.Sub {
			if sub.Op == syntax.OpConcat {
				subs = append(subs, sub.Sub...)
				changed = true
				continue
			}
			// skip any sub expressions that devolve to matching only the empty string
			if matchesEmptyString(sub) {
				changed = true
				continue
			}
			subs = append(subs, sub)
		}

		// now ensure we simplify all sub-expressions
		for idx := range subs {
			s, c := simplifyHelper(subs[idx])
			if c {
				subs[idx] = s
				changed = true
			}
		}

		// if we have made any changes to sub-expressions, need to continue simplification
		// until we are sure there are no more changes.
		if changed {
			ast.Sub = subs
			return simplifyHelper(ast)
		}
	default:
		changed := false
		for idx := range ast.Sub {
			newRe, c := simplifyHelper(ast.Sub[idx])
			if c {
				ast.Sub[idx] = newRe
				changed = true
			}
		}
		return ast, changed
	}
	return ast, false
}