github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/m3ninx/index/regexp.go (about)

     1  // Copyright (c) 2018 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package index
    22  
    23  import (
    24  	"fmt"
    25  	re "regexp"
    26  	"regexp/syntax"
    27  	"sync"
    28  
    29  	fstregexp "github.com/m3db/m3/src/m3ninx/index/segment/fst/regexp"
    30  	"github.com/m3db/m3/src/x/cache"
    31  
    32  	"github.com/uber-go/tally"
    33  )
    34  
var (
	// dotStarCompiledRegex is a CompiledRegex that matches any input.
	// NB: It is assigned once in init and can be accessed through DotStarCompiledRegex().
	dotStarCompiledRegex CompiledRegex
)
    40  
    41  func init() {
    42  	re, err := CompileRegex([]byte(".*"))
    43  	if err != nil {
    44  		panic(err.Error())
    45  	}
    46  	dotStarCompiledRegex = re
    47  }
    48  
var (
	// Package-level cache of compiled regexes. Sharing compiled values between
	// callers is fine because, as per the Go std lib:
	//   A Regexp is safe for concurrent use by multiple goroutines, except for
	//   configuration methods, such as Longest.
	// The vellum Regexp is also safe for concurrent use, as it is only queried
	// for states and does not mutate internal state.
	//
	// regexpCacheLock guards regexpCache and regexpCacheMetrics; both are nil
	// while the cache is disabled (see SetRegexpCacheOptions).
	// NOTE(review): regexpCacheSize is not read or written in the code visible
	// in this file - confirm whether it is still needed.
	regexpCacheLock    sync.RWMutex
	regexpCache        *cache.LRU
	regexpCacheSize    int
	regexpCacheMetrics *cacheMetrics
)
    60  
// cacheMetrics groups the counters emitted by CompileRegex when the regexp
// cache is enabled.
type cacheMetrics struct {
	// hit counts cache lookups that found an entry.
	hit           tally.Counter
	// miss counts cache lookups that found no entry.
	miss          tally.Counter
	// unwrapSuccess counts cache hits whose value had the expected *CompiledRegex type.
	unwrapSuccess tally.Counter
	// unwrapError counts cache hits whose value could not be type-asserted to *CompiledRegex.
	unwrapError   tally.Counter
}
    67  
// RegexpCacheOptions is a set of regexp cache options.
type RegexpCacheOptions struct {
	// Size is the maximum number of entries the cache holds; SetRegexpCacheOptions
	// disables the cache when it is less than one.
	Size  int
	// Scope is the metrics scope the cache reports under; SetRegexpCacheOptions
	// falls back to tally.NoopScope when it is nil.
	Scope tally.Scope
}
    73  
    74  // SetRegexpCacheOptions sets the regex cache options, size zero disables cache.
    75  func SetRegexpCacheOptions(opts RegexpCacheOptions) {
    76  	regexpCacheLock.Lock()
    77  	defer regexpCacheLock.Unlock()
    78  
    79  	if opts.Size < 1 {
    80  		regexpCache = nil
    81  		regexpCacheMetrics = nil
    82  		return
    83  	}
    84  
    85  	scope := tally.NoopScope
    86  	if opts.Scope != nil {
    87  		scope = opts.Scope
    88  	}
    89  
    90  	scope = scope.SubScope("m3ninx").SubScope("regexp").SubScope("cache")
    91  	regexpCache = cache.NewLRU(&cache.LRUOptions{
    92  		MaxEntries: opts.Size,
    93  		Metrics:    scope.SubScope("lru"),
    94  	})
    95  	regexpCacheMetrics = &cacheMetrics{
    96  		hit:           scope.Counter("hit"),
    97  		miss:          scope.Counter("miss"),
    98  		unwrapSuccess: scope.SubScope("unwrap").Counter("success"),
    99  		unwrapError:   scope.SubScope("unwrap").Counter("error"),
   100  	}
   101  }
   102  
// DotStarCompiledRegex returns the package-level CompiledRegex that init builds
// from the expression ".*".
func DotStarCompiledRegex() CompiledRegex {
	return dotStarCompiledRegex
}
   107  
   108  // CompileRegex compiles the provided regexp into an object that can be used to query the various
   109  // segment implementations.
   110  func CompileRegex(r []byte) (CompiledRegex, error) {
   111  	// NB(prateek): We currently use two segment implementations: map-backed, and fst-backed (Vellum).
   112  	// Due to peculiarities in the implementation of Vellum, we have to make certain modifications
   113  	// to all incoming regular expressions to ensure compatibility between them.
   114  
   115  	reString := string(r)
   116  
   117  	// Check cache first.
   118  	regexpCacheLock.RLock()
   119  	cacheLRU := regexpCache
   120  	cacheLRUMetrics := regexpCacheMetrics
   121  	regexpCacheLock.RUnlock()
   122  
   123  	if cacheLRU != nil && cacheLRUMetrics != nil {
   124  		cached, ok := regexpCache.TryGet(reString)
   125  		if !ok {
   126  			cacheLRUMetrics.miss.Inc(1)
   127  		} else {
   128  			cacheLRUMetrics.hit.Inc(1)
   129  			if unwrapped, ok := cached.(*CompiledRegex); ok {
   130  				cacheLRUMetrics.unwrapSuccess.Inc(1)
   131  				return *unwrapped, nil
   132  			}
   133  			// Unable to unwrap into expected type.
   134  			cacheLRUMetrics.unwrapError.Inc(1)
   135  		}
   136  	}
   137  
   138  	// first, we parse the regular expression into the equivalent regex
   139  	reAst, err := parseRegexp(reString)
   140  	if err != nil {
   141  		return CompiledRegex{}, err
   142  	}
   143  
   144  	// Issue (a): Vellum does not allow regexps which use characters '^', or '$'.
   145  	// To address this issue, we strip these characters from appropriate locations in the parsed syntax.Regexp
   146  	// for Vellum's RE.
   147  	vellumRe, err := EnsureRegexpUnanchored(reAst)
   148  	if err != nil {
   149  		return CompiledRegex{}, fmt.Errorf("unable to create FST re: %v", err)
   150  	}
   151  
   152  	// Issue (b): Vellum treats every regular expression as anchored, where as the map-backed segment does not.
   153  	// To address this issue, we ensure that every incoming regular expression is modified to be anchored
   154  	// when querying the map-backed segment, and isn't anchored when querying Vellum's RE.
   155  	simpleRe := EnsureRegexpAnchored(vellumRe)
   156  
   157  	simpleRE, err := re.Compile(simpleRe.String())
   158  	if err != nil {
   159  		return CompiledRegex{}, err
   160  	}
   161  	compiledRegex := CompiledRegex{
   162  		Simple:    simpleRE,
   163  		FSTSyntax: vellumRe,
   164  	}
   165  
   166  	fstRE, start, end, err := fstregexp.ParsedRegexp(vellumRe.String(), vellumRe)
   167  	if err != nil {
   168  		return CompiledRegex{}, err
   169  	}
   170  	compiledRegex.FST = fstRE
   171  	compiledRegex.PrefixBegin = start
   172  	compiledRegex.PrefixEnd = end
   173  
   174  	// Update cache if cache existed when we checked.
   175  	if cacheLRU != nil {
   176  		// Copy of compiled regex.
   177  		copied := compiledRegex
   178  		// No need to lock on Put since cache is locked.
   179  		cacheLRU.Put(reString, &copied)
   180  	}
   181  
   182  	return compiledRegex, nil
   183  }
   184  
// parseRegexp parses re into a syntax.Regexp AST using the syntax.Perl flag set.
func parseRegexp(re string) (*syntax.Regexp, error) {
	return syntax.Parse(re, syntax.Perl)
}
   188  
   189  // EnsureRegexpAnchored adds '^' and '$' characters to appropriate locations in the parsed syntax.Regexp,
   190  // to ensure every input regular expression is converted to its equivalent anchored regular expression.
   191  // NB: assumes input regexp AST is un-anchored.
   192  func EnsureRegexpAnchored(unanchoredRegexp *syntax.Regexp) *syntax.Regexp {
   193  	ast := &syntax.Regexp{
   194  		Op:    syntax.OpConcat,
   195  		Flags: syntax.Perl,
   196  		Sub: []*syntax.Regexp{
   197  			{
   198  				Op:    syntax.OpBeginText,
   199  				Flags: syntax.Perl,
   200  			},
   201  			unanchoredRegexp,
   202  			{
   203  				Op:    syntax.OpEndText,
   204  				Flags: syntax.Perl,
   205  			},
   206  		},
   207  	}
   208  	return simplify(ast.Simplify())
   209  }
   210  
   211  // EnsureRegexpUnanchored strips '^' and '$' characters from appropriate locations in the parsed syntax.Regexp,
   212  // to ensure every input regular expression is converted to its equivalent un-anchored regular expression
   213  // assuming the entire input is matched.
   214  func EnsureRegexpUnanchored(parsed *syntax.Regexp) (*syntax.Regexp, error) {
   215  	r, _, err := ensureRegexpUnanchoredHelper(parsed, true, true)
   216  	if err != nil {
   217  		return nil, err
   218  	}
   219  	return simplify(r), nil
   220  }
   221  
// ensureRegexpUnanchoredHelper walks parsed and replaces text anchors that sit on
// the outer edges of the expression with empty matches.
//
// leftmost reports whether parsed can still touch the left edge of the overall
// expression (so an OpBeginText found here may be stripped); rightmost is the
// equivalent for the right edge and OpEndText.
//
// It returns the resulting node (either parsed itself or a newly built node),
// whether anything was changed, and an error if a line anchor (OpBeginLine or
// OpEndLine) is encountered.
// NB: sub-expressions of parsed are overwritten in place, so the input AST is mutated.
func ensureRegexpUnanchoredHelper(parsed *syntax.Regexp, leftmost, rightmost bool) (output *syntax.Regexp, changed bool, err error) {
	// short circuit when we know we won't make any changes to the underlying regexp.
	if !leftmost && !rightmost {
		return parsed, false, nil
	}

	switch parsed.Op {
	case syntax.OpBeginLine, syntax.OpEndLine:
		// i.e. the flags provided to syntax.Parse did not include the `OneLine` flag, which
		// should never happen as we're using syntax.Perl which does include it (ensured by a test
		// in this package).
		return nil, false, fmt.Errorf("regular expressions are forced to be single line")
	case syntax.OpBeginText:
		// a '^' on the left edge is replaced by an empty match; anywhere else it is
		// left alone (falls out of the switch to the unchanged return at the bottom).
		if leftmost {
			return &syntax.Regexp{
				Op:    syntax.OpEmptyMatch,
				Flags: parsed.Flags,
			}, true, nil
		}
	case syntax.OpEndText:
		// a '$' on the right edge is replaced by an empty match; anywhere else it is
		// left alone (falls out of the switch to the unchanged return at the bottom).
		if rightmost {
			return &syntax.Regexp{
				Op:    syntax.OpEmptyMatch,
				Flags: parsed.Flags,
			}, true, nil
		}
	case syntax.OpCapture:
		// because golang regexp's don't allow backreferences, we don't care about maintaining capture
		// group namings and can treat captures the same as we do concatenations.
		fallthrough
	case syntax.OpConcat:
		changed := false
		// strip left-most '^'
		// NB: the first sub-expression is also right-most only when it is the sole sub-expression.
		if l := len(parsed.Sub); leftmost && l > 0 {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost && l == 1)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[0] = newRe
				changed = true
			}
		}
		// strip right-most '$'
		// NB: the last sub-expression is also left-most only when it is the sole sub-expression.
		if l := len(parsed.Sub); rightmost && l > 0 {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[l-1], leftmost && l == 1, rightmost)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[l-1] = newRe
				changed = true
			}
		}
		return parsed, changed, nil
	case syntax.OpAlternate:
		changed := false
		// strip left-most '^' and right-most '$' in each sub-expression
		for idx := range parsed.Sub {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[idx], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[idx] = newRe
				changed = true
			}
		}
		return parsed, changed, nil
	case syntax.OpQuest:
		// x?: strip inside the optional sub-expression; when nothing changed we fall
		// out of the switch to the unchanged return at the bottom.
		if len(parsed.Sub) > 0 {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[0] = newRe
				return parsed, true, nil
			}
		}
	case syntax.OpStar:
		// x*: when stripping changes x into x', the result is the concatenation
		// (x')? followed by the original x*.
		if len(parsed.Sub) > 0 {
			// snapshot taken before recursing, as the recursion may modify parsed's
			// sub-expressions in place.
			original := deepCopy(parsed)
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if !c {
				return parsed, false, nil
			}
			return &syntax.Regexp{
				Op:    syntax.OpConcat,
				Flags: parsed.Flags,
				Sub: []*syntax.Regexp{
					{
						Op:    syntax.OpQuest,
						Flags: parsed.Flags,
						Sub: []*syntax.Regexp{
							newRe,
						},
					},
					original,
				},
			}, true, nil
		}
	case syntax.OpPlus:
		// x+: when stripping changes x into x', the result is the concatenation
		// x' followed by (original x)*.
		if len(parsed.Sub) > 0 {
			// snapshot taken before recursing, as the recursion may modify parsed's
			// sub-expressions in place.
			original := deepCopy(parsed)
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if !c {
				return parsed, false, nil
			}
			return &syntax.Regexp{
				Op:    syntax.OpConcat,
				Flags: parsed.Flags,
				Sub: []*syntax.Regexp{
					newRe,
					{
						Op:    syntax.OpStar,
						Flags: parsed.Flags,
						Sub: []*syntax.Regexp{
							original.Sub[0],
						},
					},
				},
			}, true, nil
		}
	case syntax.OpRepeat:
		// x{min,max} with min > 0: when stripping changes x into x', the result is the
		// concatenation x' followed by the original repeat with one fewer repetition.
		// Repeats with a minimum of zero are returned unchanged.
		if len(parsed.Sub) > 0 && parsed.Min > 0 {
			// snapshot taken before recursing, as the recursion may modify parsed's
			// sub-expressions in place.
			original := deepCopy(parsed)
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if !c {
				return parsed, false, nil
			}
			original.Min--
			// a Max of -1 is kept as is (regexp/syntax uses -1 for "no upper bound").
			if original.Max != -1 {
				original.Max--
			}
			return &syntax.Regexp{
				Op:    syntax.OpConcat,
				Flags: parsed.Flags,
				Sub: []*syntax.Regexp{
					newRe,
					original,
				},
			}, true, nil
		}
	}
	// any other operator, or a case above that made no change, is returned untouched.
	return parsed, false, nil
}
   378  
   379  func deepCopy(ast *syntax.Regexp) *syntax.Regexp {
   380  	if ast == nil {
   381  		return nil
   382  	}
   383  	copied := *ast
   384  	copied.Sub = make([]*syntax.Regexp, 0, len(ast.Sub))
   385  	for _, r := range ast.Sub {
   386  		copied.Sub = append(copied.Sub, deepCopy(r))
   387  	}
   388  	if len(copied.Sub0) != 0 && copied.Sub0[0] != nil {
   389  		copied.Sub0[0] = deepCopy(copied.Sub0[0])
   390  	}
   391  	// NB(prateek): we don't copy ast.Rune (which could be a heap allocated slice) intentionally,
   392  	// because none of the transformations we apply modify the Rune slice.
   393  	return &copied
   394  }
   395  
   396  var emptyStringOps = []syntax.Op{
   397  	syntax.OpEmptyMatch, syntax.OpQuest, syntax.OpPlus, syntax.OpStar, syntax.OpRepeat,
   398  }
   399  
   400  func matchesEmptyString(ast *syntax.Regexp) bool {
   401  	if ast == nil {
   402  		return false
   403  	}
   404  	for _, op := range emptyStringOps {
   405  		if ast.Op == op {
   406  			if len(ast.Sub) > 0 {
   407  				return matchesEmptyString(ast.Sub[0])
   408  			}
   409  			return true
   410  		}
   411  	}
   412  	return false
   413  }
   414  
// simplify runs simplifyHelper on ast and returns the resulting node, discarding
// the flag that reports whether anything changed.
// NB: the AST may be modified in place by simplifyHelper.
func simplify(ast *syntax.Regexp) *syntax.Regexp {
	newAst, _ := simplifyHelper(ast)
	return newAst
}
   419  
   420  func simplifyHelper(ast *syntax.Regexp) (*syntax.Regexp, bool) {
   421  	if ast == nil {
   422  		return nil, false
   423  	}
   424  	switch ast.Op {
   425  	case syntax.OpConcat:
   426  		// a concatenation of a single sub-expression is the same as the sub-expression itself
   427  		if len(ast.Sub) == 1 {
   428  			return ast.Sub[0], true
   429  		}
   430  
   431  		changed := false
   432  		// check if we have any concats of concats, if so, we can pull the ones below this level up
   433  		subs := make([]*syntax.Regexp, 0, len(ast.Sub))
   434  		for _, sub := range ast.Sub {
   435  			if sub.Op == syntax.OpConcat {
   436  				subs = append(subs, sub.Sub...)
   437  				changed = true
   438  				continue
   439  			}
   440  			// skip any sub expressions that devolve to matching only the empty string
   441  			if matchesEmptyString(sub) {
   442  				changed = true
   443  				continue
   444  			}
   445  			subs = append(subs, sub)
   446  		}
   447  
   448  		// now ensure we simplify all sub-expressions
   449  		for idx := range subs {
   450  			s, c := simplifyHelper(subs[idx])
   451  			if c {
   452  				subs[idx] = s
   453  				changed = true
   454  			}
   455  		}
   456  
   457  		// if we have made any changes to sub-expressions, need to continue simplification
   458  		// until we are sure there are no more changes.
   459  		if changed {
   460  			ast.Sub = subs
   461  			return simplifyHelper(ast)
   462  		}
   463  	default:
   464  		changed := false
   465  		for idx := range ast.Sub {
   466  			newRe, c := simplifyHelper(ast.Sub[idx])
   467  			if c {
   468  				ast.Sub[idx] = newRe
   469  				changed = true
   470  			}
   471  		}
   472  		return ast, changed
   473  	}
   474  	return ast, false
   475  }