github.com/m3db/m3@v1.5.0/src/m3ninx/index/regexp.go (about)

     1  // Copyright (c) 2018 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package index
    22  
    23  import (
    24  	"context"
    25  	"fmt"
    26  	re "regexp"
    27  	"regexp/syntax"
    28  	"sync"
    29  
    30  	fstregexp "github.com/m3db/m3/src/m3ninx/index/segment/fst/regexp"
    31  	"github.com/m3db/m3/src/x/cache"
    32  
    33  	"github.com/uber-go/tally"
    34  )
    35  
var (
	// dotStarCompiledRegex is a CompiledRegex that matches any input.
	// It is populated once by init() from the pattern ".*".
	// NB: It can be accessed through DotStarCompiledRegex().
	dotStarCompiledRegex CompiledRegex
	// cacheContext is a background context; it is not referenced in the
	// visible portion of this file.
	cacheContext         = context.Background()
)
    42  
    43  func init() {
    44  	re, err := CompileRegex([]byte(".*"))
    45  	if err != nil {
    46  		panic(err.Error())
    47  	}
    48  	dotStarCompiledRegex = re
    49  }
    50  
var (
	// Cache for compiled regexes. As per the Go std lib:
	// "A Regexp is safe for concurrent use by multiple goroutines, except for
	// configuration methods, such as Longest."
	// The vellum Regexp is also considered safe for concurrent use, as it is
	// only queried for states and does not mutate internal state.
	//
	// regexpCacheLock is held when SetRegexpCacheOptions replaces regexpCache
	// and regexpCacheMetrics, and when CompileRegex takes a snapshot of them.
	// NOTE(review): regexpCacheSize is not read or written in the visible
	// portion of this file.
	regexpCacheLock    sync.RWMutex
	regexpCache        *cache.LRU
	regexpCacheSize    int
	regexpCacheMetrics *cacheMetrics
)
    62  
    63  type cacheMetrics struct {
    64  	hit           tally.Counter
    65  	miss          tally.Counter
    66  	unwrapSuccess tally.Counter
    67  	unwrapError   tally.Counter
    68  }
    69  
// RegexpCacheOptions is a set of regexp cache options.
type RegexpCacheOptions struct {
	// Size is passed to the LRU as its maximum number of entries. Any value
	// below one makes SetRegexpCacheOptions remove the cache.
	Size  int
	// Scope is the parent metrics scope for the cache counters. When nil,
	// SetRegexpCacheOptions falls back to tally.NoopScope.
	Scope tally.Scope
}
    75  
    76  // SetRegexpCacheOptions sets the regex cache options, size zero disables cache.
    77  func SetRegexpCacheOptions(opts RegexpCacheOptions) {
    78  	regexpCacheLock.Lock()
    79  	defer regexpCacheLock.Unlock()
    80  
    81  	if opts.Size < 1 {
    82  		regexpCache = nil
    83  		regexpCacheMetrics = nil
    84  		return
    85  	}
    86  
    87  	scope := tally.NoopScope
    88  	if opts.Scope != nil {
    89  		scope = opts.Scope
    90  	}
    91  
    92  	scope = scope.SubScope("m3ninx").SubScope("regexp").SubScope("cache")
    93  	regexpCache = cache.NewLRU(&cache.LRUOptions{
    94  		MaxEntries: opts.Size,
    95  		Metrics:    scope.SubScope("lru"),
    96  	})
    97  	regexpCacheMetrics = &cacheMetrics{
    98  		hit:           scope.Counter("hit"),
    99  		miss:          scope.Counter("miss"),
   100  		unwrapSuccess: scope.SubScope("unwrap").Counter("success"),
   101  		unwrapError:   scope.SubScope("unwrap").Counter("error"),
   102  	}
   103  }
   104  
// DotStarCompiledRegex returns a regexp which matches ".*".
// The value is the package-level CompiledRegex built once in init(); it is
// returned by value.
func DotStarCompiledRegex() CompiledRegex {
	return dotStarCompiledRegex
}
   109  
   110  // CompileRegex compiles the provided regexp into an object that can be used to query the various
   111  // segment implementations.
   112  func CompileRegex(r []byte) (CompiledRegex, error) {
   113  	// NB(prateek): We currently use two segment implementations: map-backed, and fst-backed (Vellum).
   114  	// Due to peculiarities in the implementation of Vellum, we have to make certain modifications
   115  	// to all incoming regular expressions to ensure compatibility between them.
   116  
   117  	reString := string(r)
   118  
   119  	// Check cache first.
   120  	regexpCacheLock.RLock()
   121  	cacheLRU := regexpCache
   122  	cacheLRUMetrics := regexpCacheMetrics
   123  	regexpCacheLock.RUnlock()
   124  
   125  	if cacheLRU != nil && cacheLRUMetrics != nil {
   126  		cached, ok := regexpCache.TryGet(reString)
   127  		if !ok {
   128  			cacheLRUMetrics.miss.Inc(1)
   129  		} else {
   130  			cacheLRUMetrics.hit.Inc(1)
   131  			if unwrapped, ok := cached.(*CompiledRegex); ok {
   132  				cacheLRUMetrics.unwrapSuccess.Inc(1)
   133  				return *unwrapped, nil
   134  			}
   135  			// Unable to unwrap into expected type.
   136  			cacheLRUMetrics.unwrapError.Inc(1)
   137  		}
   138  	}
   139  
   140  	// first, we parse the regular expression into the equivalent regex
   141  	reAst, err := parseRegexp(reString)
   142  	if err != nil {
   143  		return CompiledRegex{}, err
   144  	}
   145  
   146  	// Issue (a): Vellum does not allow regexps which use characters '^', or '$'.
   147  	// To address this issue, we strip these characters from appropriate locations in the parsed syntax.Regexp
   148  	// for Vellum's RE.
   149  	vellumRe, err := ensureRegexpUnanchored(reAst)
   150  	if err != nil {
   151  		return CompiledRegex{}, fmt.Errorf("unable to create FST re: %v", err)
   152  	}
   153  
   154  	// Issue (b): Vellum treats every regular expression as anchored, where as the map-backed segment does not.
   155  	// To address this issue, we ensure that every incoming regular expression is modified to be anchored
   156  	// when querying the map-backed segment, and isn't anchored when querying Vellum's RE.
   157  	simpleRe, err := ensureRegexpAnchored(vellumRe)
   158  	if err != nil {
   159  		return CompiledRegex{}, fmt.Errorf("unable to create map re: %v", err)
   160  	}
   161  
   162  	simpleRE, err := re.Compile(simpleRe.String())
   163  	if err != nil {
   164  		return CompiledRegex{}, err
   165  	}
   166  	compiledRegex := CompiledRegex{
   167  		Simple:    simpleRE,
   168  		FSTSyntax: vellumRe,
   169  	}
   170  
   171  	fstRE, start, end, err := fstregexp.ParsedRegexp(vellumRe.String(), vellumRe)
   172  	if err != nil {
   173  		return CompiledRegex{}, err
   174  	}
   175  	compiledRegex.FST = fstRE
   176  	compiledRegex.PrefixBegin = start
   177  	compiledRegex.PrefixEnd = end
   178  
   179  	// Update cache if cache existed when we checked.
   180  	if cacheLRU != nil {
   181  		// Copy of compiled regex.
   182  		copied := compiledRegex
   183  		// No need to lock on Put since cache is locked.
   184  		cacheLRU.Put(reString, &copied)
   185  	}
   186  
   187  	return compiledRegex, nil
   188  }
   189  
   190  func parseRegexp(re string) (*syntax.Regexp, error) {
   191  	return syntax.Parse(re, syntax.Perl)
   192  }
   193  
   194  // ensureRegexpAnchored adds '^' and '$' characters to appropriate locations in the parsed syntax.Regexp,
   195  // to ensure every input regular expression is converted to it's equivalent anchored regular expression.
   196  // NB: assumes input regexp AST is un-anchored.
   197  func ensureRegexpAnchored(unanchoredRegexp *syntax.Regexp) (*syntax.Regexp, error) {
   198  	ast := &syntax.Regexp{
   199  		Op:    syntax.OpConcat,
   200  		Flags: syntax.Perl,
   201  		Sub: []*syntax.Regexp{
   202  			{
   203  				Op:    syntax.OpBeginText,
   204  				Flags: syntax.Perl,
   205  			},
   206  			unanchoredRegexp,
   207  			{
   208  				Op:    syntax.OpEndText,
   209  				Flags: syntax.Perl,
   210  			},
   211  		},
   212  	}
   213  	return simplify(ast.Simplify()), nil
   214  }
   215  
   216  // ensureRegexpUnanchored strips '^' and '$' characters from appropriate locations in the parsed syntax.Regexp,
   217  // to ensure every input regular expression is converted to it's equivalent un-anchored regular expression
   218  // assuming the entire input is matched.
   219  func ensureRegexpUnanchored(parsed *syntax.Regexp) (*syntax.Regexp, error) {
   220  	r, _, err := ensureRegexpUnanchoredHelper(parsed, true, true)
   221  	if err != nil {
   222  		return nil, err
   223  	}
   224  	return simplify(r), nil
   225  }
   226  
// ensureRegexpUnanchoredHelper recursively strips begin-of-text and end-of-text
// anchors that sit on the outer edges of the expression.
//
// leftmost reports whether parsed can start at the left edge of the overall
// expression, and rightmost whether it can end at the right edge; only a
// begin-text node on the left edge or an end-text node on the right edge is
// replaced (by an empty match).
//
// It returns the resulting node, whether anything was changed, and an error if
// a line-anchor op (OpBeginLine / OpEndLine) is encountered. Note that
// sub-expressions of parsed may be replaced in place, so the input tree can be
// modified even when the returned node is parsed itself.
func ensureRegexpUnanchoredHelper(parsed *syntax.Regexp, leftmost, rightmost bool) (output *syntax.Regexp, changed bool, err error) {
	// short circuit when we know we won't make any changes to the underlying regexp.
	if !leftmost && !rightmost {
		return parsed, false, nil
	}

	switch parsed.Op {
	case syntax.OpBeginLine, syntax.OpEndLine:
		// i.e. the flags provided to syntax.Parse did not include the `OneLine` flag, which
		// should never happen as we're using syntax.Perl which does include it (ensured by a test
		// in this package).
		return nil, false, fmt.Errorf("regular expressions are forced to be single line")
	case syntax.OpBeginText:
		// a '^' on the left edge is redundant: replace it with an empty match.
		if leftmost {
			return &syntax.Regexp{
				Op:    syntax.OpEmptyMatch,
				Flags: parsed.Flags,
			}, true, nil
		}
	case syntax.OpEndText:
		// a '$' on the right edge is redundant: replace it with an empty match.
		if rightmost {
			return &syntax.Regexp{
				Op:    syntax.OpEmptyMatch,
				Flags: parsed.Flags,
			}, true, nil
		}
	case syntax.OpCapture:
		// because golang regexps don't allow backreferences, we don't care about maintaining capture
		// group namings and can treat captures the same as we do concatenations.
		fallthrough
	case syntax.OpConcat:
		changed := false
		// strip left-most '^'; the first sub-expression is also right-most only when it is
		// the sole sub-expression.
		if l := len(parsed.Sub); leftmost && l > 0 {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost && l == 1)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[0] = newRe
				changed = true
			}
		}
		// strip right-most '$'; the last sub-expression is also left-most only when it is
		// the sole sub-expression.
		if l := len(parsed.Sub); rightmost && l > 0 {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[l-1], leftmost && l == 1, rightmost)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[l-1] = newRe
				changed = true
			}
		}
		return parsed, changed, nil
	case syntax.OpAlternate:
		changed := false
		// strip left-most '^' and right-most '$' in each sub-expression
		for idx := range parsed.Sub {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[idx], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[idx] = newRe
				changed = true
			}
		}
		return parsed, changed, nil
	case syntax.OpQuest:
		// an optional sub-expression occurs at most once, so it can be stripped in place.
		if len(parsed.Sub) > 0 {
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if c {
				parsed.Sub[0] = newRe
				return parsed, true, nil
			}
		}
	case syntax.OpStar:
		if len(parsed.Sub) > 0 {
			// copy before recursing, as the recursion may modify parsed's sub-tree in place.
			original := deepCopy(parsed)
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if !c {
				return parsed, false, nil
			}
			// rewrite `X*` as `(X')?X*`, where X' is the stripped sub-expression and
			// `X*` is the untouched copy of the original.
			return &syntax.Regexp{
				Op:    syntax.OpConcat,
				Flags: parsed.Flags,
				Sub: []*syntax.Regexp{
					{
						Op:    syntax.OpQuest,
						Flags: parsed.Flags,
						Sub: []*syntax.Regexp{
							newRe,
						},
					},
					original,
				},
			}, true, nil
		}
	case syntax.OpPlus:
		if len(parsed.Sub) > 0 {
			// copy before recursing, as the recursion may modify parsed's sub-tree in place.
			original := deepCopy(parsed)
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if !c {
				return parsed, false, nil
			}
			// rewrite `X+` as `X'X*`, where X' is the stripped sub-expression and
			// X is the untouched copy of the original sub-expression.
			return &syntax.Regexp{
				Op:    syntax.OpConcat,
				Flags: parsed.Flags,
				Sub: []*syntax.Regexp{
					newRe,
					{
						Op:    syntax.OpStar,
						Flags: parsed.Flags,
						Sub: []*syntax.Regexp{
							original.Sub[0],
						},
					},
				},
			}, true, nil
		}
	case syntax.OpRepeat:
		// only repeats with a minimum of at least one occurrence are rewritten.
		if len(parsed.Sub) > 0 && parsed.Min > 0 {
			// copy before recursing, as the recursion may modify parsed's sub-tree in place.
			original := deepCopy(parsed)
			newRe, c, err := ensureRegexpUnanchoredHelper(parsed.Sub[0], leftmost, rightmost)
			if err != nil {
				return nil, false, err
			}
			if !c {
				return parsed, false, nil
			}
			// rewrite `X{min,max}` as `X'X{min-1,max-1}`; a Max of -1 is left as is.
			original.Min--
			if original.Max != -1 {
				original.Max--
			}
			return &syntax.Regexp{
				Op:    syntax.OpConcat,
				Flags: parsed.Flags,
				Sub: []*syntax.Regexp{
					newRe,
					original,
				},
			}, true, nil
		}
	}
	// any other op, or a case above that found nothing to strip: return the node untouched.
	return parsed, false, nil
}
   383  
   384  func deepCopy(ast *syntax.Regexp) *syntax.Regexp {
   385  	if ast == nil {
   386  		return nil
   387  	}
   388  	copied := *ast
   389  	copied.Sub = make([]*syntax.Regexp, 0, len(ast.Sub))
   390  	for _, r := range ast.Sub {
   391  		copied.Sub = append(copied.Sub, deepCopy(r))
   392  	}
   393  	if len(copied.Sub0) != 0 && copied.Sub0[0] != nil {
   394  		copied.Sub0[0] = deepCopy(copied.Sub0[0])
   395  	}
   396  	// NB(prateek): we don't copy ast.Rune (which could be a heap allocated slice) intentionally,
   397  	// because none of the transformations we apply modify the Rune slice.
   398  	return &copied
   399  }
   400  
   401  var emptyStringOps = []syntax.Op{
   402  	syntax.OpEmptyMatch, syntax.OpQuest, syntax.OpPlus, syntax.OpStar, syntax.OpRepeat,
   403  }
   404  
   405  func matchesEmptyString(ast *syntax.Regexp) bool {
   406  	if ast == nil {
   407  		return false
   408  	}
   409  	for _, op := range emptyStringOps {
   410  		if ast.Op == op {
   411  			if len(ast.Sub) > 0 {
   412  				return matchesEmptyString(ast.Sub[0])
   413  			}
   414  			return true
   415  		}
   416  	}
   417  	return false
   418  }
   419  
   420  func simplify(ast *syntax.Regexp) *syntax.Regexp {
   421  	newAst, _ := simplifyHelper(ast)
   422  	return newAst
   423  }
   424  
// simplifyHelper simplifies ast and returns the resulting node together with a
// flag reporting whether a change was made.
//
// For concatenations it: collapses a single-element concat into its only
// sub-expression, pulls the sub-expressions of nested concats up one level,
// drops sub-expressions for which matchesEmptyString is true, simplifies the
// remaining sub-expressions, and repeats on itself while any of those steps
// changed something. For every other op it simplifies each sub-expression once,
// replacing it in place when it changed. A nil input yields (nil, false).
//
// The tree is modified in place: ast.Sub (or individual entries of it) may be
// replaced.
func simplifyHelper(ast *syntax.Regexp) (*syntax.Regexp, bool) {
	if ast == nil {
		return nil, false
	}
	switch ast.Op {
	case syntax.OpConcat:
		// a concatenation of a single sub-expression is the same as the sub-expression itself
		if len(ast.Sub) == 1 {
			return ast.Sub[0], true
		}

		changed := false
		// check if we have any concats of concats, if so, we can pull the ones below this level up
		subs := make([]*syntax.Regexp, 0, len(ast.Sub))
		for _, sub := range ast.Sub {
			if sub.Op == syntax.OpConcat {
				subs = append(subs, sub.Sub...)
				changed = true
				continue
			}
			// skip any sub expressions that devolve to matching only the empty string
			if matchesEmptyString(sub) {
				changed = true
				continue
			}
			subs = append(subs, sub)
		}

		// now ensure we simplify all sub-expressions
		for idx := range subs {
			s, c := simplifyHelper(subs[idx])
			if c {
				subs[idx] = s
				changed = true
			}
		}

		// if we have made any changes to sub-expressions, need to continue simplification
		// until we are sure there are no more changes.
		// NOTE(review): the flag returned here is the one from the recursive call, so it can
		// be false even though ast.Sub was replaced on this pass — confirm callers only rely
		// on the returned node when that happens.
		if changed {
			ast.Sub = subs
			return simplifyHelper(ast)
		}
	default:
		// non-concat ops: simplify each sub-expression a single time.
		changed := false
		for idx := range ast.Sub {
			newRe, c := simplifyHelper(ast.Sub[idx])
			if c {
				ast.Sub[idx] = newRe
				changed = true
			}
		}
		return ast, changed
	}
	// concat with nothing left to simplify.
	return ast, false
}