github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/logql/log/filter.go (about)

     1  package log
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"unicode"
     7  	"unicode/utf8"
     8  
     9  	"github.com/grafana/regexp"
    10  	"github.com/grafana/regexp/syntax"
    11  
    12  	"github.com/prometheus/prometheus/model/labels"
    13  )
    14  
    15  // Filterer is a interface to filter log lines.
    16  type Filterer interface {
    17  	Filter(line []byte) bool
    18  	ToStage() Stage
    19  }
    20  
    21  // LineFilterFunc is a syntax sugar for creating line filter from a function
    22  type FiltererFunc func(line []byte) bool
    23  
    24  func (f FiltererFunc) Filter(line []byte) bool {
    25  	return f(line)
    26  }
    27  
    28  type trueFilter struct{}
    29  
    30  func (trueFilter) Filter(_ []byte) bool { return true }
    31  func (trueFilter) ToStage() Stage       { return NoopStage }
    32  
    33  // TrueFilter is a filter that returns and matches all log lines whatever their content.
    34  var TrueFilter = trueFilter{}
    35  
    36  type notFilter struct {
    37  	Filterer
    38  }
    39  
    40  func (n notFilter) Filter(line []byte) bool {
    41  	return !n.Filterer.Filter(line)
    42  }
    43  
    44  func (n notFilter) ToStage() Stage {
    45  	return StageFunc{
    46  		process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) {
    47  			return line, n.Filter(line)
    48  		},
    49  	}
    50  }
    51  
    52  // newNotFilter creates a new filter which matches only if the base filter doesn't match.
    53  // If the base filter is a `or` it will recursively simplify with `and` operations.
    54  func newNotFilter(base Filterer) Filterer {
    55  	// not(a|b) = not(a) and not(b) , and operation can't benefit from this optimization because both legs always needs to be executed.
    56  	if or, ok := base.(orFilter); ok {
    57  		return NewAndFilter(newNotFilter(or.left), newNotFilter(or.right))
    58  	}
    59  	return notFilter{Filterer: base}
    60  }
    61  
    62  type andFilter struct {
    63  	left  Filterer
    64  	right Filterer
    65  }
    66  
    67  // NewAndFilter creates a new filter which matches only if left and right matches.
    68  func NewAndFilter(left Filterer, right Filterer) Filterer {
    69  	// Make sure we take care of panics in case a nil or noop filter is passed.
    70  	if right == nil || right == TrueFilter {
    71  		return left
    72  	}
    73  
    74  	if left == nil || left == TrueFilter {
    75  		return right
    76  	}
    77  
    78  	return andFilter{
    79  		left:  left,
    80  		right: right,
    81  	}
    82  }
    83  
    84  func (a andFilter) Filter(line []byte) bool {
    85  	return a.left.Filter(line) && a.right.Filter(line)
    86  }
    87  
    88  func (a andFilter) ToStage() Stage {
    89  	return StageFunc{
    90  		process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) {
    91  			return line, a.Filter(line)
    92  		},
    93  	}
    94  }
    95  
    96  type andFilters struct {
    97  	filters []Filterer
    98  }
    99  
   100  // NewAndFilters creates a new filter which matches only if all filters match
   101  func NewAndFilters(filters []Filterer) Filterer {
   102  	var containsFilterAcc *containsAllFilter
   103  	regexpFilters := make([]Filterer, 0)
   104  	n := 0
   105  	for _, filter := range filters {
   106  		// Make sure we take care of panics in case a nil or noop filter is passed.
   107  		if !(filter == nil || filter == TrueFilter) {
   108  			switch c := filter.(type) {
   109  			case *containsFilter:
   110  				// Start accumulating contains filters.
   111  				if containsFilterAcc == nil {
   112  					containsFilterAcc = &containsAllFilter{}
   113  				}
   114  
   115  				// Join all contain filters.
   116  				containsFilterAcc.Add(*c)
   117  			case regexpFilter:
   118  				regexpFilters = append(regexpFilters, c)
   119  
   120  			default:
   121  				// Finish accumulating contains filters.
   122  				if containsFilterAcc != nil {
   123  					filters[n] = containsFilterAcc
   124  					n++
   125  					containsFilterAcc = nil
   126  				}
   127  
   128  				// Keep filter
   129  				filters[n] = filter
   130  				n++
   131  			}
   132  		}
   133  	}
   134  	filters = filters[:n]
   135  
   136  	if containsFilterAcc != nil {
   137  		filters = append(filters, containsFilterAcc)
   138  	}
   139  
   140  	// Push regex filters to end
   141  	if len(regexpFilters) > 0 {
   142  		filters = append(filters, regexpFilters...)
   143  	}
   144  
   145  	if len(filters) == 0 {
   146  		return TrueFilter
   147  	} else if len(filters) == 1 {
   148  		return filters[0]
   149  	}
   150  
   151  	return andFilters{
   152  		filters: filters,
   153  	}
   154  }
   155  
   156  func (a andFilters) Filter(line []byte) bool {
   157  	for _, filter := range a.filters {
   158  		if !filter.Filter(line) {
   159  			return false
   160  		}
   161  	}
   162  	return true
   163  }
   164  
   165  func (a andFilters) ToStage() Stage {
   166  	return StageFunc{
   167  		process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) {
   168  			return line, a.Filter(line)
   169  		},
   170  	}
   171  }
   172  
   173  type orFilter struct {
   174  	left  Filterer
   175  	right Filterer
   176  }
   177  
   178  // newOrFilter creates a new filter which matches only if left or right matches.
   179  func newOrFilter(left Filterer, right Filterer) Filterer {
   180  	if left == nil || left == TrueFilter {
   181  		return right
   182  	}
   183  
   184  	if right == nil || right == TrueFilter {
   185  		return left
   186  	}
   187  
   188  	return orFilter{
   189  		left:  left,
   190  		right: right,
   191  	}
   192  }
   193  
   194  // chainOrFilter is a syntax sugar to chain multiple `or` filters. (1 or many)
   195  func chainOrFilter(curr, new Filterer) Filterer {
   196  	if curr == nil {
   197  		return new
   198  	}
   199  	return newOrFilter(curr, new)
   200  }
   201  
   202  func (a orFilter) Filter(line []byte) bool {
   203  	return a.left.Filter(line) || a.right.Filter(line)
   204  }
   205  
   206  func (a orFilter) ToStage() Stage {
   207  	return StageFunc{
   208  		process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) {
   209  			return line, a.Filter(line)
   210  		},
   211  	}
   212  }
   213  
   214  type regexpFilter struct {
   215  	*regexp.Regexp
   216  }
   217  
   218  // newRegexpFilter creates a new line filter for a given regexp.
   219  // If match is false the filter is the negation of the regexp.
   220  func newRegexpFilter(re string, match bool) (Filterer, error) {
   221  	reg, err := regexp.Compile(re)
   222  	if err != nil {
   223  		return nil, err
   224  	}
   225  	f := regexpFilter{reg}
   226  	if match {
   227  		return f, nil
   228  	}
   229  	return newNotFilter(f), nil
   230  }
   231  
   232  func (r regexpFilter) Filter(line []byte) bool {
   233  	return r.Match(line)
   234  }
   235  
   236  func (r regexpFilter) ToStage() Stage {
   237  	return StageFunc{
   238  		process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) {
   239  			return line, r.Filter(line)
   240  		},
   241  	}
   242  }
   243  
   244  type containsFilter struct {
   245  	match           []byte
   246  	caseInsensitive bool
   247  }
   248  
   249  func (l *containsFilter) Filter(line []byte) bool {
   250  	return contains(line, l.match, l.caseInsensitive)
   251  }
   252  
   253  func contains(line, substr []byte, caseInsensitive bool) bool {
   254  	if !caseInsensitive {
   255  		return bytes.Contains(line, substr)
   256  	}
   257  	return containsLower(line, substr)
   258  }
   259  
   260  func containsLower(line, substr []byte) bool {
   261  	if len(substr) == 0 {
   262  		return true
   263  	}
   264  	if len(substr) > len(line) {
   265  		return false
   266  	}
   267  	j := 0
   268  	for len(line) > 0 {
   269  		// ascii fast case
   270  		if c := line[0]; c < utf8.RuneSelf {
   271  			if c == substr[j] || c+'a'-'A' == substr[j] {
   272  				j++
   273  				if j == len(substr) {
   274  					return true
   275  				}
   276  				line = line[1:]
   277  				continue
   278  			}
   279  			line = line[1:]
   280  			j = 0
   281  			continue
   282  		}
   283  		// unicode slow case
   284  		lr, lwid := utf8.DecodeRune(line)
   285  		mr, mwid := utf8.DecodeRune(substr[j:])
   286  		if lr == mr || mr == unicode.To(unicode.LowerCase, lr) {
   287  			j += mwid
   288  			if j == len(substr) {
   289  				return true
   290  			}
   291  			line = line[lwid:]
   292  			continue
   293  		}
   294  		line = line[lwid:]
   295  		j = 0
   296  	}
   297  	return false
   298  }
   299  
   300  func (l containsFilter) ToStage() Stage {
   301  	return StageFunc{
   302  		process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) {
   303  			return line, l.Filter(line)
   304  		},
   305  	}
   306  }
   307  
   308  func (l containsFilter) String() string {
   309  	return string(l.match)
   310  }
   311  
   312  // newContainsFilter creates a contains filter that checks if a log line contains a match.
   313  func newContainsFilter(match []byte, caseInsensitive bool) Filterer {
   314  	if len(match) == 0 {
   315  		return TrueFilter
   316  	}
   317  	if caseInsensitive {
   318  		match = bytes.ToLower(match)
   319  	}
   320  	return &containsFilter{
   321  		match:           match,
   322  		caseInsensitive: caseInsensitive,
   323  	}
   324  }
   325  
   326  type containsAllFilter struct {
   327  	matches []containsFilter
   328  }
   329  
   330  func (f *containsAllFilter) Add(filter containsFilter) {
   331  	f.matches = append(f.matches, filter)
   332  }
   333  
   334  func (f *containsAllFilter) Empty() bool {
   335  	return len(f.matches) == 0
   336  }
   337  
   338  func (f containsAllFilter) Filter(line []byte) bool {
   339  	for _, m := range f.matches {
   340  		if !contains(line, m.match, m.caseInsensitive) {
   341  			return false
   342  		}
   343  	}
   344  	return true
   345  }
   346  
   347  func (f containsAllFilter) ToStage() Stage {
   348  	return StageFunc{
   349  		process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) {
   350  			return line, f.Filter(line)
   351  		},
   352  	}
   353  }
   354  
   355  // NewFilter creates a new line filter from a match string and type.
   356  func NewFilter(match string, mt labels.MatchType) (Filterer, error) {
   357  	switch mt {
   358  	case labels.MatchRegexp:
   359  		return parseRegexpFilter(match, true)
   360  	case labels.MatchNotRegexp:
   361  		return parseRegexpFilter(match, false)
   362  	case labels.MatchEqual:
   363  		return newContainsFilter([]byte(match), false), nil
   364  	case labels.MatchNotEqual:
   365  		return newNotFilter(newContainsFilter([]byte(match), false)), nil
   366  	default:
   367  		return nil, fmt.Errorf("unknown matcher: %v", match)
   368  	}
   369  }
   370  
   371  // parseRegexpFilter parses a regexp and attempt to simplify it with only literal filters.
   372  // If not possible it will returns the original regexp filter.
   373  func parseRegexpFilter(re string, match bool) (Filterer, error) {
   374  	reg, err := syntax.Parse(re, syntax.Perl)
   375  	if err != nil {
   376  		return nil, err
   377  	}
   378  	reg = reg.Simplify()
   379  
   380  	// attempt to improve regex with tricks
   381  	f, ok := simplify(reg)
   382  	if !ok {
   383  		allNonGreedy(reg)
   384  		return newRegexpFilter(reg.String(), match)
   385  	}
   386  	if match {
   387  		return f, nil
   388  	}
   389  	return newNotFilter(f), nil
   390  }
   391  
   392  // allNonGreedy turns greedy quantifiers such as `.*` and `.+` into non-greedy ones. This is the same effect as writing
   393  // `.*?` and `.+?`. This is only safe because we use `Match`. If we were to find the exact position and length of the match
   394  // we would not be allowed to make this optimization. `Match` can return quicker because it is not looking for the longest match.
   395  // Prepending the expression with `(?U)` or passing `NonGreedy` to the expression compiler is not enough since it will
   396  // just negate `.*` and `.*?`.
   397  func allNonGreedy(regs ...*syntax.Regexp) {
   398  	clearCapture(regs...)
   399  	for _, re := range regs {
   400  		switch re.Op {
   401  		case syntax.OpCapture, syntax.OpConcat, syntax.OpAlternate:
   402  			allNonGreedy(re.Sub...)
   403  		case syntax.OpStar, syntax.OpPlus:
   404  			re.Flags = re.Flags | syntax.NonGreedy
   405  		default:
   406  			continue
   407  		}
   408  	}
   409  }
   410  
   411  // simplify a regexp expression by replacing it, when possible, with a succession of literal filters.
   412  // For example `(foo|bar)` will be replaced by  `containsFilter(foo) or containsFilter(bar)`
   413  func simplify(reg *syntax.Regexp) (Filterer, bool) {
   414  	switch reg.Op {
   415  	case syntax.OpAlternate:
   416  		return simplifyAlternate(reg)
   417  	case syntax.OpConcat:
   418  		return simplifyConcat(reg, nil)
   419  	case syntax.OpCapture:
   420  		clearCapture(reg)
   421  		return simplify(reg)
   422  	case syntax.OpLiteral:
   423  		return newContainsFilter([]byte(string((reg.Rune))), isCaseInsensitive(reg)), true
   424  	case syntax.OpStar:
   425  		if reg.Sub[0].Op == syntax.OpAnyCharNotNL {
   426  			return TrueFilter, true
   427  		}
   428  	case syntax.OpEmptyMatch:
   429  		return TrueFilter, true
   430  	}
   431  	return nil, false
   432  }
   433  
   434  func isCaseInsensitive(reg *syntax.Regexp) bool {
   435  	return (reg.Flags & syntax.FoldCase) != 0
   436  }
   437  
   438  // clearCapture removes capture operation as they are not used for filtering.
   439  func clearCapture(regs ...*syntax.Regexp) {
   440  	for _, r := range regs {
   441  		if r.Op == syntax.OpCapture {
   442  			*r = *r.Sub[0]
   443  		}
   444  	}
   445  }
   446  
   447  // simplifyAlternate simplifies, when possible, alternate regexp expressions such as:
   448  // (foo|bar) or (foo|(bar|buzz)).
   449  func simplifyAlternate(reg *syntax.Regexp) (Filterer, bool) {
   450  	clearCapture(reg.Sub...)
   451  	// attempt to simplify the first leg
   452  	f, ok := simplify(reg.Sub[0])
   453  	if !ok {
   454  		return nil, false
   455  	}
   456  	// merge the rest of the legs
   457  	for i := 1; i < len(reg.Sub); i++ {
   458  		f2, ok := simplify(reg.Sub[i])
   459  		if !ok {
   460  			return nil, false
   461  		}
   462  		f = newOrFilter(f, f2)
   463  	}
   464  	return f, true
   465  }
   466  
   467  // simplifyConcat attempt to simplify concat operations.
   468  // Concat operations are either literal and star such as foo.* .*foo.* .*foo
   469  // which is a literalFilter.
   470  // Or a literal and alternates operation (see simplifyConcatAlternate), which represent a multiplication of alternates.
   471  // Anything else is rejected.
   472  func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) {
   473  	clearCapture(reg.Sub...)
   474  	// remove empty match as we don't need them for filtering
   475  	i := 0
   476  	for _, r := range reg.Sub {
   477  		if r.Op == syntax.OpEmptyMatch {
   478  			continue
   479  		}
   480  		reg.Sub[i] = r
   481  		i++
   482  	}
   483  	reg.Sub = reg.Sub[:i]
   484  	// we support only simplication of concat operation with 3 sub expressions.
   485  	// for instance .*foo.*bar contains 4 subs (.*+foo+.*+bar) and can't be simplified.
   486  	if len(reg.Sub) > 3 {
   487  		return nil, false
   488  	}
   489  
   490  	var curr Filterer
   491  	var ok bool
   492  	literals := 0
   493  	for _, sub := range reg.Sub {
   494  		if sub.Op == syntax.OpLiteral {
   495  			// only one literal is allowed.
   496  			if literals != 0 {
   497  				return nil, false
   498  			}
   499  			literals++
   500  			baseLiteral = append(baseLiteral, []byte(string(sub.Rune))...)
   501  			continue
   502  		}
   503  		// if we have an alternate we must also have a base literal to apply the concatenation with.
   504  		if sub.Op == syntax.OpAlternate && baseLiteral != nil {
   505  			if curr, ok = simplifyConcatAlternate(sub, baseLiteral, curr); !ok {
   506  				return nil, false
   507  			}
   508  			continue
   509  		}
   510  		if sub.Op == syntax.OpStar && sub.Sub[0].Op == syntax.OpAnyCharNotNL {
   511  			continue
   512  		}
   513  		return nil, false
   514  	}
   515  
   516  	// if we have a filter from concat alternates.
   517  	if curr != nil {
   518  		return curr, true
   519  	}
   520  
   521  	// if we have only a concat with literals.
   522  	if baseLiteral != nil {
   523  		return newContainsFilter(baseLiteral, isCaseInsensitive(reg)), true
   524  	}
   525  
   526  	return nil, false
   527  }
   528  
   529  // simplifyConcatAlternate simplifies concat alternate operations.
   530  // A concat alternate is found when a concat operation has a sub alternate and is preceded by a literal.
   531  // For instance bar|b|buzz is expressed as b(ar|(?:)|uzz) => b concat alternate(ar,(?:),uzz).
   532  // (?:) being an OpEmptyMatch and b being the literal to concat all alternates (ar,(?:),uzz) with.
   533  func simplifyConcatAlternate(reg *syntax.Regexp, literal []byte, curr Filterer) (Filterer, bool) {
   534  	for _, alt := range reg.Sub {
   535  		switch alt.Op {
   536  		case syntax.OpEmptyMatch:
   537  			curr = chainOrFilter(curr, newContainsFilter(literal, isCaseInsensitive(reg)))
   538  		case syntax.OpLiteral:
   539  			// concat the root literal with the alternate one.
   540  			altBytes := []byte(string(alt.Rune))
   541  			altLiteral := make([]byte, 0, len(literal)+len(altBytes))
   542  			altLiteral = append(altLiteral, literal...)
   543  			altLiteral = append(altLiteral, altBytes...)
   544  			curr = chainOrFilter(curr, newContainsFilter(altLiteral, isCaseInsensitive(reg)))
   545  		case syntax.OpConcat:
   546  			f, ok := simplifyConcat(alt, literal)
   547  			if !ok {
   548  				return nil, false
   549  			}
   550  			curr = chainOrFilter(curr, f)
   551  		case syntax.OpStar:
   552  			if alt.Sub[0].Op != syntax.OpAnyCharNotNL {
   553  				return nil, false
   554  			}
   555  			curr = chainOrFilter(curr, newContainsFilter(literal, isCaseInsensitive(reg)))
   556  		default:
   557  			return nil, false
   558  		}
   559  	}
   560  	if curr != nil {
   561  		return curr, true
   562  	}
   563  	return nil, false
   564  }