// Source: github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/logql/log/filter.go

package log

import (
	"bytes"
	"fmt"
	"unicode"
	"unicode/utf8"

	"github.com/grafana/regexp"
	"github.com/grafana/regexp/syntax"

	"github.com/prometheus/prometheus/model/labels"
)

// Filterer is an interface to filter log lines.
type Filterer interface {
	// Filter reports whether the filter matches the given log line.
	Filter(line []byte) bool
	// ToStage returns a Stage built from the filter.
	ToStage() Stage
}

// FiltererFunc is a syntax sugar for creating a line filter from a function.
// Only the Filter method is defined for it here.
type FiltererFunc func(line []byte) bool

// Filter calls f with the line and returns its result.
func (f FiltererFunc) Filter(line []byte) bool {
	return f(line)
}

// trueFilter is a filter that matches every line.
type trueFilter struct{}

// Filter always returns true, whatever the content of the line.
func (trueFilter) Filter(_ []byte) bool { return true }

// ToStage returns NoopStage.
func (trueFilter) ToStage() Stage { return NoopStage }

// TrueFilter is a filter that returns and matches all log lines whatever their content.
var TrueFilter = trueFilter{}

// notFilter negates the result of the embedded Filterer.
type notFilter struct {
	Filterer
}

// Filter returns true only when the wrapped filter does not match the line.
func (n notFilter) Filter(line []byte) bool {
	return !n.Filterer.Filter(line)
}

// ToStage returns a StageFunc whose process function hands the line back
// unchanged together with the (negated) filter result.
func (n notFilter) ToStage() Stage {
	return StageFunc{
		process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) {
			return line, n.Filter(line)
		},
	}
}

// newNotFilter creates a new filter which matches only if the base filter doesn't match.
// If the base filter is an `or` it will recursively simplify with `and` operations.
func newNotFilter(base Filterer) Filterer {
	// not(a|b) = not(a) and not(b). The `and` operation can't benefit from this
	// optimization because both legs always need to be executed.
	if or, ok := base.(orFilter); ok {
		return NewAndFilter(newNotFilter(or.left), newNotFilter(or.right))
	}
	return notFilter{Filterer: base}
}

// andFilter matches a line only when both its left and right filters match.
type andFilter struct {
	left  Filterer
	right Filterer
}

// NewAndFilter creates a new filter which matches only if both left and right match.
68 func NewAndFilter(left Filterer, right Filterer) Filterer { 69 // Make sure we take care of panics in case a nil or noop filter is passed. 70 if right == nil || right == TrueFilter { 71 return left 72 } 73 74 if left == nil || left == TrueFilter { 75 return right 76 } 77 78 return andFilter{ 79 left: left, 80 right: right, 81 } 82 } 83 84 func (a andFilter) Filter(line []byte) bool { 85 return a.left.Filter(line) && a.right.Filter(line) 86 } 87 88 func (a andFilter) ToStage() Stage { 89 return StageFunc{ 90 process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) { 91 return line, a.Filter(line) 92 }, 93 } 94 } 95 96 type andFilters struct { 97 filters []Filterer 98 } 99 100 // NewAndFilters creates a new filter which matches only if all filters match 101 func NewAndFilters(filters []Filterer) Filterer { 102 var containsFilterAcc *containsAllFilter 103 regexpFilters := make([]Filterer, 0) 104 n := 0 105 for _, filter := range filters { 106 // Make sure we take care of panics in case a nil or noop filter is passed. 107 if !(filter == nil || filter == TrueFilter) { 108 switch c := filter.(type) { 109 case *containsFilter: 110 // Start accumulating contains filters. 111 if containsFilterAcc == nil { 112 containsFilterAcc = &containsAllFilter{} 113 } 114 115 // Join all contain filters. 116 containsFilterAcc.Add(*c) 117 case regexpFilter: 118 regexpFilters = append(regexpFilters, c) 119 120 default: 121 // Finish accumulating contains filters. 122 if containsFilterAcc != nil { 123 filters[n] = containsFilterAcc 124 n++ 125 containsFilterAcc = nil 126 } 127 128 // Keep filter 129 filters[n] = filter 130 n++ 131 } 132 } 133 } 134 filters = filters[:n] 135 136 if containsFilterAcc != nil { 137 filters = append(filters, containsFilterAcc) 138 } 139 140 // Push regex filters to end 141 if len(regexpFilters) > 0 { 142 filters = append(filters, regexpFilters...) 
143 } 144 145 if len(filters) == 0 { 146 return TrueFilter 147 } else if len(filters) == 1 { 148 return filters[0] 149 } 150 151 return andFilters{ 152 filters: filters, 153 } 154 } 155 156 func (a andFilters) Filter(line []byte) bool { 157 for _, filter := range a.filters { 158 if !filter.Filter(line) { 159 return false 160 } 161 } 162 return true 163 } 164 165 func (a andFilters) ToStage() Stage { 166 return StageFunc{ 167 process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) { 168 return line, a.Filter(line) 169 }, 170 } 171 } 172 173 type orFilter struct { 174 left Filterer 175 right Filterer 176 } 177 178 // newOrFilter creates a new filter which matches only if left or right matches. 179 func newOrFilter(left Filterer, right Filterer) Filterer { 180 if left == nil || left == TrueFilter { 181 return right 182 } 183 184 if right == nil || right == TrueFilter { 185 return left 186 } 187 188 return orFilter{ 189 left: left, 190 right: right, 191 } 192 } 193 194 // chainOrFilter is a syntax sugar to chain multiple `or` filters. (1 or many) 195 func chainOrFilter(curr, new Filterer) Filterer { 196 if curr == nil { 197 return new 198 } 199 return newOrFilter(curr, new) 200 } 201 202 func (a orFilter) Filter(line []byte) bool { 203 return a.left.Filter(line) || a.right.Filter(line) 204 } 205 206 func (a orFilter) ToStage() Stage { 207 return StageFunc{ 208 process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) { 209 return line, a.Filter(line) 210 }, 211 } 212 } 213 214 type regexpFilter struct { 215 *regexp.Regexp 216 } 217 218 // newRegexpFilter creates a new line filter for a given regexp. 219 // If match is false the filter is the negation of the regexp. 
220 func newRegexpFilter(re string, match bool) (Filterer, error) { 221 reg, err := regexp.Compile(re) 222 if err != nil { 223 return nil, err 224 } 225 f := regexpFilter{reg} 226 if match { 227 return f, nil 228 } 229 return newNotFilter(f), nil 230 } 231 232 func (r regexpFilter) Filter(line []byte) bool { 233 return r.Match(line) 234 } 235 236 func (r regexpFilter) ToStage() Stage { 237 return StageFunc{ 238 process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) { 239 return line, r.Filter(line) 240 }, 241 } 242 } 243 244 type containsFilter struct { 245 match []byte 246 caseInsensitive bool 247 } 248 249 func (l *containsFilter) Filter(line []byte) bool { 250 return contains(line, l.match, l.caseInsensitive) 251 } 252 253 func contains(line, substr []byte, caseInsensitive bool) bool { 254 if !caseInsensitive { 255 return bytes.Contains(line, substr) 256 } 257 return containsLower(line, substr) 258 } 259 260 func containsLower(line, substr []byte) bool { 261 if len(substr) == 0 { 262 return true 263 } 264 if len(substr) > len(line) { 265 return false 266 } 267 j := 0 268 for len(line) > 0 { 269 // ascii fast case 270 if c := line[0]; c < utf8.RuneSelf { 271 if c == substr[j] || c+'a'-'A' == substr[j] { 272 j++ 273 if j == len(substr) { 274 return true 275 } 276 line = line[1:] 277 continue 278 } 279 line = line[1:] 280 j = 0 281 continue 282 } 283 // unicode slow case 284 lr, lwid := utf8.DecodeRune(line) 285 mr, mwid := utf8.DecodeRune(substr[j:]) 286 if lr == mr || mr == unicode.To(unicode.LowerCase, lr) { 287 j += mwid 288 if j == len(substr) { 289 return true 290 } 291 line = line[lwid:] 292 continue 293 } 294 line = line[lwid:] 295 j = 0 296 } 297 return false 298 } 299 300 func (l containsFilter) ToStage() Stage { 301 return StageFunc{ 302 process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) { 303 return line, l.Filter(line) 304 }, 305 } 306 } 307 308 func (l containsFilter) String() string { 309 return string(l.match) 310 } 
311 312 // newContainsFilter creates a contains filter that checks if a log line contains a match. 313 func newContainsFilter(match []byte, caseInsensitive bool) Filterer { 314 if len(match) == 0 { 315 return TrueFilter 316 } 317 if caseInsensitive { 318 match = bytes.ToLower(match) 319 } 320 return &containsFilter{ 321 match: match, 322 caseInsensitive: caseInsensitive, 323 } 324 } 325 326 type containsAllFilter struct { 327 matches []containsFilter 328 } 329 330 func (f *containsAllFilter) Add(filter containsFilter) { 331 f.matches = append(f.matches, filter) 332 } 333 334 func (f *containsAllFilter) Empty() bool { 335 return len(f.matches) == 0 336 } 337 338 func (f containsAllFilter) Filter(line []byte) bool { 339 for _, m := range f.matches { 340 if !contains(line, m.match, m.caseInsensitive) { 341 return false 342 } 343 } 344 return true 345 } 346 347 func (f containsAllFilter) ToStage() Stage { 348 return StageFunc{ 349 process: func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) { 350 return line, f.Filter(line) 351 }, 352 } 353 } 354 355 // NewFilter creates a new line filter from a match string and type. 356 func NewFilter(match string, mt labels.MatchType) (Filterer, error) { 357 switch mt { 358 case labels.MatchRegexp: 359 return parseRegexpFilter(match, true) 360 case labels.MatchNotRegexp: 361 return parseRegexpFilter(match, false) 362 case labels.MatchEqual: 363 return newContainsFilter([]byte(match), false), nil 364 case labels.MatchNotEqual: 365 return newNotFilter(newContainsFilter([]byte(match), false)), nil 366 default: 367 return nil, fmt.Errorf("unknown matcher: %v", match) 368 } 369 } 370 371 // parseRegexpFilter parses a regexp and attempt to simplify it with only literal filters. 372 // If not possible it will returns the original regexp filter. 
373 func parseRegexpFilter(re string, match bool) (Filterer, error) { 374 reg, err := syntax.Parse(re, syntax.Perl) 375 if err != nil { 376 return nil, err 377 } 378 reg = reg.Simplify() 379 380 // attempt to improve regex with tricks 381 f, ok := simplify(reg) 382 if !ok { 383 allNonGreedy(reg) 384 return newRegexpFilter(reg.String(), match) 385 } 386 if match { 387 return f, nil 388 } 389 return newNotFilter(f), nil 390 } 391 392 // allNonGreedy turns greedy quantifiers such as `.*` and `.+` into non-greedy ones. This is the same effect as writing 393 // `.*?` and `.+?`. This is only safe because we use `Match`. If we were to find the exact position and length of the match 394 // we would not be allowed to make this optimization. `Match` can return quicker because it is not looking for the longest match. 395 // Prepending the expression with `(?U)` or passing `NonGreedy` to the expression compiler is not enough since it will 396 // just negate `.*` and `.*?`. 397 func allNonGreedy(regs ...*syntax.Regexp) { 398 clearCapture(regs...) 399 for _, re := range regs { 400 switch re.Op { 401 case syntax.OpCapture, syntax.OpConcat, syntax.OpAlternate: 402 allNonGreedy(re.Sub...) 403 case syntax.OpStar, syntax.OpPlus: 404 re.Flags = re.Flags | syntax.NonGreedy 405 default: 406 continue 407 } 408 } 409 } 410 411 // simplify a regexp expression by replacing it, when possible, with a succession of literal filters. 
412 // For example `(foo|bar)` will be replaced by `containsFilter(foo) or containsFilter(bar)` 413 func simplify(reg *syntax.Regexp) (Filterer, bool) { 414 switch reg.Op { 415 case syntax.OpAlternate: 416 return simplifyAlternate(reg) 417 case syntax.OpConcat: 418 return simplifyConcat(reg, nil) 419 case syntax.OpCapture: 420 clearCapture(reg) 421 return simplify(reg) 422 case syntax.OpLiteral: 423 return newContainsFilter([]byte(string((reg.Rune))), isCaseInsensitive(reg)), true 424 case syntax.OpStar: 425 if reg.Sub[0].Op == syntax.OpAnyCharNotNL { 426 return TrueFilter, true 427 } 428 case syntax.OpEmptyMatch: 429 return TrueFilter, true 430 } 431 return nil, false 432 } 433 434 func isCaseInsensitive(reg *syntax.Regexp) bool { 435 return (reg.Flags & syntax.FoldCase) != 0 436 } 437 438 // clearCapture removes capture operation as they are not used for filtering. 439 func clearCapture(regs ...*syntax.Regexp) { 440 for _, r := range regs { 441 if r.Op == syntax.OpCapture { 442 *r = *r.Sub[0] 443 } 444 } 445 } 446 447 // simplifyAlternate simplifies, when possible, alternate regexp expressions such as: 448 // (foo|bar) or (foo|(bar|buzz)). 449 func simplifyAlternate(reg *syntax.Regexp) (Filterer, bool) { 450 clearCapture(reg.Sub...) 451 // attempt to simplify the first leg 452 f, ok := simplify(reg.Sub[0]) 453 if !ok { 454 return nil, false 455 } 456 // merge the rest of the legs 457 for i := 1; i < len(reg.Sub); i++ { 458 f2, ok := simplify(reg.Sub[i]) 459 if !ok { 460 return nil, false 461 } 462 f = newOrFilter(f, f2) 463 } 464 return f, true 465 } 466 467 // simplifyConcat attempt to simplify concat operations. 468 // Concat operations are either literal and star such as foo.* .*foo.* .*foo 469 // which is a literalFilter. 470 // Or a literal and alternates operation (see simplifyConcatAlternate), which represent a multiplication of alternates. 471 // Anything else is rejected. 
472 func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (Filterer, bool) { 473 clearCapture(reg.Sub...) 474 // remove empty match as we don't need them for filtering 475 i := 0 476 for _, r := range reg.Sub { 477 if r.Op == syntax.OpEmptyMatch { 478 continue 479 } 480 reg.Sub[i] = r 481 i++ 482 } 483 reg.Sub = reg.Sub[:i] 484 // we support only simplication of concat operation with 3 sub expressions. 485 // for instance .*foo.*bar contains 4 subs (.*+foo+.*+bar) and can't be simplified. 486 if len(reg.Sub) > 3 { 487 return nil, false 488 } 489 490 var curr Filterer 491 var ok bool 492 literals := 0 493 for _, sub := range reg.Sub { 494 if sub.Op == syntax.OpLiteral { 495 // only one literal is allowed. 496 if literals != 0 { 497 return nil, false 498 } 499 literals++ 500 baseLiteral = append(baseLiteral, []byte(string(sub.Rune))...) 501 continue 502 } 503 // if we have an alternate we must also have a base literal to apply the concatenation with. 504 if sub.Op == syntax.OpAlternate && baseLiteral != nil { 505 if curr, ok = simplifyConcatAlternate(sub, baseLiteral, curr); !ok { 506 return nil, false 507 } 508 continue 509 } 510 if sub.Op == syntax.OpStar && sub.Sub[0].Op == syntax.OpAnyCharNotNL { 511 continue 512 } 513 return nil, false 514 } 515 516 // if we have a filter from concat alternates. 517 if curr != nil { 518 return curr, true 519 } 520 521 // if we have only a concat with literals. 522 if baseLiteral != nil { 523 return newContainsFilter(baseLiteral, isCaseInsensitive(reg)), true 524 } 525 526 return nil, false 527 } 528 529 // simplifyConcatAlternate simplifies concat alternate operations. 530 // A concat alternate is found when a concat operation has a sub alternate and is preceded by a literal. 531 // For instance bar|b|buzz is expressed as b(ar|(?:)|uzz) => b concat alternate(ar,(?:),uzz). 532 // (?:) being an OpEmptyMatch and b being the literal to concat all alternates (ar,(?:),uzz) with. 
533 func simplifyConcatAlternate(reg *syntax.Regexp, literal []byte, curr Filterer) (Filterer, bool) { 534 for _, alt := range reg.Sub { 535 switch alt.Op { 536 case syntax.OpEmptyMatch: 537 curr = chainOrFilter(curr, newContainsFilter(literal, isCaseInsensitive(reg))) 538 case syntax.OpLiteral: 539 // concat the root literal with the alternate one. 540 altBytes := []byte(string(alt.Rune)) 541 altLiteral := make([]byte, 0, len(literal)+len(altBytes)) 542 altLiteral = append(altLiteral, literal...) 543 altLiteral = append(altLiteral, altBytes...) 544 curr = chainOrFilter(curr, newContainsFilter(altLiteral, isCaseInsensitive(reg))) 545 case syntax.OpConcat: 546 f, ok := simplifyConcat(alt, literal) 547 if !ok { 548 return nil, false 549 } 550 curr = chainOrFilter(curr, f) 551 case syntax.OpStar: 552 if alt.Sub[0].Op != syntax.OpAnyCharNotNL { 553 return nil, false 554 } 555 curr = chainOrFilter(curr, newContainsFilter(literal, isCaseInsensitive(reg))) 556 default: 557 return nil, false 558 } 559 } 560 if curr != nil { 561 return curr, true 562 } 563 return nil, false 564 }