github.com/Jeffail/benthos/v3@v3.65.0/lib/processor/grok.go (about)

     1  package processor
     2  
     3  import (
     4  	"bufio"
     5  	"errors"
     6  	"fmt"
     7  	"os"
     8  	"strings"
     9  	"time"
    10  
    11  	"github.com/Jeffail/benthos/v3/internal/docs"
    12  	"github.com/Jeffail/benthos/v3/internal/filepath"
    13  	"github.com/Jeffail/benthos/v3/internal/tracing"
    14  	"github.com/Jeffail/benthos/v3/lib/log"
    15  	"github.com/Jeffail/benthos/v3/lib/metrics"
    16  	"github.com/Jeffail/benthos/v3/lib/types"
    17  	"github.com/Jeffail/gabs/v2"
    18  	"github.com/Jeffail/grok"
    19  )
    20  
    21  //------------------------------------------------------------------------------
    22  
// init registers the grok processor in the package-level Constructors table
// under TypeGrok, pairing the NewGrok constructor with the documentation
// (summary, description, field specs, example and footnotes) declared below.
func init() {
	Constructors[TypeGrok] = TypeSpec{
		constructor: NewGrok,
		Categories: []Category{
			CategoryParsing,
		},
		Summary: `
Parses messages into a structured format by attempting to apply a list of Grok expressions, the first expression to result in at least one value replaces the original message with a JSON object containing the values.`,
		Description: `
Type hints within patterns are respected, therefore with the pattern ` + "`%{WORD:first},%{INT:second:int}`" + ` and a payload of ` + "`foo,1`" + ` the resulting payload would be ` + "`{\"first\":\"foo\",\"second\":1}`" + `.

### Performance

This processor currently uses the [Go RE2](https://golang.org/s/re2syntax) regular expression engine, which is guaranteed to run in time linear to the size of the input. However, this property often makes it less performant than PCRE based implementations of grok. For more information see [https://swtch.com/~rsc/regexp/regexp1.html](https://swtch.com/~rsc/regexp/regexp1.html).`,
		// One spec per config field; the names match the json/yaml tags on
		// GrokConfig. `patterns` and `output_format` are listed as deprecated.
		FieldSpecs: docs.FieldSpecs{
			docs.FieldString("expressions", "One or more Grok expressions to attempt against incoming messages. The first expression to match at least one value will be used to form a result.").Array(),
			docs.FieldString("pattern_definitions", "A map of pattern definitions that can be referenced within `patterns`.").Map(),
			docs.FieldString("pattern_paths", "A list of paths to load Grok patterns from. This field supports wildcards, including super globs (double star).").Array(),
			docs.FieldAdvanced("named_captures_only", "Whether to only capture values from named patterns."),
			docs.FieldAdvanced("use_default_patterns", "Whether to use a [default set of patterns](#default-patterns)."),
			docs.FieldAdvanced("remove_empty_values", "Whether to remove values that are empty from the resulting structure."),
			docs.FieldDeprecated("patterns").Array(),
			docs.FieldDeprecated("output_format"),
			PartsFieldSpec,
		},
		// A single worked example: a sample log line, the JSON object it is
		// parsed into, and the config that performs the parse.
		Examples: []docs.AnnotatedExample{
			{
				Title: "VPC Flow Logs",
				Summary: `
Grok can be used to parse unstructured logs such as VPC flow logs that look like this:

` + "```text" + `
2 123456789010 eni-1235b8ca123456789 172.31.16.139 172.31.16.21 20641 22 6 20 4249 1418530010 1418530070 ACCEPT OK
` + "```" + `

Into structured objects that look like this:

` + "```json" + `
{"accountid":"123456789010","action":"ACCEPT","bytes":4249,"dstaddr":"172.31.16.21","dstport":22,"end":1418530070,"interfaceid":"eni-1235b8ca123456789","logstatus":"OK","packets":20,"protocol":6,"srcaddr":"172.31.16.139","srcport":20641,"start":1418530010,"version":2}
` + "```" + `

With the following config:`,
				Config: `
pipeline:
  processors:
    - grok:
        expressions:
          - '%{VPCFLOWLOG}'
        pattern_definitions:
          VPCFLOWLOG: '%{NUMBER:version:int} %{NUMBER:accountid} %{NOTSPACE:interfaceid} %{NOTSPACE:srcaddr} %{NOTSPACE:dstaddr} %{NOTSPACE:srcport:int} %{NOTSPACE:dstport:int} %{NOTSPACE:protocol:int} %{NOTSPACE:packets:int} %{NOTSPACE:bytes:int} %{NUMBER:start:int} %{NUMBER:end:int} %{NOTSPACE:action} %{NOTSPACE:logstatus}'
`,
			},
		},
		// The "Default Patterns" heading is the target of the
		// #default-patterns anchor used by the use_default_patterns field.
		Footnotes: `
## Default Patterns

A summary of the default patterns on offer can be [found here](https://github.com/Jeffail/grok/blob/master/patterns.go#L5).`,
	}
}
    82  
    83  //------------------------------------------------------------------------------
    84  
// GrokConfig contains configuration fields for the Grok processor.
//
// Fields, as consumed by NewGrok:
//   - Parts: indexes of the message parts the processor is applied to.
//   - Expressions: Grok expressions compiled at construction time; must not be
//     combined with the deprecated Patterns field.
//   - RemoveEmpty: forwarded to the grok compiler as RemoveEmptyValues.
//   - NamedOnly: forwarded to the grok compiler as NamedCapturesOnly.
//   - UseDefaults: when false the grok compiler is told to skip its default
//     patterns (SkipDefaultPatterns).
//   - To: the deprecated `output_format` field; it is not read by the
//     constructor or by ProcessMessage.
//   - PatternPaths: paths from which additional pattern definitions are loaded
//     via addGrokPatternsFromPath.
//   - PatternDefinitions: named pattern definitions handed to the grok
//     compiler.
//   - Patterns: deprecated predecessor of Expressions; compiled ahead of
//     Expressions when set.
type GrokConfig struct {
	Parts              []int             `json:"parts" yaml:"parts"`
	Expressions        []string          `json:"expressions" yaml:"expressions"`
	RemoveEmpty        bool              `json:"remove_empty_values" yaml:"remove_empty_values"`
	NamedOnly          bool              `json:"named_captures_only" yaml:"named_captures_only"`
	UseDefaults        bool              `json:"use_default_patterns" yaml:"use_default_patterns"`
	To                 string            `json:"output_format" yaml:"output_format"`
	PatternPaths       []string          `json:"pattern_paths" yaml:"pattern_paths"`
	PatternDefinitions map[string]string `json:"pattern_definitions" yaml:"pattern_definitions"`

	// TODO: V4 Remove this
	Patterns []string `json:"patterns" yaml:"patterns"`
}
    99  
   100  // NewGrokConfig returns a GrokConfig with default values.
   101  func NewGrokConfig() GrokConfig {
   102  	return GrokConfig{
   103  		Parts:              []int{},
   104  		Expressions:        []string{},
   105  		RemoveEmpty:        true,
   106  		NamedOnly:          true,
   107  		UseDefaults:        true,
   108  		To:                 "json",
   109  		PatternPaths:       []string{},
   110  		PatternDefinitions: make(map[string]string),
   111  
   112  		Patterns: []string{},
   113  	}
   114  }
   115  
   116  //------------------------------------------------------------------------------
   117  
// Grok is a processor that executes Grok queries on a message part and replaces
// the contents with the result.
//
// Instances are built by NewGrok, which compiles every configured expression
// up front; ProcessMessage then only runs the pre-compiled parsers.
type Grok struct {
	// parts lists the message part indexes handed to the part iterator;
	// gparsers holds the compiled expressions, tried in slice order.
	parts    []int
	gparsers []*grok.CompiledGrok

	// The full processor config, logger and metrics aggregator the processor
	// was constructed with.
	conf  Config
	log   log.Modular
	stats metrics.Type

	// Counters obtained from stats in NewGrok: "count", "error.grok_no_matches",
	// "error.json_set", "error", "sent" and "batch.sent" respectively.
	mCount     metrics.StatCounter
	mErrGrok   metrics.StatCounter
	mErrJSONS  metrics.StatCounter
	mErr       metrics.StatCounter
	mSent      metrics.StatCounter
	mBatchSent metrics.StatCounter
}
   135  
   136  // NewGrok returns a Grok processor.
   137  func NewGrok(
   138  	conf Config, mgr types.Manager, log log.Modular, stats metrics.Type,
   139  ) (Type, error) {
   140  	if len(conf.Grok.Expressions) > 0 && len(conf.Grok.Patterns) > 0 {
   141  		return nil, errors.New("cannot specify grok expressions in both the field `expressions` and the deprecated field `patterns`")
   142  	}
   143  
   144  	grokConf := grok.Config{
   145  		RemoveEmptyValues:   conf.Grok.RemoveEmpty,
   146  		NamedCapturesOnly:   conf.Grok.NamedOnly,
   147  		SkipDefaultPatterns: !conf.Grok.UseDefaults,
   148  		Patterns:            conf.Grok.PatternDefinitions,
   149  	}
   150  
   151  	for _, path := range conf.Grok.PatternPaths {
   152  		if err := addGrokPatternsFromPath(path, grokConf.Patterns); err != nil {
   153  			return nil, fmt.Errorf("failed to parse patterns from path '%v': %v", path, err)
   154  		}
   155  	}
   156  
   157  	gcompiler, err := grok.New(grokConf)
   158  	if err != nil {
   159  		return nil, fmt.Errorf("failed to create grok compiler: %v", err)
   160  	}
   161  
   162  	var compiled []*grok.CompiledGrok
   163  	for _, pattern := range conf.Grok.Patterns {
   164  		var gcompiled *grok.CompiledGrok
   165  		if gcompiled, err = gcompiler.Compile(pattern); err != nil {
   166  			return nil, fmt.Errorf("failed to compile Grok pattern '%v': %v", pattern, err)
   167  		}
   168  		compiled = append(compiled, gcompiled)
   169  	}
   170  	for _, pattern := range conf.Grok.Expressions {
   171  		var gcompiled *grok.CompiledGrok
   172  		if gcompiled, err = gcompiler.Compile(pattern); err != nil {
   173  			return nil, fmt.Errorf("failed to compile Grok pattern '%v': %v", pattern, err)
   174  		}
   175  		compiled = append(compiled, gcompiled)
   176  	}
   177  
   178  	g := &Grok{
   179  		parts:    conf.Grok.Parts,
   180  		gparsers: compiled,
   181  		conf:     conf,
   182  		log:      log,
   183  		stats:    stats,
   184  
   185  		mCount:     stats.GetCounter("count"),
   186  		mErrGrok:   stats.GetCounter("error.grok_no_matches"),
   187  		mErrJSONS:  stats.GetCounter("error.json_set"),
   188  		mErr:       stats.GetCounter("error"),
   189  		mSent:      stats.GetCounter("sent"),
   190  		mBatchSent: stats.GetCounter("batch.sent"),
   191  	}
   192  	return g, nil
   193  }
   194  
   195  //------------------------------------------------------------------------------
   196  
   197  func addGrokPatternsFromPath(path string, patterns map[string]string) error {
   198  	if s, err := os.Stat(path); err != nil {
   199  		return err
   200  	} else if s.IsDir() {
   201  		path += "/*"
   202  	}
   203  
   204  	files, err := filepath.Globs([]string{path})
   205  	if err != nil {
   206  		return err
   207  	}
   208  
   209  	for _, f := range files {
   210  		file, err := os.Open(f)
   211  		if err != nil {
   212  			return err
   213  		}
   214  
   215  		scanner := bufio.NewScanner(file)
   216  
   217  		for scanner.Scan() {
   218  			l := scanner.Text()
   219  			if len(l) > 0 && l[0] != '#' {
   220  				names := strings.SplitN(l, " ", 2)
   221  				patterns[names[0]] = names[1]
   222  			}
   223  		}
   224  
   225  		file.Close()
   226  	}
   227  
   228  	return nil
   229  }
   230  
   231  // ProcessMessage applies the processor to a message, either creating >0
   232  // resulting messages or a response to be sent back to the message source.
   233  func (g *Grok) ProcessMessage(msg types.Message) ([]types.Message, types.Response) {
   234  	g.mCount.Incr(1)
   235  	newMsg := msg.Copy()
   236  
   237  	proc := func(index int, span *tracing.Span, part types.Part) error {
   238  		body := part.Get()
   239  
   240  		var values map[string]interface{}
   241  		for _, compiler := range g.gparsers {
   242  			var err error
   243  			if values, err = compiler.ParseTyped(body); err != nil {
   244  				g.log.Debugf("Failed to parse body: %v\n", err)
   245  				continue
   246  			}
   247  			if len(values) > 0 {
   248  				break
   249  			}
   250  		}
   251  
   252  		if len(values) == 0 {
   253  			g.mErrGrok.Incr(1)
   254  			g.mErr.Incr(1)
   255  			g.log.Debugf("No matches found for payload: %s\n", body)
   256  			return errors.New("no pattern matches found")
   257  		}
   258  
   259  		gObj := gabs.New()
   260  		for k, v := range values {
   261  			gObj.SetP(v, k)
   262  		}
   263  
   264  		if err := newMsg.Get(index).SetJSON(gObj.Data()); err != nil {
   265  			g.mErrJSONS.Incr(1)
   266  			g.mErr.Incr(1)
   267  			g.log.Debugf("Failed to convert grok result into json: %v\n", err)
   268  			return err
   269  		}
   270  
   271  		return nil
   272  	}
   273  
   274  	IteratePartsWithSpanV2(TypeGrok, g.parts, newMsg, proc)
   275  
   276  	msgs := [1]types.Message{newMsg}
   277  
   278  	g.mBatchSent.Incr(1)
   279  	g.mSent.Incr(int64(newMsg.Len()))
   280  	return msgs[:], nil
   281  }
   282  
// CloseAsync shuts down the processor and stops processing requests.
//
// The method body is empty: calling it performs no work.
func (g *Grok) CloseAsync() {
}
   286  
// WaitForClose blocks until the processor has closed down.
//
// It returns nil immediately; the timeout argument is not consulted.
func (g *Grok) WaitForClose(timeout time.Duration) error {
	return nil
}
   291  
   292  //------------------------------------------------------------------------------