// Source: github.com/Jeffail/benthos/v3@v3.65.0/lib/processor/grok.go

package processor

import (
	"bufio"
	"errors"
	"fmt"
	"os"
	"strings"
	"time"

	"github.com/Jeffail/benthos/v3/internal/docs"
	"github.com/Jeffail/benthos/v3/internal/filepath"
	"github.com/Jeffail/benthos/v3/internal/tracing"
	"github.com/Jeffail/benthos/v3/lib/log"
	"github.com/Jeffail/benthos/v3/lib/metrics"
	"github.com/Jeffail/benthos/v3/lib/types"
	"github.com/Jeffail/gabs/v2"
	"github.com/Jeffail/grok"
)

//------------------------------------------------------------------------------

// init registers the grok processor under TypeGrok in the Constructors table,
// pairing the NewGrok constructor with the documentation spec (summary,
// description, field specs, example and footnotes) for the processor.
func init() {
	Constructors[TypeGrok] = TypeSpec{
		constructor: NewGrok,
		Categories: []Category{
			CategoryParsing,
		},
		Summary: `
Parses messages into a structured format by attempting to apply a list of Grok expressions, the first expression to result in at least one value replaces the original message with a JSON object containing the values.`,
		Description: `
Type hints within patterns are respected, therefore with the pattern ` + "`%{WORD:first},%{INT:second:int}`" + ` and a payload of ` + "`foo,1`" + ` the resulting payload would be ` + "`{\"first\":\"foo\",\"second\":1}`" + `.

### Performance

This processor currently uses the [Go RE2](https://golang.org/s/re2syntax) regular expression engine, which is guaranteed to run in time linear to the size of the input. However, this property often makes it less performant than PCRE based implementations of grok. For more information see [https://swtch.com/~rsc/regexp/regexp1.html](https://swtch.com/~rsc/regexp/regexp1.html).`,
		FieldSpecs: docs.FieldSpecs{
			docs.FieldString("expressions", "One or more Grok expressions to attempt against incoming messages. The first expression to match at least one value will be used to form a result.").Array(),
			docs.FieldString("pattern_definitions", "A map of pattern definitions that can be referenced within `patterns`.").Map(),
			docs.FieldString("pattern_paths", "A list of paths to load Grok patterns from. This field supports wildcards, including super globs (double star).").Array(),
			docs.FieldAdvanced("named_captures_only", "Whether to only capture values from named patterns."),
			docs.FieldAdvanced("use_default_patterns", "Whether to use a [default set of patterns](#default-patterns)."),
			docs.FieldAdvanced("remove_empty_values", "Whether to remove values that are empty from the resulting structure."),
			// Deprecated fields are still listed so that existing configs
			// continue to be recognised.
			docs.FieldDeprecated("patterns").Array(),
			docs.FieldDeprecated("output_format"),
			PartsFieldSpec,
		},
		Examples: []docs.AnnotatedExample{
			{
				Title: "VPC Flow Logs",
				Summary: `
Grok can be used to parse unstructured logs such as VPC flow logs that look like this:

` + "```text" + `
2 123456789010 eni-1235b8ca123456789 172.31.16.139 172.31.16.21 20641 22 6 20 4249 1418530010 1418530070 ACCEPT OK
` + "```" + `

Into structured objects that look like this:

` + "```json" + `
{"accountid":"123456789010","action":"ACCEPT","bytes":4249,"dstaddr":"172.31.16.21","dstport":22,"end":1418530070,"interfaceid":"eni-1235b8ca123456789","logstatus":"OK","packets":20,"protocol":6,"srcaddr":"172.31.16.139","srcport":20641,"start":1418530010,"version":2}
` + "```" + `

With the following config:`,
				Config: `
pipeline:
  processors:
    - grok:
        expressions:
          - '%{VPCFLOWLOG}'
        pattern_definitions:
          VPCFLOWLOG: '%{NUMBER:version:int} %{NUMBER:accountid} %{NOTSPACE:interfaceid} %{NOTSPACE:srcaddr} %{NOTSPACE:dstaddr} %{NOTSPACE:srcport:int} %{NOTSPACE:dstport:int} %{NOTSPACE:protocol:int} %{NOTSPACE:packets:int} %{NOTSPACE:bytes:int} %{NUMBER:start:int} %{NUMBER:end:int} %{NOTSPACE:action} %{NOTSPACE:logstatus}'
`,
			},
		},
		Footnotes: `
## Default Patterns

A summary of the default patterns on offer can be [found here](https://github.com/Jeffail/grok/blob/master/patterns.go#L5).`,
	}
}

//------------------------------------------------------------------------------

// GrokConfig contains configuration fields for the Grok processor.
type GrokConfig struct {
	// Parts holds the message part indexes configured for this processor.
	Parts []int `json:"parts" yaml:"parts"`
	// Expressions lists the Grok expressions to attempt against each message;
	// the first to yield at least one value forms the result.
	Expressions []string `json:"expressions" yaml:"expressions"`
	// RemoveEmpty controls whether empty values are removed from the result.
	RemoveEmpty bool `json:"remove_empty_values" yaml:"remove_empty_values"`
	// NamedOnly controls whether only named patterns are captured.
	NamedOnly bool `json:"named_captures_only" yaml:"named_captures_only"`
	// UseDefaults controls whether the default pattern set is made available.
	UseDefaults bool `json:"use_default_patterns" yaml:"use_default_patterns"`
	// To maps to the deprecated `output_format` field.
	To string `json:"output_format" yaml:"output_format"`
	// PatternPaths lists paths from which pattern definitions are loaded.
	PatternPaths []string `json:"pattern_paths" yaml:"pattern_paths"`
	// PatternDefinitions maps pattern names to their definitions.
	PatternDefinitions map[string]string `json:"pattern_definitions" yaml:"pattern_definitions"`

	// Patterns is the deprecated predecessor of Expressions; specifying both
	// is rejected by NewGrok.
	// TODO: V4 Remove this
	Patterns []string `json:"patterns" yaml:"patterns"`
}

// NewGrokConfig returns a GrokConfig with default values: empty (non-nil)
// slices and map, all three boolean options enabled, and "json" as the output
// format.
func NewGrokConfig() GrokConfig {
	return GrokConfig{
		Parts:              []int{},
		Expressions:        []string{},
		RemoveEmpty:        true,
		NamedOnly:          true,
		UseDefaults:        true,
		To:                 "json",
		PatternPaths:       []string{},
		PatternDefinitions: make(map[string]string),

		Patterns: []string{},
	}
}

//------------------------------------------------------------------------------

// Grok is a processor that executes Grok queries on a message part and replaces
// the contents with the result.
120 type Grok struct { 121 parts []int 122 gparsers []*grok.CompiledGrok 123 124 conf Config 125 log log.Modular 126 stats metrics.Type 127 128 mCount metrics.StatCounter 129 mErrGrok metrics.StatCounter 130 mErrJSONS metrics.StatCounter 131 mErr metrics.StatCounter 132 mSent metrics.StatCounter 133 mBatchSent metrics.StatCounter 134 } 135 136 // NewGrok returns a Grok processor. 137 func NewGrok( 138 conf Config, mgr types.Manager, log log.Modular, stats metrics.Type, 139 ) (Type, error) { 140 if len(conf.Grok.Expressions) > 0 && len(conf.Grok.Patterns) > 0 { 141 return nil, errors.New("cannot specify grok expressions in both the field `expressions` and the deprecated field `patterns`") 142 } 143 144 grokConf := grok.Config{ 145 RemoveEmptyValues: conf.Grok.RemoveEmpty, 146 NamedCapturesOnly: conf.Grok.NamedOnly, 147 SkipDefaultPatterns: !conf.Grok.UseDefaults, 148 Patterns: conf.Grok.PatternDefinitions, 149 } 150 151 for _, path := range conf.Grok.PatternPaths { 152 if err := addGrokPatternsFromPath(path, grokConf.Patterns); err != nil { 153 return nil, fmt.Errorf("failed to parse patterns from path '%v': %v", path, err) 154 } 155 } 156 157 gcompiler, err := grok.New(grokConf) 158 if err != nil { 159 return nil, fmt.Errorf("failed to create grok compiler: %v", err) 160 } 161 162 var compiled []*grok.CompiledGrok 163 for _, pattern := range conf.Grok.Patterns { 164 var gcompiled *grok.CompiledGrok 165 if gcompiled, err = gcompiler.Compile(pattern); err != nil { 166 return nil, fmt.Errorf("failed to compile Grok pattern '%v': %v", pattern, err) 167 } 168 compiled = append(compiled, gcompiled) 169 } 170 for _, pattern := range conf.Grok.Expressions { 171 var gcompiled *grok.CompiledGrok 172 if gcompiled, err = gcompiler.Compile(pattern); err != nil { 173 return nil, fmt.Errorf("failed to compile Grok pattern '%v': %v", pattern, err) 174 } 175 compiled = append(compiled, gcompiled) 176 } 177 178 g := &Grok{ 179 parts: conf.Grok.Parts, 180 gparsers: compiled, 181 
conf: conf, 182 log: log, 183 stats: stats, 184 185 mCount: stats.GetCounter("count"), 186 mErrGrok: stats.GetCounter("error.grok_no_matches"), 187 mErrJSONS: stats.GetCounter("error.json_set"), 188 mErr: stats.GetCounter("error"), 189 mSent: stats.GetCounter("sent"), 190 mBatchSent: stats.GetCounter("batch.sent"), 191 } 192 return g, nil 193 } 194 195 //------------------------------------------------------------------------------ 196 197 func addGrokPatternsFromPath(path string, patterns map[string]string) error { 198 if s, err := os.Stat(path); err != nil { 199 return err 200 } else if s.IsDir() { 201 path += "/*" 202 } 203 204 files, err := filepath.Globs([]string{path}) 205 if err != nil { 206 return err 207 } 208 209 for _, f := range files { 210 file, err := os.Open(f) 211 if err != nil { 212 return err 213 } 214 215 scanner := bufio.NewScanner(file) 216 217 for scanner.Scan() { 218 l := scanner.Text() 219 if len(l) > 0 && l[0] != '#' { 220 names := strings.SplitN(l, " ", 2) 221 patterns[names[0]] = names[1] 222 } 223 } 224 225 file.Close() 226 } 227 228 return nil 229 } 230 231 // ProcessMessage applies the processor to a message, either creating >0 232 // resulting messages or a response to be sent back to the message source. 
233 func (g *Grok) ProcessMessage(msg types.Message) ([]types.Message, types.Response) { 234 g.mCount.Incr(1) 235 newMsg := msg.Copy() 236 237 proc := func(index int, span *tracing.Span, part types.Part) error { 238 body := part.Get() 239 240 var values map[string]interface{} 241 for _, compiler := range g.gparsers { 242 var err error 243 if values, err = compiler.ParseTyped(body); err != nil { 244 g.log.Debugf("Failed to parse body: %v\n", err) 245 continue 246 } 247 if len(values) > 0 { 248 break 249 } 250 } 251 252 if len(values) == 0 { 253 g.mErrGrok.Incr(1) 254 g.mErr.Incr(1) 255 g.log.Debugf("No matches found for payload: %s\n", body) 256 return errors.New("no pattern matches found") 257 } 258 259 gObj := gabs.New() 260 for k, v := range values { 261 gObj.SetP(v, k) 262 } 263 264 if err := newMsg.Get(index).SetJSON(gObj.Data()); err != nil { 265 g.mErrJSONS.Incr(1) 266 g.mErr.Incr(1) 267 g.log.Debugf("Failed to convert grok result into json: %v\n", err) 268 return err 269 } 270 271 return nil 272 } 273 274 IteratePartsWithSpanV2(TypeGrok, g.parts, newMsg, proc) 275 276 msgs := [1]types.Message{newMsg} 277 278 g.mBatchSent.Incr(1) 279 g.mSent.Incr(int64(newMsg.Len())) 280 return msgs[:], nil 281 } 282 283 // CloseAsync shuts down the processor and stops processing requests. 284 func (g *Grok) CloseAsync() { 285 } 286 287 // WaitForClose blocks until the processor has closed down. 288 func (g *Grok) WaitForClose(timeout time.Duration) error { 289 return nil 290 } 291 292 //------------------------------------------------------------------------------