github.com/netdata/go.d.plugin@v0.58.1/modules/weblog/parser.go (about) 1 // SPDX-License-Identifier: GPL-3.0-or-later 2 3 package weblog 4 5 import ( 6 "errors" 7 "fmt" 8 "regexp" 9 "strings" 10 11 "github.com/netdata/go.d.plugin/pkg/logs" 12 ) 13 14 /* 15 Default apache log format: 16 - "%v:%p %h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"" vhost_combined 17 - "%h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"" combined 18 - "%h %l %u %t \"%r\" %>s %O" common 19 - "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" %I %O" Combined I/O (https://httpd.apache.org/docs/2.4/mod/mod_logio.html) 20 21 Default nginx log format: 22 - '$remote_addr - $remote_user [$time_local] ' 23 '"$request" $status $body_bytes_sent ' 24 '"$http_referer" "$http_user_agent"' combined 25 26 Netdata recommends: 27 Nginx: 28 - '$remote_addr - $remote_user [$time_local] ' 29 '"$request" $status $body_bytes_sent ' 30 '$request_length $request_time $upstream_response_time ' 31 '"$http_referer" "$http_user_agent"' 32 33 Apache: 34 - "%h %l %u %t \"%r\" %>s %B %I %D \"%{Referer}i\" \"%{User-Agent}i\"" 35 */ 36 37 var ( 38 csvCommon = ` $remote_addr - - [$time_local] "$request" $status $body_bytes_sent` 39 csvCustom1 = ` $remote_addr - - [$time_local] "$request" $status $body_bytes_sent $request_length $request_time` 40 csvCustom2 = ` $remote_addr - - [$time_local] "$request" $status $body_bytes_sent $request_length $request_time $upstream_response_time` 41 csvCustom3 = ` $remote_addr - - [$time_local] "$request" $status $body_bytes_sent - - $request_length $request_time` 42 csvCustom4 = ` $remote_addr - - [$time_local] "$request" $status $body_bytes_sent - - $request_length $request_time $upstream_response_time` 43 csvVhostCommon = `$host:$server_port $remote_addr - - [$time_local] "$request" $status $body_bytes_sent` 44 csvVhostCustom1 = `$host:$server_port $remote_addr - - [$time_local] "$request" $status $body_bytes_sent $request_length $request_time` 45 csvVhostCustom2 = `$host:$server_port $remote_addr - - [$time_local] "$request" $status $body_bytes_sent $request_length $request_time $upstream_response_time` 46 csvVhostCustom3 = `$host:$server_port $remote_addr - - [$time_local] "$request" $status $body_bytes_sent - - $request_length $request_time` 47 csvVhostCustom4 = `$host:$server_port $remote_addr - - [$time_local] "$request" $status $body_bytes_sent - - $request_length $request_time $upstream_response_time` 48 49 guessOrder = []string{ 50 csvVhostCustom4, 51 csvVhostCustom3, 52 csvVhostCustom2, 53 csvVhostCustom1, 54 csvVhostCommon, 55 csvCustom4, 56 csvCustom3, 57 csvCustom2, 58 csvCustom1, 59 csvCommon, 60 } 61 ) 62 63 func cleanCSVFormat(format string) string { return strings.Join(strings.Fields(format), " ") } 64 func cleanApacheLogFormat(format string) string { return strings.ReplaceAll(format, `\`, "") } 65 66 const ( 67 typeAuto = "auto" 68 ) 69 70 var ( 71 reLTSV = regexp.MustCompile(`^[a-zA-Z0-9]+:[^\t]*(\t[a-zA-Z0-9]+:[^\t]*)*$`) 72 reJSON = regexp.MustCompile(`^[[:space:]]*{.*}[[:space:]]*$`) 73 ) 74 75 func (w *WebLog) newParser(record []byte) (logs.Parser, error) { 76 if w.Parser.LogType == typeAuto { 77 w.Debugf("log_type is %s, will try format auto-detection", typeAuto) 78 if len(record) == 0 { 79 return nil, fmt.Errorf("empty line, can't auto-detect format (%s)", w.file.CurrentFilename()) 80 } 81 return w.guessParser(record) 82 } 83 84 w.Parser.CSV.Format = cleanApacheLogFormat(w.Parser.CSV.Format) 85 w.Debugf("log_type is %s, skipping auto-detection", w.Parser.LogType) 86 switch w.Parser.LogType { 87 case logs.TypeCSV: 88 w.Debugf("config: %+v", w.Parser.CSV) 89 case logs.TypeLTSV: 90 w.Debugf("config: %+v", w.Parser.LogType) 91 case logs.TypeRegExp: 92 w.Debugf("config: %+v", w.Parser.RegExp) 93 case logs.TypeJSON: 94 w.Debugf("config: %+v", w.Parser.JSON) 95 } 96 return logs.NewParser(w.Parser, w.file) 97 } 98 99 func (w *WebLog) guessParser(record []byte) (logs.Parser, error) { 100 w.Debug("starting log type auto-detection") 101 if reLTSV.Match(record) { 102 w.Debug("log type is LTSV") 103 return logs.NewLTSVParser(w.Parser.LTSV, w.file) 104 } 105 if reJSON.Match(record) { 106 w.Debug("log type is JSON") 107 return logs.NewJSONParser(w.Parser.JSON, w.file) 108 } 109 w.Debug("log type is CSV") 110 return w.guessCSVParser(record) 111 } 112 113 func (w *WebLog) guessCSVParser(record []byte) (logs.Parser, error) { 114 w.Debug("starting csv log format auto-detection") 115 w.Debugf("config: %+v", w.Parser.CSV) 116 for _, format := range guessOrder { 117 format = cleanCSVFormat(format) 118 cfg := w.Parser.CSV 119 cfg.Format = format 120 121 w.Debugf("trying format: '%s'", format) 122 parser, err := logs.NewCSVParser(cfg, w.file) 123 if err != nil { 124 return nil, err 125 } 126 127 line := newEmptyLogLine() 128 if err := parser.Parse(record, line); err != nil { 129 w.Debug("parse: ", err) 130 continue 131 } 132 133 if err = line.verify(); err != nil { 134 w.Debug("verify: ", err) 135 continue 136 } 137 return parser, nil 138 } 139 return nil, errors.New("cannot auto-detect log format, use custom log format") 140 } 141 142 func checkCSVFormatField(field string) (newName string, offset int, valid bool) { 143 if isTimeField(field) { 144 return "", 1, false 145 } 146 if !isFieldValid(field) { 147 return "", 0, false 148 } 149 // remove `$` and `%` to have same field names with regexp parser, 150 // these symbols aren't allowed in sub exp names 151 return field[1:], 0, true 152 } 153 154 func isTimeField(field string) bool { 155 return field == "[$time_local]" || field == "$time_local" || field == "%t" 156 } 157 158 func isFieldValid(field string) bool { 159 return len(field) > 1 && (isNginxField(field) || isApacheField(field)) 160 } 161 func isNginxField(field string) bool { 162 return strings.HasPrefix(field, "$") 163 } 164 165 func isApacheField(field string) bool { 166 return strings.HasPrefix(field, "%") 167 }