github.com/netdata/go.d.plugin@v0.58.1/modules/weblog/parser.go (about)

     1  // SPDX-License-Identifier: GPL-3.0-or-later
     2  
     3  package weblog
     4  
     5  import (
     6  	"errors"
     7  	"fmt"
     8  	"regexp"
     9  	"strings"
    10  
    11  	"github.com/netdata/go.d.plugin/pkg/logs"
    12  )
    13  
    14  /*
    15  Default apache log format:
    16   - "%v:%p %h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"" vhost_combined
    17   - "%h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"" combined
    18   - "%h %l %u %t \"%r\" %>s %O" common
    19   - "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" %I %O" Combined I/O (https://httpd.apache.org/docs/2.4/mod/mod_logio.html)
    20  
    21  Default nginx log format:
    22   - '$remote_addr - $remote_user [$time_local] '
    23     '"$request" $status $body_bytes_sent '
    24     '"$http_referer" "$http_user_agent"' combined
    25  
    26  Netdata recommends:
    27   Nginx:
    28    - '$remote_addr - $remote_user [$time_local] '
    29      '"$request" $status $body_bytes_sent '
    30      '$request_length $request_time $upstream_response_time '
    31      '"$http_referer" "$http_user_agent"'
    32  
    33   Apache:
    34    - "%h %l %u %t \"%r\" %>s %B %I %D \"%{Referer}i\" \"%{User-Agent}i\""
    35  */
    36  
    37  var (
    38  	csvCommon       = `                   $remote_addr - - [$time_local] "$request" $status $body_bytes_sent`
    39  	csvCustom1      = `                   $remote_addr - - [$time_local] "$request" $status $body_bytes_sent     $request_length $request_time`
    40  	csvCustom2      = `                   $remote_addr - - [$time_local] "$request" $status $body_bytes_sent     $request_length $request_time $upstream_response_time`
    41  	csvCustom3      = `                   $remote_addr - - [$time_local] "$request" $status $body_bytes_sent - - $request_length $request_time`
    42  	csvCustom4      = `                   $remote_addr - - [$time_local] "$request" $status $body_bytes_sent - - $request_length $request_time $upstream_response_time`
    43  	csvVhostCommon  = `$host:$server_port $remote_addr - - [$time_local] "$request" $status $body_bytes_sent`
    44  	csvVhostCustom1 = `$host:$server_port $remote_addr - - [$time_local] "$request" $status $body_bytes_sent     $request_length $request_time`
    45  	csvVhostCustom2 = `$host:$server_port $remote_addr - - [$time_local] "$request" $status $body_bytes_sent     $request_length $request_time $upstream_response_time`
    46  	csvVhostCustom3 = `$host:$server_port $remote_addr - - [$time_local] "$request" $status $body_bytes_sent - - $request_length $request_time`
    47  	csvVhostCustom4 = `$host:$server_port $remote_addr - - [$time_local] "$request" $status $body_bytes_sent - - $request_length $request_time $upstream_response_time`
    48  
    49  	guessOrder = []string{
    50  		csvVhostCustom4,
    51  		csvVhostCustom3,
    52  		csvVhostCustom2,
    53  		csvVhostCustom1,
    54  		csvVhostCommon,
    55  		csvCustom4,
    56  		csvCustom3,
    57  		csvCustom2,
    58  		csvCustom1,
    59  		csvCommon,
    60  	}
    61  )
    62  
    63  func cleanCSVFormat(format string) string       { return strings.Join(strings.Fields(format), " ") }
    64  func cleanApacheLogFormat(format string) string { return strings.ReplaceAll(format, `\`, "") }
    65  
    66  const (
    67  	typeAuto = "auto"
    68  )
    69  
    70  var (
    71  	reLTSV = regexp.MustCompile(`^[a-zA-Z0-9]+:[^\t]*(\t[a-zA-Z0-9]+:[^\t]*)*$`)
    72  	reJSON = regexp.MustCompile(`^[[:space:]]*{.*}[[:space:]]*$`)
    73  )
    74  
    75  func (w *WebLog) newParser(record []byte) (logs.Parser, error) {
    76  	if w.Parser.LogType == typeAuto {
    77  		w.Debugf("log_type is %s, will try format auto-detection", typeAuto)
    78  		if len(record) == 0 {
    79  			return nil, fmt.Errorf("empty line, can't auto-detect format (%s)", w.file.CurrentFilename())
    80  		}
    81  		return w.guessParser(record)
    82  	}
    83  
    84  	w.Parser.CSV.Format = cleanApacheLogFormat(w.Parser.CSV.Format)
    85  	w.Debugf("log_type is %s, skipping auto-detection", w.Parser.LogType)
    86  	switch w.Parser.LogType {
    87  	case logs.TypeCSV:
    88  		w.Debugf("config: %+v", w.Parser.CSV)
    89  	case logs.TypeLTSV:
    90  		w.Debugf("config: %+v", w.Parser.LogType)
    91  	case logs.TypeRegExp:
    92  		w.Debugf("config: %+v", w.Parser.RegExp)
    93  	case logs.TypeJSON:
    94  		w.Debugf("config: %+v", w.Parser.JSON)
    95  	}
    96  	return logs.NewParser(w.Parser, w.file)
    97  }
    98  
    99  func (w *WebLog) guessParser(record []byte) (logs.Parser, error) {
   100  	w.Debug("starting log type auto-detection")
   101  	if reLTSV.Match(record) {
   102  		w.Debug("log type is LTSV")
   103  		return logs.NewLTSVParser(w.Parser.LTSV, w.file)
   104  	}
   105  	if reJSON.Match(record) {
   106  		w.Debug("log type is JSON")
   107  		return logs.NewJSONParser(w.Parser.JSON, w.file)
   108  	}
   109  	w.Debug("log type is CSV")
   110  	return w.guessCSVParser(record)
   111  }
   112  
   113  func (w *WebLog) guessCSVParser(record []byte) (logs.Parser, error) {
   114  	w.Debug("starting csv log format auto-detection")
   115  	w.Debugf("config: %+v", w.Parser.CSV)
   116  	for _, format := range guessOrder {
   117  		format = cleanCSVFormat(format)
   118  		cfg := w.Parser.CSV
   119  		cfg.Format = format
   120  
   121  		w.Debugf("trying format: '%s'", format)
   122  		parser, err := logs.NewCSVParser(cfg, w.file)
   123  		if err != nil {
   124  			return nil, err
   125  		}
   126  
   127  		line := newEmptyLogLine()
   128  		if err := parser.Parse(record, line); err != nil {
   129  			w.Debug("parse: ", err)
   130  			continue
   131  		}
   132  
   133  		if err = line.verify(); err != nil {
   134  			w.Debug("verify: ", err)
   135  			continue
   136  		}
   137  		return parser, nil
   138  	}
   139  	return nil, errors.New("cannot auto-detect log format, use custom log format")
   140  }
   141  
   142  func checkCSVFormatField(field string) (newName string, offset int, valid bool) {
   143  	if isTimeField(field) {
   144  		return "", 1, false
   145  	}
   146  	if !isFieldValid(field) {
   147  		return "", 0, false
   148  	}
   149  	// remove `$` and `%` to have same field names with regexp parser,
   150  	// these symbols aren't allowed in sub exp names
   151  	return field[1:], 0, true
   152  }
   153  
   154  func isTimeField(field string) bool {
   155  	return field == "[$time_local]" || field == "$time_local" || field == "%t"
   156  }
   157  
   158  func isFieldValid(field string) bool {
   159  	return len(field) > 1 && (isNginxField(field) || isApacheField(field))
   160  }
   161  func isNginxField(field string) bool {
   162  	return strings.HasPrefix(field, "$")
   163  }
   164  
   165  func isApacheField(field string) bool {
   166  	return strings.HasPrefix(field, "%")
   167  }