github.com/honeycombio/honeytail@v1.9.0/parsers/postgresql/postgresql.go (about)

     1  // Package postgresql contains code for parsing PostgreSQL slow query logs.
     2  //
     3  // The Postgres slow query format
     4  // ------------------------------
     5  //
     6  // In general, Postgres logs consist of a prefix, a level, and a message:
     7  //
     8  // 2017-11-06 19:20:32 UTC [11534-2] LOG:  autovacuum launcher shutting down
     9  // |<-------------prefix----------->|level|<----------message-------------->|
    10  //
    11  // 2017-11-07 01:43:39 UTC [3542-7] postgres@test LOG:  duration: 15.577 ms  statement: SELECT * FROM test;
    12  // |<-------------prefix------------------------>|level|<-------------------message---------------------->|
    13  //
    14  // The format of the configuration prefix is configurable as `log_line_prefix` in postgresql.conf
    15  // using the following format specifiers:
    16  //
    17  //   %a = application name
    18  //   %u = user name
    19  //   %d = database name
    20  //   %r = remote host and port
    21  //   %h = remote host
    22  //   %p = process ID
    23  //   %t = timestamp without milliseconds
//   %m = timestamp with milliseconds
//   %n = timestamp with milliseconds (as a Unix epoch)
    25  //   %i = command tag
    26  //   %e = SQL state
    27  //   %c = session ID
    28  //   %l = session line number
    29  //   %s = session start timestamp
    30  //   %v = virtual transaction ID
    31  //   %x = transaction ID (0 if none)
    32  //   %q = stop here in non-session
    33  //        processes
    34  //   %% = '%'
    35  //
    36  // For example, the prefix format for the lines above is:
    37  // %t [%p-%l] %q%u@%d
    38  // We currently require users to pass the prefix format as a parser option.
    39  //
    40  // Slow query logs specifically have the following format:
    41  // 2017-11-07 01:43:39 UTC [3542-7] postgres@test LOG:  duration: 15.577 ms  statement: SELECT * FROM test;
    42  // |<-------------prefix------------------------>|<----------header-------------------->|<-----query----->|
    43  //
    44  // For convenience, we call everything after the prefix but before the actual query string the "header".
    45  //
    46  // The query may span multiple lines; continuations are indented. For example:
    47  //
    48  // 2017-11-07 01:43:39 UTC [3542-7] postgres@test LOG:  duration: 15.577 ms  statement: SELECT * FROM test
    49  //		WHERE id=1;
    50  package postgresql
    51  
    52  import (
    53  	"fmt"
    54  	"regexp"
    55  	"strconv"
    56  	"strings"
    57  	"sync"
    58  	"time"
    59  
    60  	"github.com/honeycombio/honeytail/event"
    61  	"github.com/honeycombio/honeytail/parsers"
    62  	"github.com/honeycombio/mysqltools/query/normalizer"
    63  	"github.com/sirupsen/logrus"
    64  )
    65  
const (
	// Regex string that matches timestamps in log
	timestampRe = `\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}[.0-9]* [A-Z]+`
	// Prefix format assumed when the user doesn't supply a log_line_prefix
	// option (matches e.g. "2017-11-07 01:43:39 UTC [3542-7] postgres@test").
	defaultPrefix = "%t [%p-%l] %u@%d"
	// Regex string that matches the header of a slow query log line
	slowQueryHeader = `\s*(?P<level>[A-Z0-9]+):\s+duration: (?P<duration>[0-9\.]+) ms\s+(?:(statement)|(execute \S+)): `
	// Regexes that extract trace propagation data from a SQL comment:
	// trace_id/parent_id key-value pairs in either order, or a W3C-style
	// traceparent value ("<2 hex digits>-<trace_id>-<parent_id>-<2 hex digits>").
	traceFromTrace1      = `\/\*.*trace_id=['"](?P<trace_id>[^'"]+)['"].*parent_id=['"](?P<parent_id>[^'"]+)['"].*\*\/`
	traceFromTrace2      = `\/\*.*parent_id=['"](?P<parent_id>[^'"]+)['"].*trace_id=['"](?P<trace_id>[^'"]+)['"].*\*\/`
	traceFromTraceParent = `\/\*.*traceparent=['"]\d{2}-(?P<trace_id>[^-]+)-(?P<parent_id>[^-]+)-\d{2}['"].*\*\/`
)
    76  
// Compiled forms of the regex strings above; compiled once at package init
// and shared (read-only) by all parser instances.
var slowQueryHeaderRegex = &parsers.ExtRegexp{Regexp: regexp.MustCompile(slowQueryHeader)}
var traceFromTraceRegex1 = &parsers.ExtRegexp{Regexp: regexp.MustCompile(traceFromTrace1)}
var traceFromTraceRegex2 = &parsers.ExtRegexp{Regexp: regexp.MustCompile(traceFromTrace2)}
var traceFromTraceParentRegex = &parsers.ExtRegexp{Regexp: regexp.MustCompile(traceFromTraceParent)}
    81  
    82  // prefixField represents a specific format specifier in the log_line_prefix string
    83  // (see module comment for details).
    84  type prefixField struct {
    85  	Name    string
    86  	Pattern string
    87  }
    88  
    89  func (p *prefixField) ReString() string {
    90  	return fmt.Sprintf("(?P<%s>%s)", p.Name, p.Pattern)
    91  }
    92  
// prefixValues maps each supported log_line_prefix format specifier to the
// field name and pattern used to capture it when building the prefix regex.
var prefixValues = map[string]prefixField{
	"%a": {Name: "application", Pattern: "\\S+"},
	"%u": {Name: "user", Pattern: "\\S+"},
	"%d": {Name: "database", Pattern: "\\S+"},
	"%r": {Name: "host_port", Pattern: "\\S+"},
	"%h": {Name: "host", Pattern: "\\S+"},
	"%p": {Name: "pid", Pattern: "\\d+"},
	"%t": {Name: "timestamp", Pattern: timestampRe},
	"%m": {Name: "timestamp_millis", Pattern: timestampRe},
	// %n: Unix-epoch timestamp; parsed as milliseconds in addFieldsToEvent.
	"%n": {Name: "timestamp_unix", Pattern: "\\d+"},
	"%i": {Name: "command_tag", Pattern: "\\S+"},
	"%e": {Name: "sql_state", Pattern: "\\S+"},
	// NOTE(review): Postgres prints %c as two hex numbers joined by a dot
	// (e.g. "5a0cf7ec.2f4a"), so "\\d+" looks too strict here — TODO confirm
	// against real logs before changing.
	"%c": {Name: "session_id", Pattern: "\\d+"},
	"%l": {Name: "session_line_number", Pattern: "\\d+"},
	"%s": {Name: "session_start", Pattern: timestampRe},
	"%v": {Name: "virtual_transaction_id", Pattern: "\\S+"},
	"%x": {Name: "transaction_id", Pattern: "\\S+"},
}
   111  
// Options holds the user-configurable settings for this parser.
type Options struct {
	// LogLinePrefix is the log_line_prefix format string from postgresql.conf
	// (see the package comment for the supported specifiers). If empty, the
	// default prefix "%t [%p-%l] %u@%d" is assumed.
	LogLinePrefix string `long:"log_line_prefix" description:"Format string for PostgreSQL log line prefix"`
}
   115  
// Parser ingests PostgreSQL log lines and emits one event per slow query
// statement. It must be initialized with Init before use.
type Parser struct {
	// regex to match the log_line_prefix format specified by the user
	pgPrefixRegex *parsers.ExtRegexp
}
   120  
   121  func (p *Parser) Init(options interface{}) (err error) {
   122  	var logLinePrefixFormat string
   123  	conf, ok := options.(*Options)
   124  	if !ok || conf.LogLinePrefix == "" {
   125  		logLinePrefixFormat = defaultPrefix
   126  	} else {
   127  		logLinePrefixFormat = conf.LogLinePrefix
   128  	}
   129  	p.pgPrefixRegex, err = buildPrefixRegexp(logLinePrefixFormat)
   130  	return err
   131  }
   132  
   133  func (p *Parser) ProcessLines(lines <-chan string, send chan<- event.Event, prefixRegex *parsers.ExtRegexp) {
   134  	rawEvents := make(chan []string)
   135  	wg := &sync.WaitGroup{}
   136  	wg.Add(1)
   137  	go p.handleEvents(rawEvents, send, wg)
   138  	var groupedLines []string
   139  	for line := range lines {
   140  		if prefixRegex != nil {
   141  			// This is the "global" prefix regex as specified by the
   142  			// --log_prefix option, for stripping prefixes added by syslog or
   143  			// the like. It's unlikely that it'll actually be set by consumers
   144  			// of database logs. Don't confuse this with p.pgPrefixRegex, which
   145  			// is a compiled regex for parsing the postgres-specific line
   146  			// prefix.
   147  			var prefix = prefixRegex.FindString(line)
   148  			line = strings.TrimPrefix(line, prefix)
   149  		}
   150  		if !isContinuationLine(line) && len(groupedLines) > 0 {
   151  			// If the line we just parsed is the start of a new log statement,
   152  			// send off the previously accumulated group.
   153  			rawEvents <- groupedLines
   154  			groupedLines = make([]string, 0, 1)
   155  		}
   156  		groupedLines = append(groupedLines, line)
   157  	}
   158  
   159  	rawEvents <- groupedLines
   160  	close(rawEvents)
   161  	wg.Wait()
   162  }
   163  
   164  // handleEvents receives sets of grouped log lines, each representing a single
   165  // log statement. It attempts to parse them, and sends the events it constructs
   166  // down the send channel.
   167  func (p *Parser) handleEvents(rawEvents <-chan []string, send chan<- event.Event, wg *sync.WaitGroup) {
   168  	defer wg.Done()
   169  	// TODO: spin up a group of goroutines to do this
   170  	for rawEvent := range rawEvents {
   171  		ev := p.handleEvent(rawEvent)
   172  		if ev != nil {
   173  			send <- *ev
   174  		}
   175  	}
   176  }
   177  
   178  // handleEvent takes a single grouped log statement (an array of lines) and attempts to parse it.
   179  // It returns a pointer to an Event if successful, and nil if not.
   180  func (p *Parser) handleEvent(rawEvent []string) *event.Event {
   181  	normalizer := normalizer.Parser{}
   182  	if len(rawEvent) == 0 {
   183  		return nil
   184  	}
   185  	firstLine := rawEvent[0]
   186  
   187  	// First, try to parse the prefix
   188  	match, suffix, generalMeta := parsePrefix(p.pgPrefixRegex, firstLine)
   189  	if !match {
   190  		// Note: this may be noisy when debug logging is turned on, since the
   191  		// postgres general log contains lots of other statements as well.
   192  		logrus.WithField("line", firstLine).Debug("Log line prefix didn't match expected format")
   193  		return nil
   194  	}
   195  
   196  	ev := &event.Event{
   197  		Data: make(map[string]interface{}, 0),
   198  	}
   199  
   200  	addFieldsToEvent(generalMeta, ev)
   201  
   202  	// Now, parse the slow query header
   203  	match, query, slowQueryMeta := parsePrefix(slowQueryHeaderRegex, suffix)
   204  
   205  	if !match {
   206  		logrus.WithField("line", firstLine).Debug("didn't find slow query header, skipping line")
   207  		return nil
   208  	}
   209  
   210  	if rawDuration, ok := slowQueryMeta["duration"]; ok {
   211  		duration, _ := strconv.ParseFloat(rawDuration, 64)
   212  		ev.Data["duration"] = duration // Backwards compatibility
   213  		ev.Data["duration_ms"] = duration
   214  	} else {
   215  		logrus.WithField("query", query).Debug("Failed to find query duration in log line")
   216  	}
   217  
   218  	// Finally, concatenate the remaining text to form the query, and attempt to
   219  	// normalize it.
   220  	for _, line := range rawEvent[1:] {
   221  		query += " " + strings.TrimLeft(line, " \t")
   222  	}
   223  
   224  	// Also try to parse the trace data from a SQL comment
   225  	match, traceData := parseTraceData(query)
   226  
   227  	if match {
   228  		if traceId, ok := traceData["trace_id"]; ok {
   229  			ev.Data["trace.trace_id"] = traceId
   230  		}
   231  
   232  		if parentId, ok := traceData["parent_id"]; ok {
   233  			ev.Data["trace.parent_id"] = parentId
   234  		}
   235  	}
   236  
   237  	normalizedQuery := normalizer.NormalizeQuery(query)
   238  
   239  	ev.Data["query"] = query
   240  	ev.Data["normalized_query"] = normalizedQuery
   241  
   242  	if len(normalizer.LastTables) > 0 {
   243  		ev.Data["tables"] = strings.Join(normalizer.LastTables, " ")
   244  	}
   245  
   246  	if len(normalizer.LastComments) > 0 {
   247  		ev.Data["comments"] = "/* " + strings.Join(normalizer.LastComments, " */ /* ") + " */"
   248  	}
   249  
   250  	return ev
   251  }
   252  
   253  func parseTraceData(query string) (matched bool, fields map[string]string) {
   254  	match, _, traceData := parsePrefix(traceFromTraceRegex1, query)
   255  
   256  	if match {
   257  		return match, traceData
   258  	}
   259  
   260  	match, _, traceData = parsePrefix(traceFromTraceRegex2, query)
   261  
   262  	if match {
   263  		return match, traceData
   264  	}
   265  
   266  	match, _, traceData = parsePrefix(traceFromTraceParentRegex, query)
   267  
   268  	return match, traceData
   269  }
   270  
   271  func isContinuationLine(line string) bool {
   272  	return strings.HasPrefix(line, "\t")
   273  }
   274  
   275  // addFieldsToEvent takes a map of key-value metadata extracted from a log
   276  // line, and adds them to the given event. It'll convert values to integer
   277  // types where possible, and try to populate the event's timestamp.
   278  func addFieldsToEvent(fields map[string]string, ev *event.Event) {
   279  	for k, v := range fields {
   280  		// Try to convert values to integer types where sensible, and extract
   281  		// timestamp for event
   282  		switch k {
   283  		case "session_id", "pid", "session_line_number":
   284  			if typed, err := strconv.Atoi(v); err == nil {
   285  				ev.Data[k] = typed
   286  			} else {
   287  				ev.Data[k] = v
   288  			}
   289  		case "timestamp", "timestamp_millis":
   290  			if timestamp, err := time.Parse("2006-01-02 15:04:05.999 MST", v); err == nil {
   291  				ev.Timestamp = timestamp
   292  			} else {
   293  				logrus.WithField("timestamp", v).WithError(err).Debug("Error parsing query timestamp")
   294  			}
   295  		case "timestamp_unix":
   296  			if typed, err := strconv.Atoi(v); err == nil {
   297  				// Convert millisecond-resolution Unix timestamp to time.Time
   298  				// object
   299  				timestamp := time.Unix(int64(typed/1000), int64((1000*1000)*(typed%1000))).UTC()
   300  				ev.Timestamp = timestamp
   301  			} else {
   302  				logrus.WithField("timestamp", v).WithError(err).Debug("Error parsing query timestamp")
   303  			}
   304  		default:
   305  			ev.Data[k] = v
   306  		}
   307  	}
   308  }
   309  
   310  func parsePrefix(re *parsers.ExtRegexp, line string) (matched bool, suffix string, fields map[string]string) {
   311  	prefix, fields := re.FindStringSubmatchMap(line)
   312  	if prefix == "" {
   313  		return false, "", nil
   314  	}
   315  	return true, line[len(prefix):], fields
   316  }
   317  
   318  func buildPrefixRegexp(prefixFormat string) (*parsers.ExtRegexp, error) {
   319  	prefixFormat = strings.Replace(prefixFormat, "%%", "%", -1)
   320  	// The %q format specifier means "if this log line isn't part of a session,
   321  	// stop here." The slow query logs that we care about always come from
   322  	// sessions, so ignore this.
   323  	prefixFormat = strings.Replace(prefixFormat, "%q", "", -1)
   324  	prefixFormat = regexp.QuoteMeta(prefixFormat)
   325  	for k, v := range prefixValues {
   326  		prefixFormat = strings.Replace(prefixFormat, k, v.ReString(), -1)
   327  	}
   328  	prefixFormat = "^" + prefixFormat
   329  
   330  	re, err := regexp.Compile(prefixFormat)
   331  	if err != nil {
   332  		return nil, err
   333  	}
   334  	return &parsers.ExtRegexp{Regexp: re}, nil
   335  }