github.com/pachyderm/pachyderm@v1.13.4/src/server/pkg/log/log.go (about)

     1  package log
     2  
     3  import (
     4  	"encoding/json"
     5  	"fmt"
     6  	"runtime"
     7  	"strings"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/fatih/camelcase"
    12  	"github.com/pachyderm/pachyderm/src/client/pkg/errors"
    13  	"github.com/prometheus/client_golang/prometheus"
    14  	"github.com/sirupsen/logrus"
    15  )
    16  
    17  const (
    18  	bucketFactor = 2.0
    19  	bucketCount  = 20 // Which makes the max bucket 2^20 seconds or ~12 days in size
    20  )
    21  
    22  // This needs to be a global var, not a field on the logger, because multiple servers
    23  // create new loggers, and the prometheus registration uses a global namespace
    24  var reportMetricGauge prometheus.Gauge
    25  var reportMetricsOnce sync.Once
    26  
    27  // Logger is a helper for emitting our grpc API logs
    28  type Logger interface {
    29  	Log(request interface{}, response interface{}, err error, duration time.Duration)
    30  	LogAtLevelFromDepth(request interface{}, response interface{}, err error, duration time.Duration, level logrus.Level, depth int)
    31  }
    32  
    33  type logger struct {
    34  	*logrus.Entry
    35  	histogram   map[string]*prometheus.HistogramVec
    36  	counter     map[string]prometheus.Counter
    37  	mutex       *sync.Mutex // synchronizes access to both histogram and counter maps
    38  	exportStats bool
    39  	service     string
    40  }
    41  
    42  // NewLogger creates a new logger
    43  func NewLogger(service string) Logger {
    44  	return newLogger(service, true)
    45  }
    46  
    47  // NewLocalLogger creates a new logger for local testing (which does not report prometheus metrics)
    48  func NewLocalLogger(service string) Logger {
    49  	return newLogger(service, false)
    50  }
    51  
    52  func newLogger(service string, exportStats bool) Logger {
    53  	l := logrus.StandardLogger()
    54  	l.Formatter = FormatterFunc(Pretty)
    55  	newLogger := &logger{
    56  		l.WithFields(logrus.Fields{"service": service}),
    57  		make(map[string]*prometheus.HistogramVec),
    58  		make(map[string]prometheus.Counter),
    59  		&sync.Mutex{},
    60  		exportStats,
    61  		service,
    62  	}
    63  	if exportStats {
    64  		reportMetricsOnce.Do(func() {
    65  			newReportMetricGauge := prometheus.NewGauge(
    66  				prometheus.GaugeOpts{
    67  					Namespace: "pachyderm",
    68  					Subsystem: "pachd",
    69  					Name:      "report_metric",
    70  					Help:      "gauge of number of calls to ReportMetric()",
    71  				},
    72  			)
    73  			if err := prometheus.Register(newReportMetricGauge); err != nil {
    74  				// metrics may be redundantly registered; ignore these errors
    75  				if !errors.As(err, &prometheus.AlreadyRegisteredError{}) {
    76  					entry := newLogger.WithFields(logrus.Fields{"method": "NewLogger"})
    77  					newLogger.LogAtLevel(entry, logrus.WarnLevel, fmt.Sprintf("error registering prometheus metric: %v", newReportMetricGauge), err)
    78  				}
    79  			} else {
    80  				reportMetricGauge = newReportMetricGauge
    81  			}
    82  		})
    83  	}
    84  	return newLogger
    85  }
    86  
    87  // Helper function used to log requests and responses from our GRPC method
    88  // implementations
    89  func (l *logger) Log(request interface{}, response interface{}, err error, duration time.Duration) {
    90  	if err != nil {
    91  		l.LogAtLevelFromDepth(request, response, err, duration, logrus.ErrorLevel, 4)
    92  	} else {
    93  		l.LogAtLevelFromDepth(request, response, err, duration, logrus.InfoLevel, 4)
    94  	}
    95  	// We have to grab the method's name here before we
    96  	// enter the goro's stack
    97  	go l.ReportMetric(getMethodName(), duration, err)
    98  }
    99  
   100  func getMethodName() string {
   101  	depth := 4
   102  	pc := make([]uintptr, depth)
   103  	runtime.Callers(depth, pc)
   104  	split := strings.Split(runtime.FuncForPC(pc[0]).Name(), ".")
   105  	return split[len(split)-1]
   106  }
   107  
   108  func (l *logger) ReportMetric(method string, duration time.Duration, err error) {
   109  	if !l.exportStats {
   110  		return
   111  	}
   112  	// Count the number of ReportMetric() goros in case we start to leak them
   113  	if reportMetricGauge != nil {
   114  		reportMetricGauge.Inc()
   115  	}
   116  	defer func() {
   117  		if reportMetricGauge != nil {
   118  			reportMetricGauge.Dec()
   119  		}
   120  	}()
   121  	l.mutex.Lock() // for conccurent map access (histogram,counter)
   122  	defer l.mutex.Unlock()
   123  	state := "started"
   124  	if err != nil {
   125  		state = "errored"
   126  	} else {
   127  		if duration.Seconds() > 0 {
   128  			state = "finished"
   129  		}
   130  	}
   131  	entry := l.WithFields(logrus.Fields{"method": method})
   132  
   133  	var tokens []string
   134  	for _, token := range camelcase.Split(method) {
   135  		tokens = append(tokens, strings.ToLower(token))
   136  	}
   137  	rootStatName := strings.Join(tokens, "_")
   138  
   139  	// Recording the distribution of started times is meaningless
   140  	if state != "started" {
   141  		runTimeName := fmt.Sprintf("%v_time", rootStatName)
   142  		runTime, ok := l.histogram[runTimeName]
   143  		if !ok {
   144  			runTime = prometheus.NewHistogramVec(
   145  				prometheus.HistogramOpts{
   146  					Namespace: "pachyderm",
   147  					Subsystem: fmt.Sprintf("pachd_%v", topLevelService(l.service)),
   148  					Name:      runTimeName,
   149  					Help:      fmt.Sprintf("Run time of %v", method),
   150  					Buckets:   prometheus.ExponentialBuckets(1.0, bucketFactor, bucketCount),
   151  				},
   152  				[]string{
   153  					"state", // Since both finished and errored API calls can have run times
   154  				},
   155  			)
   156  			if err := prometheus.Register(runTime); err != nil {
   157  				// metrics may be redundantly registered; ignore these errors
   158  				if !errors.As(err, &prometheus.AlreadyRegisteredError{}) {
   159  					l.LogAtLevel(entry, logrus.WarnLevel, fmt.Sprintf("error registering prometheus metric %v: %v", runTime, runTimeName), err)
   160  				}
   161  			} else {
   162  				l.histogram[runTimeName] = runTime
   163  			}
   164  		}
   165  		if hist, err := runTime.GetMetricWithLabelValues(state); err != nil {
   166  			l.LogAtLevel(entry, logrus.WarnLevel, "failed to get histogram w labels: state (%v) with error %v", state, err)
   167  		} else {
   168  			hist.Observe(duration.Seconds())
   169  		}
   170  	}
   171  
   172  	secondsCountName := fmt.Sprintf("%v_seconds_count", rootStatName)
   173  	secondsCount, ok := l.counter[secondsCountName]
   174  	if !ok {
   175  		secondsCount = prometheus.NewCounter(
   176  			prometheus.CounterOpts{
   177  				Namespace: "pachyderm",
   178  				Subsystem: fmt.Sprintf("pachd_%v", topLevelService(l.service)),
   179  				Name:      secondsCountName,
   180  				Help:      fmt.Sprintf("cumulative number of seconds spent in %v", method),
   181  			},
   182  		)
   183  		if err := prometheus.Register(secondsCount); err != nil {
   184  			// metrics may be redundantly registered; ignore these errors
   185  			if !errors.As(err, &prometheus.AlreadyRegisteredError{}) {
   186  				l.LogAtLevel(entry, logrus.WarnLevel, fmt.Sprintf("error registering prometheus metric %v: %v", secondsCount, secondsCountName), err)
   187  			}
   188  		} else {
   189  			l.counter[secondsCountName] = secondsCount
   190  		}
   191  	}
   192  	secondsCount.Add(duration.Seconds())
   193  
   194  }
   195  
   196  func (l *logger) LogAtLevel(entry *logrus.Entry, level logrus.Level, args ...interface{}) {
   197  	entry.Log(level, args)
   198  }
   199  
   200  func (l *logger) LogAtLevelFromDepth(request interface{}, response interface{}, err error, duration time.Duration, level logrus.Level, depth int) {
   201  	// We're only interested in 1 stack frame, however due to weirdness with
   202  	// inlining sometimes you need to get more than 1 caller so that
   203  	// CallersFrames can resolve the first function. 2 seems to be enough be
   204  	// we've set it to 5 to insulate us more, because this at one point broke
   205  	// due to some compile optimization changes.
   206  	pc := make([]uintptr, 5)
   207  	runtime.Callers(depth, pc)
   208  	frames := runtime.CallersFrames(pc)
   209  	frame, ok := frames.Next()
   210  	fields := logrus.Fields{}
   211  	if ok {
   212  		split := strings.Split(frame.Function, ".")
   213  		method := split[len(split)-1]
   214  		fields["method"] = method
   215  	} else {
   216  		fields["warn"] = "failed to resolve method"
   217  	}
   218  	fields["request"] = request
   219  	if response != nil {
   220  		fields["response"] = response
   221  	}
   222  	if err != nil {
   223  		// "err" itself might be a code or even an empty struct
   224  		fields["error"] = err.Error()
   225  		var frames []string
   226  		errors.ForEachStackFrame(err, func(frame errors.Frame) {
   227  			frames = append(frames, fmt.Sprintf("%+v", frame))
   228  		})
   229  		fields["stack"] = frames
   230  	}
   231  	if duration > 0 {
   232  		fields["duration"] = duration
   233  	}
   234  	l.LogAtLevel(l.WithFields(fields), level)
   235  }
   236  
   237  func topLevelService(fullyQualifiedService string) string {
   238  	tokens := strings.Split(fullyQualifiedService, ".")
   239  	return tokens[0]
   240  }
   241  
   242  // FormatterFunc is a type alias for a function that satisfies logrus'
   243  // `Formatter` interface
   244  type FormatterFunc func(entry *logrus.Entry) ([]byte, error)
   245  
   246  // Format proxies the closure in order to satisfy `logrus.Formatter`'s
   247  // interface.
   248  func (f FormatterFunc) Format(entry *logrus.Entry) ([]byte, error) {
   249  	return f(entry)
   250  }
   251  
   252  // Pretty formats a logrus entry like so:
   253  // ```
   254  // 2019-02-11T16:02:02Z INFO pfs.API.InspectRepo {"request":{"repo":{"name":"images"}}} []
   255  // ```
   256  func Pretty(entry *logrus.Entry) ([]byte, error) {
   257  	serialized := []byte(
   258  		fmt.Sprintf(
   259  			"%v %v ",
   260  			entry.Time.Format(time.RFC3339),
   261  			strings.ToUpper(entry.Level.String()),
   262  		),
   263  	)
   264  	if entry.Data["service"] != nil && entry.Data["method"] != nil {
   265  		serialized = append(serialized, []byte(fmt.Sprintf("%v.%v ", entry.Data["service"], entry.Data["method"]))...)
   266  		delete(entry.Data, "service")
   267  		delete(entry.Data, "method")
   268  	}
   269  	if len(entry.Data) > 0 {
   270  		if entry.Data["duration"] != nil {
   271  			entry.Data["duration"] = entry.Data["duration"].(time.Duration).Seconds()
   272  		}
   273  		data, err := json.Marshal(entry.Data)
   274  		if err != nil {
   275  			return nil, errors.Wrapf(err, "failed to marshal fields to JSON")
   276  		}
   277  		serialized = append(serialized, data...)
   278  		serialized = append(serialized, ' ')
   279  	}
   280  
   281  	serialized = append(serialized, []byte(entry.Message)...)
   282  	serialized = append(serialized, '\n')
   283  	return serialized, nil
   284  }
   285  
   286  // GRPCLogWriter proxies gRPC and etcd-produced log messages to a logrus
   287  // logger. Because it implements `io.Writer`, it could be used anywhere where
   288  // `io.Writer`s are used, but it has some logic specifically designed to
   289  // handle gRPC-formatted logs.
   290  type GRPCLogWriter struct {
   291  	logger *logrus.Logger
   292  	source string
   293  }
   294  
   295  // NewGRPCLogWriter creates a new GRPC log writer. `logger` specifies the
   296  // underlying logger, and `source` specifies where these logs are coming from;
   297  // it is added as a entry field for all log messages.
   298  func NewGRPCLogWriter(logger *logrus.Logger, source string) *GRPCLogWriter {
   299  	return &GRPCLogWriter{
   300  		logger: logger,
   301  		source: source,
   302  	}
   303  }
   304  
   305  // Write allows `GRPCInfoWriter` to implement the `io.Writer` interface. This
   306  // will take gRPC logs, which look something like this:
   307  // ```
   308  // INFO: 2019/02/18 12:21:54 ClientConn switching balancer to "pick_first"
   309  // ```
   310  // strip out redundant content, and print the message at the appropriate log
   311  // level in logrus. Any parse errors of the log message will be reported in
   312  // logrus as well.
   313  func (l *GRPCLogWriter) Write(p []byte) (int, error) {
   314  	parts := strings.SplitN(string(p), " ", 4)
   315  	entry := l.logger.WithField("source", l.source)
   316  
   317  	if len(parts) == 4 {
   318  		// parts[1] and parts[2] contain the date and time, but logrus already
   319  		// adds this under the `time` entry field, so it's not needed (though
   320  		// the time will presumably be marginally ahead of the original log
   321  		// message)
   322  		level := parts[0]
   323  		message := strings.TrimSpace(parts[3])
   324  
   325  		if level == "INFO:" {
   326  			entry.Info(message)
   327  		} else if level == "ERROR:" {
   328  			entry.Error(message)
   329  		} else if level == "WARNING:" {
   330  			entry.Warning(message)
   331  		} else if level == "FATAL:" {
   332  			// no need to call fatal ourselves because gRPC will exit the
   333  			// process
   334  			entry.Error(message)
   335  		} else {
   336  			entry.Error(message)
   337  			entry.Error("entry had unknown log level prefix: '%s'; this is a bug, please report it along with the previous log entry", level)
   338  		}
   339  	} else {
   340  		// can't format the message -- just display the contents
   341  		entry := l.logger.WithFields(logrus.Fields{
   342  			"source": l.source,
   343  		})
   344  		entry.Error(p)
   345  		entry.Error("entry had unexpected format; this is a bug, please report it along with the previous log entry")
   346  	}
   347  
   348  	return len(p), nil
   349  }