github.com/pachyderm/pachyderm@v1.13.4/src/server/pkg/storage/metrics/metrics.go (about)

     1  package metrics
     2  
     3  import (
     4  	"path"
     5  	"runtime"
     6  	"strings"
     7  	"sync"
     8  	"time"
     9  
    10  	units "github.com/docker/go-units"
    11  	"github.com/prometheus/client_golang/prometheus"
    12  )
    13  
    14  // TODO The metrics code should probably be reorganized at some point.
    15  // The current setup provides an easy way to collect metrics for both external and internal PFS/Storage APIs.
    16  
    17  type metrics struct {
    18  	requestCounter                           *prometheus.CounterVec
    19  	requestSummary, requestSummaryThroughput *prometheus.SummaryVec
    20  }
    21  
    22  var (
    23  	subsystems = make(map[string]*metrics)
    24  	mu         sync.Mutex
    25  )
    26  
    27  const (
    28  	trimPrefix = "github.com/pachyderm/pachyderm/src/"
    29  )
    30  
    31  // ReportRequest reports a request to Prometheus.
    32  // This function automatically registers a metric (if one does not already
    33  // exist) with the default register.
    34  // The calling function's package name is used as the subsystem name and the
    35  // function name is used for the operation label.
    36  // This function also labels the request as successful or not, and records
    37  // the time spent in a separate metric.
    38  func ReportRequest(f func() error, skip ...int) (retErr error) {
    39  	ci := retrieveCallInfo(skip...)
    40  	ms, err := maybeRegisterSubsystem(ci.packageName)
    41  	if err != nil {
    42  		return err
    43  	}
    44  	operation := ci.funcName
    45  	start := time.Now()
    46  	defer func() {
    47  		result := "success"
    48  		if retErr != nil {
    49  			result = retErr.Error()
    50  		}
    51  		ms.requestCounter.WithLabelValues(operation, result).Inc()
    52  		ms.requestSummary.WithLabelValues(operation).Observe(time.Since(start).Seconds())
    53  	}()
    54  	return f()
    55  }
    56  
    57  // ReportRequestWithThroughput functions the same as ReportRequest, but also
    58  // reports the throughput in a separate metric.
    59  func ReportRequestWithThroughput(f func() (int64, error)) error {
    60  	ci := retrieveCallInfo()
    61  	ms, err := maybeRegisterSubsystem(ci.packageName)
    62  	if err != nil {
    63  		return err
    64  	}
    65  	operation := ci.funcName
    66  	start := time.Now()
    67  	return ReportRequest(func() error {
    68  		bytesProcessed, err := f()
    69  		throughput := float64(bytesProcessed) / units.MB / time.Since(start).Seconds()
    70  		ms.requestSummaryThroughput.WithLabelValues(operation).Observe(throughput)
    71  		return err
    72  	}, 1)
    73  }
    74  
    75  type callInfo struct {
    76  	packageName string
    77  	fileName    string
    78  	funcName    string
    79  	line        int
    80  }
    81  
    82  func retrieveCallInfo(skip ...int) *callInfo {
    83  	skipFrames := 2
    84  	if len(skip) > 0 {
    85  		skipFrames += skip[0]
    86  	}
    87  	pc, file, line, _ := runtime.Caller(skipFrames)
    88  	_, fileName := path.Split(file)
    89  	parts := strings.Split(runtime.FuncForPC(pc).Name(), ".")
    90  	pl := len(parts)
    91  	packageName := ""
    92  	funcName := parts[pl-1]
    93  
    94  	if parts[pl-2][0] == '(' {
    95  		funcName = parts[pl-2] + "." + funcName
    96  		packageName = strings.Join(parts[0:pl-2], ".")
    97  	} else {
    98  		packageName = strings.Join(parts[0:pl-1], ".")
    99  	}
   100  
   101  	return &callInfo{
   102  		packageName: packageName,
   103  		fileName:    fileName,
   104  		funcName:    funcName,
   105  		line:        line,
   106  	}
   107  }
   108  
   109  func maybeRegisterSubsystem(packageName string) (*metrics, error) {
   110  	subsystem := strings.ReplaceAll(strings.TrimPrefix(packageName, trimPrefix), "/", "_")
   111  	mu.Lock()
   112  	defer mu.Unlock()
   113  	if ms, ok := subsystems[subsystem]; ok {
   114  		return ms, nil
   115  	}
   116  	err := register(subsystem)
   117  	return subsystems[subsystem], err
   118  }
   119  
   120  func register(subsystem string) error {
   121  	ms := &metrics{
   122  		requestCounter:           newRequestCounter(subsystem),
   123  		requestSummary:           newRequestSummary(subsystem),
   124  		requestSummaryThroughput: newRequestSummaryThroughput(subsystem),
   125  	}
   126  	for _, m := range []prometheus.Collector{
   127  		ms.requestCounter,
   128  		ms.requestSummary,
   129  		ms.requestSummaryThroughput,
   130  	} {
   131  		if err := prometheus.Register(m); err != nil {
   132  			return err
   133  		}
   134  	}
   135  	subsystems[subsystem] = ms
   136  	return nil
   137  }
   138  
   139  func newRequestCounter(subsystem string) *prometheus.CounterVec {
   140  	return prometheus.NewCounterVec(
   141  		prometheus.CounterOpts{
   142  			Namespace: "pachyderm",
   143  			Subsystem: subsystem,
   144  			Name:      "request_results",
   145  			Help:      subsystem + " operations, count by operation and result type",
   146  		},
   147  		[]string{"operation", "result"},
   148  	)
   149  }
   150  
   151  func newRequestSummary(subsystem string) *prometheus.SummaryVec {
   152  	return prometheus.NewSummaryVec(
   153  		prometheus.SummaryOpts{
   154  			Namespace: "pachyderm",
   155  			Subsystem: subsystem,
   156  			Name:      "request_time",
   157  			Help:      "time spent on " + subsystem + " operations, histogram by duration (seconds)",
   158  		},
   159  		[]string{"operation"},
   160  	)
   161  }
   162  
   163  func newRequestSummaryThroughput(subsystem string) *prometheus.SummaryVec {
   164  	return prometheus.NewSummaryVec(
   165  		prometheus.SummaryOpts{
   166  			Namespace: "pachyderm",
   167  			Subsystem: subsystem,
   168  			Name:      "request_throughput",
   169  			Help:      "throughput of " + subsystem + " operations, histogram by throughput (MB/s)",
   170  		},
   171  		[]string{"operation"},
   172  	)
   173  }