github.com/pachyderm/pachyderm@v1.13.4/src/server/worker/stats/stats.go (about)

     1  package stats
     2  
import (
	"fmt"
	"net/http"
	"sync"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"github.com/sirupsen/logrus"

	"github.com/pachyderm/pachyderm/src/client/pkg/errors"
)
    13  
    14  const (
    15  	// PrometheusPort is the port the aggregated metrics are served on for scraping
    16  	PrometheusPort = 9090
    17  )
    18  
    19  var (
    20  	bucketFactor = 2.0
    21  	bucketCount  = 20 // Which makes the max bucket 2^20 seconds or ~12 days in size
    22  
    23  	// DatumCount is a counter tracking the number of datums processed by a pipeline
    24  	DatumCount = prometheus.NewCounterVec(
    25  		prometheus.CounterOpts{
    26  			Namespace: "pachyderm",
    27  			Subsystem: "worker",
    28  			Name:      "datum_count",
    29  			Help:      "Number of datums processed by pipeline ID and state (started|errored|finished)",
    30  		},
    31  		[]string{
    32  			"pipeline",
    33  			"job",
    34  			"state",
    35  		},
    36  	)
    37  
    38  	// DatumProcTime is a histogram tracking the time spent in user code for datums processed by a pipeline
    39  	DatumProcTime = prometheus.NewHistogramVec(
    40  		prometheus.HistogramOpts{
    41  			Namespace: "pachyderm",
    42  			Subsystem: "worker",
    43  			Name:      "datum_proc_time",
    44  			Help:      "Time running user code",
    45  			Buckets:   prometheus.ExponentialBuckets(1.0, bucketFactor, bucketCount),
    46  		},
    47  		[]string{
    48  			"pipeline",
    49  			"job",
    50  			"state", // Since both finished and errored datums can have proc times
    51  		},
    52  	)
    53  
    54  	// DatumProcSecondsCount is a counter tracking the total time spent in user code by a pipeline
    55  	DatumProcSecondsCount = prometheus.NewCounterVec(
    56  		prometheus.CounterOpts{
    57  			Namespace: "pachyderm",
    58  			Subsystem: "worker",
    59  			Name:      "datum_proc_seconds_count",
    60  			Help:      "Cumulative number of seconds spent processing",
    61  		},
    62  		[]string{
    63  			"pipeline",
    64  			"job",
    65  		},
    66  	)
    67  
    68  	// DatumDownloadTime is a histogram tracking the time spent downloading input data by a pipeline
    69  	DatumDownloadTime = prometheus.NewHistogramVec(
    70  		prometheus.HistogramOpts{
    71  			Namespace: "pachyderm",
    72  			Subsystem: "worker",
    73  			Name:      "datum_download_time",
    74  			Help:      "Time to download input data",
    75  			Buckets:   prometheus.ExponentialBuckets(1.0, bucketFactor, bucketCount),
    76  		},
    77  		[]string{
    78  			"pipeline",
    79  			"job",
    80  		},
    81  	)
    82  
    83  	// DatumDownloadSecondsCount is a counter tracking the total time spent downloading input data by a pipeline
    84  	DatumDownloadSecondsCount = prometheus.NewCounterVec(
    85  		prometheus.CounterOpts{
    86  			Namespace: "pachyderm",
    87  			Subsystem: "worker",
    88  			Name:      "datum_download_seconds_count",
    89  			Help:      "Cumulative number of seconds spent downloading",
    90  		},
    91  		[]string{
    92  			"pipeline",
    93  			"job",
    94  		},
    95  	)
    96  
    97  	// DatumUploadTime is a histogram tracking the time spent uploading output data by a pipeline
    98  	DatumUploadTime = prometheus.NewHistogramVec(
    99  		prometheus.HistogramOpts{
   100  			Namespace: "pachyderm",
   101  			Subsystem: "worker",
   102  			Name:      "datum_upload_time",
   103  			Help:      "Time to upload output data",
   104  			Buckets:   prometheus.ExponentialBuckets(1.0, bucketFactor, bucketCount),
   105  		},
   106  		[]string{
   107  			"pipeline",
   108  			"job",
   109  		},
   110  	)
   111  
   112  	// DatumUploadSecondsCount is a counter tracking the total time spent uploading output data by a pipeline
   113  	DatumUploadSecondsCount = prometheus.NewCounterVec(
   114  		prometheus.CounterOpts{
   115  			Namespace: "pachyderm",
   116  			Subsystem: "worker",
   117  			Name:      "datum_upload_seconds_count",
   118  			Help:      "Cumulative number of seconds spent uploading",
   119  		},
   120  		[]string{
   121  			"pipeline",
   122  			"job",
   123  		},
   124  	)
   125  
   126  	// DatumDownloadSize is a histogram tracking the size of input data downloaded by a pipeline
   127  	DatumDownloadSize = prometheus.NewHistogramVec(
   128  		prometheus.HistogramOpts{
   129  			Namespace: "pachyderm",
   130  			Subsystem: "worker",
   131  			Name:      "datum_download_size",
   132  			Help:      "Size of downloaded input data",
   133  			Buckets:   prometheus.ExponentialBuckets(1.0, bucketFactor, bucketCount),
   134  		},
   135  		[]string{
   136  			"pipeline",
   137  			"job",
   138  		},
   139  	)
   140  
   141  	// DatumDownloadBytesCount is a counter tracking the total size of input data downloaded by a pipeline
   142  	DatumDownloadBytesCount = prometheus.NewCounterVec(
   143  		prometheus.CounterOpts{
   144  			Namespace: "pachyderm",
   145  			Subsystem: "worker",
   146  			Name:      "datum_download_bytes_count",
   147  			Help:      "Cumulative number of bytes downloaded",
   148  		},
   149  		[]string{
   150  			"pipeline",
   151  			"job",
   152  		},
   153  	)
   154  
   155  	// DatumUploadSize is a histogram tracking the size of output data uploaded by a pipeline
   156  	DatumUploadSize = prometheus.NewHistogramVec(
   157  		prometheus.HistogramOpts{
   158  			Namespace: "pachyderm",
   159  			Subsystem: "worker",
   160  			Name:      "datum_upload_size",
   161  			Help:      "Size of uploaded output data",
   162  			Buckets:   prometheus.ExponentialBuckets(1.0, bucketFactor, bucketCount),
   163  		},
   164  		[]string{
   165  			"pipeline",
   166  			"job",
   167  		},
   168  	)
   169  
   170  	// DatumUploadBytesCount is a counter tracking the total size of output data uploaded by a pipeline
   171  	DatumUploadBytesCount = prometheus.NewCounterVec(
   172  		prometheus.CounterOpts{
   173  			Namespace: "pachyderm",
   174  			Subsystem: "worker",
   175  			Name:      "datum_upload_bytes_count",
   176  			Help:      "Cumulative number of bytes uploaded",
   177  		},
   178  		[]string{
   179  			"pipeline",
   180  			"job",
   181  		},
   182  	)
   183  )
   184  
   185  // InitPrometheus sets up the default datum stats collectors for use by worker
   186  // code, and exposes the stats on an http endpoint.
   187  func InitPrometheus() {
   188  	metrics := []prometheus.Collector{
   189  		DatumCount,
   190  		DatumProcTime,
   191  		DatumProcSecondsCount,
   192  		DatumDownloadTime,
   193  		DatumDownloadSecondsCount,
   194  		DatumUploadTime,
   195  		DatumUploadSecondsCount,
   196  		DatumDownloadSize,
   197  		DatumDownloadBytesCount,
   198  		DatumUploadSize,
   199  		DatumUploadBytesCount,
   200  	}
   201  	for _, metric := range metrics {
   202  		if err := prometheus.Register(metric); err != nil {
   203  			// metrics may be redundantly registered; ignore these errors
   204  			if !errors.As(err, &prometheus.AlreadyRegisteredError{}) {
   205  				logrus.Errorf("error registering prometheus metric: %v", err)
   206  			}
   207  		}
   208  	}
   209  	http.Handle("/metrics", promhttp.Handler())
   210  	go func() {
   211  		if err := http.ListenAndServe(fmt.Sprintf(":%v", PrometheusPort), nil); err != nil {
   212  			logrus.Errorf("error serving prometheus metrics: %v", err)
   213  		}
   214  	}()
   215  }