// Package stats declares the Prometheus collectors that report per-datum
// worker statistics, and provides InitPrometheus to register them and serve
// them over HTTP.
package stats

import (
	"fmt"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"github.com/sirupsen/logrus"

	"github.com/pachyderm/pachyderm/src/client/pkg/errors"
)

const (
	// PrometheusPort is the port the aggregated metrics are served on for scraping
	PrometheusPort = 9090
)

var (
	// bucketFactor and bucketCount parameterize the exponential buckets shared
	// by every histogram below: ExponentialBuckets(1.0, bucketFactor,
	// bucketCount) starts at 1 and multiplies by bucketFactor for each
	// following bucket.
	bucketFactor = 2.0
	// With a start of 1 and a factor of 2, 20 buckets have upper bounds of
	// 1, 2, 4, ..., 2^19. For the time histograms that makes the largest finite
	// bucket 2^19 seconds, or ~6 days (not 2^20 seconds / ~12 days). The size
	// histograms reuse the same boundaries, so their largest finite bucket is
	// 2^19 (524288) -- presumably bytes, matching the *_bytes_count counters;
	// confirm at the observation sites.
	bucketCount = 20

	// DatumCount is a counter tracking the number of datums processed by a
	// pipeline. It is labeled by pipeline, job and state.
	DatumCount = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "pachyderm",
			Subsystem: "worker",
			Name:      "datum_count",
			Help:      "Number of datums processed by pipeline ID and state (started|errored|finished)",
		},
		[]string{
			"pipeline",
			"job",
			"state",
		},
	)

	// DatumProcTime is a histogram tracking the time spent in user code for
	// datums processed by a pipeline. It is labeled by pipeline, job and state.
	DatumProcTime = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "pachyderm",
			Subsystem: "worker",
			Name:      "datum_proc_time",
			Help:      "Time running user code",
			Buckets:   prometheus.ExponentialBuckets(1.0, bucketFactor, bucketCount),
		},
		[]string{
			"pipeline",
			"job",
			"state", // Since both finished and errored datums can have proc times
		},
	)

	// DatumProcSecondsCount is a counter tracking the total time spent in user
	// code by a pipeline. It is labeled by pipeline and job.
	DatumProcSecondsCount = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "pachyderm",
			Subsystem: "worker",
			Name:      "datum_proc_seconds_count",
			Help:      "Cumulative number of seconds spent processing",
		},
		[]string{
			"pipeline",
			"job",
		},
	)

	// DatumDownloadTime is a histogram tracking the time spent downloading
	// input data by a pipeline. It is labeled by pipeline and job.
	DatumDownloadTime = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "pachyderm",
			Subsystem: "worker",
			Name:      "datum_download_time",
			Help:      "Time to download input data",
			Buckets:   prometheus.ExponentialBuckets(1.0, bucketFactor, bucketCount),
		},
		[]string{
			"pipeline",
			"job",
		},
	)

	// DatumDownloadSecondsCount is a counter tracking the total time spent
	// downloading input data by a pipeline. It is labeled by pipeline and job.
	DatumDownloadSecondsCount = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "pachyderm",
			Subsystem: "worker",
			Name:      "datum_download_seconds_count",
			Help:      "Cumulative number of seconds spent downloading",
		},
		[]string{
			"pipeline",
			"job",
		},
	)

	// DatumUploadTime is a histogram tracking the time spent uploading output
	// data by a pipeline. It is labeled by pipeline and job.
	DatumUploadTime = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "pachyderm",
			Subsystem: "worker",
			Name:      "datum_upload_time",
			Help:      "Time to upload output data",
			Buckets:   prometheus.ExponentialBuckets(1.0, bucketFactor, bucketCount),
		},
		[]string{
			"pipeline",
			"job",
		},
	)

	// DatumUploadSecondsCount is a counter tracking the total time spent
	// uploading output data by a pipeline. It is labeled by pipeline and job.
	DatumUploadSecondsCount = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "pachyderm",
			Subsystem: "worker",
			Name:      "datum_upload_seconds_count",
			Help:      "Cumulative number of seconds spent uploading",
		},
		[]string{
			"pipeline",
			"job",
		},
	)

	// DatumDownloadSize is a histogram tracking the size of input data
	// downloaded by a pipeline. It is labeled by pipeline and job.
	DatumDownloadSize = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "pachyderm",
			Subsystem: "worker",
			Name:      "datum_download_size",
			Help:      "Size of downloaded input data",
			Buckets:   prometheus.ExponentialBuckets(1.0, bucketFactor, bucketCount),
		},
		[]string{
			"pipeline",
			"job",
		},
	)

	// DatumDownloadBytesCount is a counter tracking the total size of input
	// data downloaded by a pipeline. It is labeled by pipeline and job.
	DatumDownloadBytesCount = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "pachyderm",
			Subsystem: "worker",
			Name:      "datum_download_bytes_count",
			Help:      "Cumulative number of bytes downloaded",
		},
		[]string{
			"pipeline",
			"job",
		},
	)

	// DatumUploadSize is a histogram tracking the size of output data uploaded
	// by a pipeline. It is labeled by pipeline and job.
	DatumUploadSize = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "pachyderm",
			Subsystem: "worker",
			Name:      "datum_upload_size",
			Help:      "Size of uploaded output data",
			Buckets:   prometheus.ExponentialBuckets(1.0, bucketFactor, bucketCount),
		},
		[]string{
			"pipeline",
			"job",
		},
	)

	// DatumUploadBytesCount is a counter tracking the total size of output data
	// uploaded by a pipeline. It is labeled by pipeline and job.
	DatumUploadBytesCount = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "pachyderm",
			Subsystem: "worker",
			Name:      "datum_upload_bytes_count",
			Help:      "Cumulative number of bytes uploaded",
		},
		[]string{
			"pipeline",
			"job",
		},
	)
)

// InitPrometheus sets up the default datum stats collectors for use by worker
// code, and exposes the stats on an http endpoint.
187 func InitPrometheus() { 188 metrics := []prometheus.Collector{ 189 DatumCount, 190 DatumProcTime, 191 DatumProcSecondsCount, 192 DatumDownloadTime, 193 DatumDownloadSecondsCount, 194 DatumUploadTime, 195 DatumUploadSecondsCount, 196 DatumDownloadSize, 197 DatumDownloadBytesCount, 198 DatumUploadSize, 199 DatumUploadBytesCount, 200 } 201 for _, metric := range metrics { 202 if err := prometheus.Register(metric); err != nil { 203 // metrics may be redundantly registered; ignore these errors 204 if !errors.As(err, &prometheus.AlreadyRegisteredError{}) { 205 logrus.Errorf("error registering prometheus metric: %v", err) 206 } 207 } 208 } 209 http.Handle("/metrics", promhttp.Handler()) 210 go func() { 211 if err := http.ListenAndServe(fmt.Sprintf(":%v", PrometheusPort), nil); err != nil { 212 logrus.Errorf("error serving prometheus metrics: %v", err) 213 } 214 }() 215 }