github.com/tiagovtristao/plz@v13.4.0+incompatible/src/metrics/prometheus.go

// +build !bootstrap

// Package metrics contains support for reporting metrics to an external server,
// currently a Prometheus pushgateway. Because plz runs as a transient process
// we can't wait around for Prometheus to call us, we've got to push to them.
package metrics

import (
	"fmt"
	"os"
	"os/user"
	"runtime"
	"strings"
	"time"

	"github.com/google/shlex"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/push"
	"gopkg.in/op/go-logging.v1"

	"github.com/thought-machine/please/src/core"
)

var log = logging.MustGetLogger("metrics")

// This is the maximum number of errors after which plz will stop attempting to send metrics.
const maxErrors = 3

type metrics struct {
	url        string
	newMetrics bool
	ticker     *time.Ticker
	cancelled  bool
	perTest    bool
	errors     int
	pushes     int
	timeout    time.Duration
	buildCounter, cacheCounter, testCounter       *prometheus.CounterVec
	buildHistogram, cacheHistogram, testHistogram *prometheus.HistogramVec
	registry                                      *prometheus.Registry
}

// m is the singleton metrics instance.
var m *metrics

// buckets are the buckets we use for build histograms.
var buckets = []float64{0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0}

// InitFromConfig sets up the initial metrics from the configuration.
func InitFromConfig(config *core.Configuration) {
	if config.Metrics.PushGatewayURL != "" {
		defer func() {
			if r := recover(); r != nil {
				log.Fatalf("%s", r)
			}
		}()

		m = initMetrics(config.Metrics.PushGatewayURL.String(), time.Duration(config.Metrics.PushFrequency),
			time.Duration(config.Metrics.PushTimeout), config.CustomMetricLabels, config.Metrics.PerTest, config.Metrics.PerUser)
	}
}

// initMetrics initialises a new metrics instance.
// This is deliberately not exposed but is useful for testing.
func initMetrics(url string, frequency, timeout time.Duration, customLabels map[string]string, perTest, perUser bool) *metrics {
	constLabels := prometheus.Labels{}
	if perUser {
		u, err := user.Current()
		if err != nil {
			// We've observed os/user failing in some cases involving LDAP logins; fall back to the
			// env var if it is set.
			if username := os.Getenv("USER"); username != "" {
				u = &user.User{Username: username}
			} else {
				log.Warning("Can't determine current user name for metrics: %s", err)
				u = &user.User{Username: "unknown"}
			}
		}
		constLabels["user"] = u.Username
		constLabels["arch"] = runtime.GOOS + "_" + runtime.GOARCH
	}
	for k, v := range customLabels {
		constLabels[k] = deriveLabelValue(v)
	}

	m = &metrics{
		url:      url,
		timeout:  timeout,
		ticker:   time.NewTicker(frequency),
		perTest:  perTest,
		registry: prometheus.NewRegistry(),
	}

	// Count of builds for each target.
	m.buildCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
		Name:        "build_counts",
		Help:        "Count of number of times each target is built",
		ConstLabels: constLabels,
	}, []string{"success", "incremental"})

	// Count of cache hits for each target
	m.cacheCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
		Name:        "cache_hits",
		Help:        "Count of number of times we successfully retrieve from the cache",
		ConstLabels: constLabels,
	}, []string{"hit"})

	// Count of test runs for each target
	m.testCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
		Name:        "test_runs",
		Help:        "Count of number of times we run each test",
		ConstLabels: constLabels,
	}, addTest([]string{"pass"}, perTest))

	// Build durations for each target
	m.buildHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:        "build_durations_histogram",
		Help:        "Durations of individual build targets",
		Buckets:     buckets,
		ConstLabels: constLabels,
	}, []string{})

	// Cache retrieval durations for each target
	m.cacheHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:        "cache_durations_histogram",
		Help:        "Durations to retrieve artifacts from the cache",
		Buckets:     buckets,
		ConstLabels: constLabels,
	}, []string{})

	// Test durations for each target
	m.testHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:        "test_durations_histogram",
		Help:        "Durations to run tests",
		Buckets:     buckets,
		ConstLabels: constLabels,
	}, addTest([]string{}, perTest))

	m.registry.MustRegister(prometheus.NewProcessCollector(os.Getpid(), ""))
	m.registry.MustRegister(m.buildCounter)
	m.registry.MustRegister(m.cacheCounter)
	m.registry.MustRegister(m.testCounter)
	m.registry.MustRegister(m.buildHistogram)
	m.registry.MustRegister(m.cacheHistogram)
	m.registry.MustRegister(m.testHistogram)

	go m.keepPushing()

	return m
}

// addTest adds a per-test label to the given slice.
func addTest(s []string, perTest bool) []string {
	if perTest {
		return append(s, "test")
	}
	return s
}

// Stop shuts down the metrics and ensures the final ones are sent before returning.
func Stop() {
	if m != nil {
		m.stop()
	}
}

func (m *metrics) stop() {
	m.ticker.Stop()
	if !m.cancelled {
		m.errors = m.pushMetrics()
	}
}
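// For reference, the collectors created in initMetrics produce series of roughly this shape on
// the push gateway. This is only an illustrative sketch (the example target label is made up,
// not taken from a real build): the "test" label appears only when Metrics.PerTest is set, and
// when Metrics.PerUser is set the "user" and "arch" labels (plus any configured custom labels)
// are attached as constant labels on every series.
//
//	build_counts{success="true", incremental="false"}    1
//	cache_hits{hit="true"}                                1
//	test_runs{pass="true", test="//src/core:core_test"}   1
//	build_durations_histogram_bucket{le="2.5"}            1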
// Record records metrics for the given target.
func Record(target *core.BuildTarget, duration time.Duration) {
	if m != nil {
		m.record(target, duration)
	}
}

func (m *metrics) record(target *core.BuildTarget, duration time.Duration) {
	if len(target.Results.TestCases) > 0 {
		// Tests have run
		m.cacheCounter.WithLabelValues(b(target.Results.Cached)).Inc()
		if m.perTest {
			m.testCounter.WithLabelValues(b(target.Results.Failures() == 0), target.Label.String()).Inc()
		} else {
			m.testCounter.WithLabelValues(b(target.Results.Failures() == 0)).Inc()
		}
		if target.Results.Cached {
			m.cacheHistogram.WithLabelValues().Observe(duration.Seconds())
		} else if target.Results.Failures() == 0 {
			if m.perTest {
				m.testHistogram.WithLabelValues(target.Label.String()).Observe(duration.Seconds())
			} else {
				m.testHistogram.WithLabelValues().Observe(duration.Seconds())
			}
		}
	} else {
		// Build has run
		state := target.State()
		m.cacheCounter.WithLabelValues(b(state == core.Cached)).Inc()
		m.buildCounter.WithLabelValues(b(state != core.Failed), b(state != core.Reused)).Inc()
		if state == core.Cached {
			m.cacheHistogram.WithLabelValues().Observe(duration.Seconds())
		} else if state != core.Failed && state >= core.Built {
			m.buildHistogram.WithLabelValues().Observe(duration.Seconds())
		}
	}
	m.newMetrics = true
}

// b converts a bool into the string label value Prometheus expects.
func b(value bool) string {
	if value {
		return "true"
	}
	return "false"
}

// keepPushing pushes metrics on every tick until too many consecutive errors accumulate.
func (m *metrics) keepPushing() {
	for range m.ticker.C {
		m.errors = m.pushMetrics()
		if m.errors >= maxErrors {
			log.Warning("Metrics don't seem to be working, giving up")
			m.cancelled = true
			return
		}
	}
}

// deadline applies a deadline to an arbitrary function and returns when either the function
// completes or the deadline expires.
func deadline(f func() error, timeout time.Duration) error {
	c := make(chan error, 1) // buffered so the goroutine doesn't leak if we time out first
	go func() {
		c <- f()
	}()
	select {
	case err := <-c:
		return err
	case <-time.After(timeout):
		return fmt.Errorf("Metrics push timed out")
	}
}

// pushMetrics attempts to send some new metrics to the server. It returns the new number of errors.
func (m *metrics) pushMetrics() int {
	if !m.newMetrics {
		return m.errors
	}
	start := time.Now()
	m.newMetrics = false
	if err := deadline(func() error {
		return push.AddFromGatherer("please", push.HostnameGroupingKey(), m.url, m.registry)
	}, m.timeout); err != nil {
		log.Warning("Could not push metrics to the push gateway: %s", err)
		m.newMetrics = true
		return m.errors + 1
	}
	m.pushes++
	log.Debug("Push #%d of metrics in %0.3fs", m.pushes, time.Since(start).Seconds())
	return 0
}

// deriveLabelValue runs a command and returns its output for use as a constant label value.
// It panics on any error (which InitFromConfig recovers into a fatal log message); we assume
// it's better to fail loudly than to silently change the set of labels we report.
func deriveLabelValue(cmd string) string {
	parts, err := shlex.Split(cmd)
	if err != nil {
		panic(fmt.Sprintf("Invalid custom metric command [%s]: %s", cmd, err))
	}
	log.Debug("Running custom label command: %s", cmd)
	b, err := core.ExecCommand(parts[0], parts[1:]...).Output()
	log.Debug("Got output: %s", b)
	if err != nil {
		panic(fmt.Sprintf("Custom metric command [%s] failed: %s", cmd, err))
	}
	value := strings.TrimSpace(string(b))
	if strings.Contains(value, "\n") {
		panic(fmt.Sprintf("Return value of custom metric command [%s] contains newlines: %s", cmd, value))
	}
	return value
}
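// Typical call flow from the rest of plz, shown only as an illustrative sketch; the config,
// target and duration values below are placeholders rather than anything defined in this file:
//
//	metrics.InitFromConfig(config)    // starts the background pusher if a push gateway URL is configured
//	metrics.Record(target, duration)  // called as each target finishes building or testing
//	metrics.Stop()                    // stops the ticker and sends one final push before exit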