github.com/tiagovtristao/plz@v13.4.0+incompatible/src/metrics/prometheus.go

github.com/tiagovtristao/plz@v13.4.0+incompatible/src/metrics/prometheus.go (about)

     1  // +build !bootstrap
     2  
     3  // Package metrics contains support for reporting metrics to an external server,
     4  // currently a Prometheus pushgateway. Because plz runs as a transient process
     5  // we can't wait around for Prometheus to call us, we've got to push to them.
     6  package metrics
     7  
     8  import (
     9  	"fmt"
    10  	"os"
    11  	"os/user"
    12  	"runtime"
    13  	"strings"
    14  	"time"
    15  
    16  	"github.com/google/shlex"
    17  	"github.com/prometheus/client_golang/prometheus"
    18  	"github.com/prometheus/client_golang/prometheus/push"
    19  	"gopkg.in/op/go-logging.v1"
    20  
    21  	"github.com/thought-machine/please/src/core"
    22  )
    23  
// log is the logger used throughout this package; it is registered under the name "metrics".
var log = logging.MustGetLogger("metrics")
    25  
// maxErrors is the number of consecutive failed pushes after which plz will stop
// attempting to send metrics.
const maxErrors = 3
    28  
    29  type metrics struct {
    30  	url                                           string
    31  	newMetrics                                    bool
    32  	ticker                                        *time.Ticker
    33  	cancelled                                     bool
    34  	perTest                                       bool
    35  	errors                                        int
    36  	pushes                                        int
    37  	timeout                                       time.Duration
    38  	buildCounter, cacheCounter, testCounter       *prometheus.CounterVec
    39  	buildHistogram, cacheHistogram, testHistogram *prometheus.HistogramVec
    40  	registry                                      *prometheus.Registry
    41  }
    42  
// m is the singleton metrics instance.
// It stays nil unless InitFromConfig finds a push gateway URL in the configuration;
// Record and Stop do nothing while it is nil.
var m *metrics
    45  
// buckets are the bucket boundaries shared by the build, cache and test duration histograms.
// Durations are observed in seconds, so these range from 0.1s up to 1000s.
var buckets = []float64{0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0}
    48  
    49  // InitFromConfig sets up the initial metrics from the configuration.
    50  func InitFromConfig(config *core.Configuration) {
    51  	if config.Metrics.PushGatewayURL != "" {
    52  		defer func() {
    53  			if r := recover(); r != nil {
    54  				log.Fatalf("%s", r)
    55  			}
    56  		}()
    57  
    58  		m = initMetrics(config.Metrics.PushGatewayURL.String(), time.Duration(config.Metrics.PushFrequency),
    59  			time.Duration(config.Metrics.PushTimeout), config.CustomMetricLabels, config.Metrics.PerTest, config.Metrics.PerUser)
    60  	}
    61  }
    62  
    63  // initMetrics initialises a new metrics instance.
    64  // This is deliberately not exposed but is useful for testing.
    65  func initMetrics(url string, frequency, timeout time.Duration, customLabels map[string]string, perTest, perUser bool) *metrics {
    66  	constLabels := prometheus.Labels{}
    67  	if perUser {
    68  		u, err := user.Current()
    69  		if err != nil {
    70  			// we've observed os/user failing in some cases involving LDAP logins; fall back to the
    71  			// env var if it is set.
    72  			if username := os.Getenv("USER"); username != "" {
    73  				u = &user.User{Username: username}
    74  			} else {
    75  				log.Warning("Can't determine current user name for metrics: %s", err)
    76  				u = &user.User{Username: "unknown"}
    77  			}
    78  		}
    79  		constLabels["user"] = u.Username
    80  		constLabels["arch"] = runtime.GOOS + "_" + runtime.GOARCH
    81  	}
    82  	for k, v := range customLabels {
    83  		constLabels[k] = deriveLabelValue(v)
    84  	}
    85  
    86  	m = &metrics{
    87  		url:      url,
    88  		timeout:  timeout,
    89  		ticker:   time.NewTicker(frequency),
    90  		perTest:  perTest,
    91  		registry: prometheus.NewRegistry(),
    92  	}
    93  
    94  	// Count of builds for each target.
    95  	m.buildCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
    96  		Name:        "build_counts",
    97  		Help:        "Count of number of times each target is built",
    98  		ConstLabels: constLabels,
    99  	}, []string{"success", "incremental"})
   100  
   101  	// Count of cache hits for each target
   102  	m.cacheCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
   103  		Name:        "cache_hits",
   104  		Help:        "Count of number of times we successfully retrieve from the cache",
   105  		ConstLabels: constLabels,
   106  	}, []string{"hit"})
   107  
   108  	// Count of test runs for each target
   109  	m.testCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
   110  		Name:        "test_runs",
   111  		Help:        "Count of number of times we run each test",
   112  		ConstLabels: constLabels,
   113  	}, addTest([]string{"pass"}, perTest))
   114  
   115  	// Build durations for each target
   116  	m.buildHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
   117  		Name:        "build_durations_histogram",
   118  		Help:        "Durations of individual build targets",
   119  		Buckets:     buckets,
   120  		ConstLabels: constLabels,
   121  	}, []string{})
   122  
   123  	// Cache retrieval durations for each target
   124  	m.cacheHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
   125  		Name:        "cache_durations_histogram",
   126  		Help:        "Durations to retrieve artifacts from the cache",
   127  		Buckets:     buckets,
   128  		ConstLabels: constLabels,
   129  	}, []string{})
   130  
   131  	// Test durations for each target
   132  	m.testHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
   133  		Name:        "test_durations_histogram",
   134  		Help:        "Durations to run tests",
   135  		Buckets:     buckets,
   136  		ConstLabels: constLabels,
   137  	}, addTest([]string{}, perTest))
   138  
   139  	m.registry.MustRegister(prometheus.NewProcessCollector(os.Getpid(), ""))
   140  	m.registry.MustRegister(m.buildCounter)
   141  	m.registry.MustRegister(m.cacheCounter)
   142  	m.registry.MustRegister(m.testCounter)
   143  	m.registry.MustRegister(m.buildHistogram)
   144  	m.registry.MustRegister(m.cacheHistogram)
   145  	m.registry.MustRegister(m.testHistogram)
   146  
   147  	go m.keepPushing()
   148  
   149  	return m
   150  }
   151  
   152  // addTest adds a per-test label to the given slice.
   153  func addTest(s []string, perTest bool) []string {
   154  	if perTest {
   155  		return append(s, "test")
   156  	}
   157  	return s
   158  }
   159  
   160  // Stop shuts down the metrics and ensures the final ones are sent before returning.
   161  func Stop() {
   162  	if m != nil {
   163  		m.stop()
   164  	}
   165  }
   166  
   167  func (m *metrics) stop() {
   168  	m.ticker.Stop()
   169  	if !m.cancelled {
   170  		m.errors = m.pushMetrics()
   171  	}
   172  }
   173  
   174  // Record records metrics for the given target.
   175  func Record(target *core.BuildTarget, duration time.Duration) {
   176  	if m != nil {
   177  		m.record(target, duration)
   178  	}
   179  }
   180  
   181  func (m *metrics) record(target *core.BuildTarget, duration time.Duration) {
   182  	if len(target.Results.TestCases) > 0 {
   183  		// Tests have run
   184  		m.cacheCounter.WithLabelValues(b(target.Results.Cached)).Inc()
   185  		if m.perTest {
   186  			m.testCounter.WithLabelValues(b(target.Results.Failures() == 0), target.Label.String()).Inc()
   187  		} else {
   188  			m.testCounter.WithLabelValues(b(target.Results.Failures() == 0)).Inc()
   189  		}
   190  		if target.Results.Cached {
   191  			m.cacheHistogram.WithLabelValues().Observe(duration.Seconds())
   192  		} else if target.Results.Failures() == 0 {
   193  			if m.perTest {
   194  				m.testHistogram.WithLabelValues(target.Label.String()).Observe(duration.Seconds())
   195  			} else {
   196  				m.testHistogram.WithLabelValues().Observe(duration.Seconds())
   197  			}
   198  		}
   199  	} else {
   200  		// Build has run
   201  		state := target.State()
   202  		m.cacheCounter.WithLabelValues(b(state == core.Cached)).Inc()
   203  		m.buildCounter.WithLabelValues(b(state != core.Failed), b(state != core.Reused)).Inc()
   204  		if state == core.Cached {
   205  			m.cacheHistogram.WithLabelValues().Observe(duration.Seconds())
   206  		} else if state != core.Failed && state >= core.Built {
   207  			m.buildHistogram.WithLabelValues().Observe(duration.Seconds())
   208  		}
   209  	}
   210  	m.newMetrics = true
   211  }
   212  
   213  func b(value bool) string {
   214  	if value {
   215  		return "true"
   216  	}
   217  	return "false"
   218  }
   219  
   220  func (m *metrics) keepPushing() {
   221  	for range m.ticker.C {
   222  		m.errors = m.pushMetrics()
   223  		if m.errors >= maxErrors {
   224  			log.Warning("Metrics don't seem to be working, giving up")
   225  			m.cancelled = true
   226  			return
   227  		}
   228  	}
   229  }
   230  
   231  // deadline applies a deadline to an arbitrary function and returns when either the function
   232  // completes or the deadline expires.
   233  func deadline(f func() error, timeout time.Duration) error {
   234  	c := make(chan error)
   235  	go func() {
   236  		c <- f()
   237  	}()
   238  	select {
   239  	case err := <-c:
   240  		return err
   241  	case <-time.After(timeout):
   242  		return fmt.Errorf("Metrics push timed out")
   243  	}
   244  }
   245  
   246  // pushMetrics attempts to send some new metrics to the server. It returns the new number of errors.
   247  func (m *metrics) pushMetrics() int {
   248  	if !m.newMetrics {
   249  		return m.errors
   250  	}
   251  	start := time.Now()
   252  	m.newMetrics = false
   253  	if err := deadline(func() error {
   254  		return push.AddFromGatherer("please", push.HostnameGroupingKey(), m.url, m.registry)
   255  	}, m.timeout); err != nil {
   256  		log.Warning("Could not push metrics to the repository: %s", err)
   257  		m.newMetrics = true
   258  		return m.errors + 1
   259  	}
   260  	m.pushes++
   261  	log.Debug("Push #%d of metrics in %0.3fs", m.pushes, time.Since(start).Seconds())
   262  	return 0
   263  }
   264  
   265  // deriveLabelValue runs a command and returns its output.
   266  // It returns the empty string on error; we assume it's better to keep the set of labels constant on failure.
   267  func deriveLabelValue(cmd string) string {
   268  	parts, err := shlex.Split(cmd)
   269  	if err != nil {
   270  		panic(fmt.Sprintf("Invalid custom metric command [%s]: %s", cmd, err))
   271  	}
   272  	log.Debug("Running custom label command: %s", cmd)
   273  	b, err := core.ExecCommand(parts[0], parts[1:]...).Output()
   274  	log.Debug("Got output: %s", b)
   275  	if err != nil {
   276  		panic(fmt.Sprintf("Custom metric command [%s] failed: %s", cmd, err))
   277  	}
   278  	value := strings.TrimSpace(string(b))
   279  	if strings.Contains(value, "\n") {
   280  		panic(fmt.Sprintf("Return value of custom metric command [%s] contains spaces: %s", cmd, value))
   281  	}
   282  	return value
   283  }