zotregistry.dev/zot@v1.4.4-0.20240314164342-eec277e14d20/pkg/extensions/monitoring/minimal.go (about)

     1  //go:build !metrics
     2  // +build !metrics
     3  
     4  //nolint:varnamelen,forcetypeassert
     5  package monitoring
     6  
     7  import (
     8  	"fmt"
     9  	"math"
    10  	"path"
    11  	"strconv"
    12  	"sync"
    13  	"time"
    14  
    15  	"zotregistry.dev/zot/pkg/log"
    16  )
    17  
    18  const (
    19  	metricsNamespace = "zot"
    20  	// Counters.
    21  	httpConnRequests    = metricsNamespace + ".http.requests"
    22  	repoDownloads       = metricsNamespace + ".repo.downloads"
    23  	repoUploads         = metricsNamespace + ".repo.uploads"
    24  	schedulerGenerators = metricsNamespace + ".scheduler.generators"
    25  	// Gauge.
    26  	repoStorageBytes          = metricsNamespace + ".repo.storage.bytes"
    27  	serverInfo                = metricsNamespace + ".info"
    28  	schedulerNumWorkers       = metricsNamespace + ".scheduler.workers.total"
    29  	schedulerWorkers          = metricsNamespace + ".scheduler.workers"
    30  	schedulerGeneratorsStatus = metricsNamespace + ".scheduler.generators.status"
    31  	schedulerTasksQueue       = metricsNamespace + ".scheduler.tasksqueue.length"
    32  	// Summary.
    33  	httpRepoLatencySeconds = metricsNamespace + ".http.repo.latency.seconds"
    34  	// Histogram.
    35  	httpMethodLatencySeconds  = metricsNamespace + ".http.method.latency.seconds"
    36  	storageLockLatencySeconds = metricsNamespace + ".storage.lock.latency.seconds"
    37  	workersTasksDuration      = metricsNamespace + ".scheduler.workers.tasks.duration.seconds"
    38  
    39  	metricsScrapeTimeout       = 2 * time.Minute
    40  	metricsScrapeCheckInterval = 30 * time.Second
    41  )
    42  
    43  type metricServer struct {
    44  	enabled    bool
    45  	lastCheck  time.Time
    46  	reqChan    chan interface{}
    47  	cache      *MetricsInfo
    48  	cacheChan  chan MetricsCopy
    49  	bucketsF2S map[float64]string // float64 to string conversion of buckets label
    50  	log        log.Logger
    51  	lock       *sync.RWMutex
    52  }
    53  
    54  type MetricsInfo struct {
    55  	Counters   []*CounterValue
    56  	Gauges     []*GaugeValue
    57  	Summaries  []*SummaryValue
    58  	Histograms []*HistogramValue
    59  }
    60  type MetricsCopy struct {
    61  	Counters   []CounterValue
    62  	Gauges     []GaugeValue
    63  	Summaries  []SummaryValue
    64  	Histograms []HistogramValue
    65  }
    66  
    67  // CounterValue stores info about a metric that is incremented over time,
    68  // such as the number of requests to an HTTP endpoint.
    69  type CounterValue struct {
    70  	Name        string
    71  	Count       int
    72  	LabelNames  []string
    73  	LabelValues []string
    74  }
    75  
    76  // GaugeValue stores one value that is updated as time goes on, such as
    77  // the amount of memory allocated.
    78  type GaugeValue struct {
    79  	Name        string
    80  	Value       float64
    81  	LabelNames  []string
    82  	LabelValues []string
    83  }
    84  
    85  // SummaryValue stores info about a metric that is incremented over time,
    86  // such as the number of requests to an HTTP endpoint.
    87  type SummaryValue struct {
    88  	Name        string
    89  	Count       int
    90  	Sum         float64
    91  	LabelNames  []string
    92  	LabelValues []string
    93  }
    94  
    95  type HistogramValue struct {
    96  	Name        string
    97  	Count       int
    98  	Sum         float64
    99  	Buckets     map[string]int
   100  	LabelNames  []string
   101  	LabelValues []string
   102  }
   103  
   104  func GetDefaultBuckets() []float64 {
   105  	return []float64{.05, .5, 1, 5, 30, 60, 600, math.MaxFloat64}
   106  }
   107  
   108  func GetStorageLatencyBuckets() []float64 {
   109  	return []float64{.001, .01, 0.1, 1, 5, 10, 15, 30, 60, math.MaxFloat64}
   110  }
   111  
   112  // implements the MetricServer interface.
   113  func (ms *metricServer) SendMetric(metric interface{}) {
   114  	ms.lock.RLock()
   115  	if ms.enabled {
   116  		ms.lock.RUnlock()
   117  		ms.reqChan <- metric
   118  	} else {
   119  		ms.lock.RUnlock()
   120  	}
   121  }
   122  
   123  func (ms *metricServer) ForceSendMetric(metric interface{}) {
   124  	ms.reqChan <- metric
   125  }
   126  
   127  func (ms *metricServer) ReceiveMetrics() interface{} {
   128  	ms.lock.Lock()
   129  	if !ms.enabled {
   130  		ms.enabled = true
   131  	}
   132  	ms.lock.Unlock()
   133  	ms.cacheChan <- MetricsCopy{}
   134  
   135  	return <-ms.cacheChan
   136  }
   137  
   138  func (ms *metricServer) IsEnabled() bool {
   139  	ms.lock.RLock()
   140  	defer ms.lock.RUnlock()
   141  
   142  	return ms.enabled
   143  }
   144  
   145  func (ms *metricServer) Run() {
   146  	sendAfter := make(chan time.Duration, 1)
   147  	// periodically send a notification to the metric server to check if we can disable metrics
   148  	go func() {
   149  		for {
   150  			t := metricsScrapeCheckInterval
   151  			time.Sleep(t)
   152  			sendAfter <- t
   153  		}
   154  	}()
   155  
   156  	for {
   157  		select {
   158  		case <-ms.cacheChan:
   159  			ms.lastCheck = time.Now()
   160  			// make a copy of cache values to prevent data race
   161  			metrics := MetricsCopy{
   162  				Counters:   make([]CounterValue, len(ms.cache.Counters)),
   163  				Gauges:     make([]GaugeValue, len(ms.cache.Gauges)),
   164  				Summaries:  make([]SummaryValue, len(ms.cache.Summaries)),
   165  				Histograms: make([]HistogramValue, len(ms.cache.Histograms)),
   166  			}
   167  			for i, cv := range ms.cache.Counters {
   168  				metrics.Counters[i] = *cv
   169  			}
   170  
   171  			for i, gv := range ms.cache.Gauges {
   172  				metrics.Gauges[i] = *gv
   173  			}
   174  
   175  			for i, sv := range ms.cache.Summaries {
   176  				metrics.Summaries[i] = *sv
   177  			}
   178  
   179  			for i, hv := range ms.cache.Histograms {
   180  				metrics.Histograms[i] = *hv
   181  			}
   182  			ms.cacheChan <- metrics
   183  		case m := <-ms.reqChan:
   184  			switch v := m.(type) {
   185  			case CounterValue:
   186  				cv := m.(CounterValue)
   187  				ms.CounterInc(&cv)
   188  			case GaugeValue:
   189  				gv := m.(GaugeValue)
   190  				ms.GaugeSet(&gv)
   191  			case SummaryValue:
   192  				sv := m.(SummaryValue)
   193  				ms.SummaryObserve(&sv)
   194  			case HistogramValue:
   195  				hv := m.(HistogramValue)
   196  				ms.HistogramObserve(&hv)
   197  			default:
   198  				ms.log.Error().Str("type", fmt.Sprintf("%T", v)).Msg("unexpected type")
   199  			}
   200  		case <-sendAfter:
   201  			// Check if we didn't receive a metrics scrape in a while and if so,
   202  			// disable metrics (possible node exporter down/crashed)
   203  			ms.lock.Lock()
   204  			if ms.enabled {
   205  				lastCheckInterval := time.Since(ms.lastCheck)
   206  				if lastCheckInterval > metricsScrapeTimeout {
   207  					ms.enabled = false
   208  				}
   209  			}
   210  			ms.lock.Unlock()
   211  		}
   212  	}
   213  }
   214  
   215  func NewMetricsServer(enabled bool, log log.Logger) MetricServer {
   216  	mi := &MetricsInfo{
   217  		Counters:   make([]*CounterValue, 0),
   218  		Gauges:     make([]*GaugeValue, 0),
   219  		Summaries:  make([]*SummaryValue, 0),
   220  		Histograms: make([]*HistogramValue, 0),
   221  	}
   222  	// convert to a map for returning easily the string corresponding to a bucket
   223  	bucketsFloat2String := map[float64]string{}
   224  
   225  	for _, fvalue := range append(GetDefaultBuckets(), GetStorageLatencyBuckets()...) {
   226  		if fvalue == math.MaxFloat64 {
   227  			bucketsFloat2String[fvalue] = "+Inf"
   228  		} else {
   229  			s := strconv.FormatFloat(fvalue, 'f', -1, 64)
   230  			bucketsFloat2String[fvalue] = s
   231  		}
   232  	}
   233  
   234  	ms := &metricServer{
   235  		enabled:    enabled,
   236  		reqChan:    make(chan interface{}),
   237  		cacheChan:  make(chan MetricsCopy),
   238  		cache:      mi,
   239  		bucketsF2S: bucketsFloat2String,
   240  		log:        log,
   241  		lock:       &sync.RWMutex{},
   242  	}
   243  
   244  	go ms.Run()
   245  
   246  	return ms
   247  }
   248  
   249  // contains a map with key=CounterName and value=CounterLabels.
   250  func GetCounters() map[string][]string {
   251  	return map[string][]string{
   252  		httpConnRequests:    {"method", "code"},
   253  		repoDownloads:       {"repo"},
   254  		repoUploads:         {"repo"},
   255  		schedulerGenerators: {},
   256  	}
   257  }
   258  
   259  func GetGauges() map[string][]string {
   260  	return map[string][]string{
   261  		repoStorageBytes:          {"repo"},
   262  		serverInfo:                {"commit", "binaryType", "goVersion", "version"},
   263  		schedulerNumWorkers:       {},
   264  		schedulerGeneratorsStatus: {"priority", "state"},
   265  		schedulerTasksQueue:       {"priority"},
   266  		schedulerWorkers:          {"state"},
   267  	}
   268  }
   269  
   270  func GetSummaries() map[string][]string {
   271  	return map[string][]string{
   272  		httpRepoLatencySeconds: {"repo"},
   273  	}
   274  }
   275  
   276  func GetHistograms() map[string][]string {
   277  	return map[string][]string{
   278  		httpMethodLatencySeconds:  {"method"},
   279  		storageLockLatencySeconds: {"storageName", "lockType"},
   280  		workersTasksDuration:      {"name"},
   281  	}
   282  }
   283  
   284  // return true if a metric does not have any labels or if the label
   285  // values for searched metric corresponds to the one in the cached slice.
   286  func isMetricMatch(lValues, metricValues []string) bool {
   287  	if len(lValues) == len(metricValues) {
   288  		for i, v := range metricValues {
   289  			if v != lValues[i] {
   290  				return false
   291  			}
   292  		}
   293  	}
   294  
   295  	return true
   296  }
   297  
   298  // returns {-1, false} in case metric was not found in the slice.
   299  func findCounterValueIndex(metricSlice []*CounterValue, name string, labelValues []string) (int, bool) {
   300  	for i, m := range metricSlice {
   301  		if m.Name == name {
   302  			if isMetricMatch(labelValues, m.LabelValues) {
   303  				return i, true
   304  			}
   305  		}
   306  	}
   307  
   308  	return -1, false
   309  }
   310  
   311  // returns {-1, false} in case metric was not found in the slice.
   312  func findGaugeValueIndex(metricSlice []*GaugeValue, name string, labelValues []string) (int, bool) {
   313  	for i, m := range metricSlice {
   314  		if m.Name == name {
   315  			if isMetricMatch(labelValues, m.LabelValues) {
   316  				return i, true
   317  			}
   318  		}
   319  	}
   320  
   321  	return -1, false
   322  }
   323  
   324  // returns {-1, false} in case metric was not found in the slice.
   325  func findSummaryValueIndex(metricSlice []*SummaryValue, name string, labelValues []string) (int, bool) {
   326  	for i, m := range metricSlice {
   327  		if m.Name == name {
   328  			if isMetricMatch(labelValues, m.LabelValues) {
   329  				return i, true
   330  			}
   331  		}
   332  	}
   333  
   334  	return -1, false
   335  }
   336  
   337  // returns {-1, false} in case metric was not found in the slice.
   338  func findHistogramValueIndex(metricSlice []*HistogramValue, name string, labelValues []string) (int, bool) {
   339  	for i, m := range metricSlice {
   340  		if m.Name == name {
   341  			if isMetricMatch(labelValues, m.LabelValues) {
   342  				return i, true
   343  			}
   344  		}
   345  	}
   346  
   347  	return -1, false
   348  }
   349  
   350  func (ms *metricServer) CounterInc(cv *CounterValue) {
   351  	labels, ok := GetCounters()[cv.Name] // known label names for the 'name' counter
   352  
   353  	err := sanityChecks(cv.Name, labels, ok, cv.LabelNames, cv.LabelValues)
   354  	if err != nil {
   355  		// The last thing we want is to panic/stop the server due to instrumentation
   356  		// thus log a message (should be detected during development of new metrics)
   357  		ms.log.Error().Err(err).Msg("failed due to instrumentation error")
   358  
   359  		return
   360  	}
   361  
   362  	index, ok := findCounterValueIndex(ms.cache.Counters, cv.Name, cv.LabelValues)
   363  	if !ok {
   364  		// cv not found in cache: add it
   365  		cv.Count = 1
   366  		ms.cache.Counters = append(ms.cache.Counters, cv)
   367  	} else {
   368  		ms.cache.Counters[index].Count++
   369  	}
   370  }
   371  
   372  func (ms *metricServer) GaugeSet(gv *GaugeValue) {
   373  	labels, ok := GetGauges()[gv.Name] // known label names for the 'name' counter
   374  
   375  	err := sanityChecks(gv.Name, labels, ok, gv.LabelNames, gv.LabelValues)
   376  	if err != nil {
   377  		ms.log.Error().Err(err).Msg("failed due to instrumentation error")
   378  
   379  		return
   380  	}
   381  
   382  	index, ok := findGaugeValueIndex(ms.cache.Gauges, gv.Name, gv.LabelValues)
   383  	if !ok {
   384  		// gv not found in cache: add it
   385  		ms.cache.Gauges = append(ms.cache.Gauges, gv)
   386  	} else {
   387  		ms.cache.Gauges[index].Value = gv.Value
   388  	}
   389  }
   390  
   391  func (ms *metricServer) SummaryObserve(sv *SummaryValue) {
   392  	labels, ok := GetSummaries()[sv.Name] // known label names for the 'name' summary
   393  
   394  	err := sanityChecks(sv.Name, labels, ok, sv.LabelNames, sv.LabelValues)
   395  	if err != nil {
   396  		ms.log.Error().Err(err).Msg("failed due to instrumentation error")
   397  
   398  		return
   399  	}
   400  
   401  	index, ok := findSummaryValueIndex(ms.cache.Summaries, sv.Name, sv.LabelValues)
   402  	if !ok {
   403  		// The SampledValue not found: add it
   404  		sv.Count = 1 // First value, no need to increment
   405  		ms.cache.Summaries = append(ms.cache.Summaries, sv)
   406  	} else {
   407  		ms.cache.Summaries[index].Count++
   408  		ms.cache.Summaries[index].Sum += sv.Sum
   409  	}
   410  }
   411  
   412  func (ms *metricServer) HistogramObserve(hv *HistogramValue) {
   413  	labels, ok := GetHistograms()[hv.Name] // known label names for the 'name' counter
   414  
   415  	err := sanityChecks(hv.Name, labels, ok, hv.LabelNames, hv.LabelValues)
   416  	if err != nil {
   417  		ms.log.Error().Err(err).Msg("failed due to instrumentation error")
   418  
   419  		return
   420  	}
   421  
   422  	index, ok := findHistogramValueIndex(ms.cache.Histograms, hv.Name, hv.LabelValues)
   423  	if !ok {
   424  		// The HistogramValue not found: add it
   425  		buckets := make(map[string]int)
   426  
   427  		for _, fvalue := range GetBuckets(hv.Name) {
   428  			if hv.Sum <= fvalue {
   429  				buckets[ms.bucketsF2S[fvalue]] = 1
   430  			} else {
   431  				buckets[ms.bucketsF2S[fvalue]] = 0
   432  			}
   433  		}
   434  
   435  		hv.Count = 1 // First value, no need to increment
   436  		hv.Buckets = buckets
   437  		ms.cache.Histograms = append(ms.cache.Histograms, hv)
   438  	} else {
   439  		cachedH := ms.cache.Histograms[index]
   440  		cachedH.Count++
   441  		cachedH.Sum += hv.Sum
   442  		for _, fvalue := range GetBuckets(hv.Name) {
   443  			if hv.Sum <= fvalue {
   444  				cachedH.Buckets[ms.bucketsF2S[fvalue]]++
   445  			}
   446  		}
   447  	}
   448  }
   449  
   450  //nolint:goerr113
   451  func sanityChecks(name string, knownLabels []string, found bool, labelNames, labelValues []string) error {
   452  	if !found {
   453  		return fmt.Errorf("metric %s: not found", name)
   454  	}
   455  
   456  	if len(labelNames) != len(labelValues) ||
   457  		len(labelNames) != len(knownLabels) {
   458  		return fmt.Errorf("metric %s: label size mismatch", name)
   459  	}
   460  	// The list of label names defined in init() for the counter must match what was provided in labelNames
   461  	for i, label := range labelNames {
   462  		if label != knownLabels[i] {
   463  			return fmt.Errorf("metric %s: label size mismatch", name)
   464  		}
   465  	}
   466  
   467  	return nil
   468  }
   469  
   470  func IncHTTPConnRequests(ms MetricServer, lvs ...string) {
   471  	req := CounterValue{
   472  		Name:        httpConnRequests,
   473  		LabelNames:  []string{"method", "code"},
   474  		LabelValues: lvs,
   475  	}
   476  	ms.SendMetric(req)
   477  }
   478  
   479  func ObserveHTTPRepoLatency(ms MetricServer, path string, latency time.Duration) {
   480  	var lvs []string
   481  	match := re.FindStringSubmatch(path)
   482  
   483  	if len(match) > 1 {
   484  		lvs = []string{match[1]}
   485  	} else {
   486  		lvs = []string{"N/A"}
   487  	}
   488  
   489  	sv := SummaryValue{
   490  		Name:        httpRepoLatencySeconds,
   491  		Sum:         latency.Seconds(),
   492  		LabelNames:  []string{"repo"},
   493  		LabelValues: lvs,
   494  	}
   495  	ms.SendMetric(sv)
   496  }
   497  
   498  func ObserveHTTPMethodLatency(ms MetricServer, method string, latency time.Duration) {
   499  	h := HistogramValue{
   500  		Name:        httpMethodLatencySeconds,
   501  		Sum:         latency.Seconds(), // convenient temporary store for Histogram latency value
   502  		LabelNames:  []string{"method"},
   503  		LabelValues: []string{method},
   504  	}
   505  	ms.SendMetric(h)
   506  }
   507  
   508  func IncDownloadCounter(ms MetricServer, repo string) {
   509  	dCounter := CounterValue{
   510  		Name:        repoDownloads,
   511  		LabelNames:  []string{"repo"},
   512  		LabelValues: []string{repo},
   513  	}
   514  	ms.SendMetric(dCounter)
   515  }
   516  
   517  func IncUploadCounter(ms MetricServer, repo string) {
   518  	uCounter := CounterValue{
   519  		Name:        repoUploads,
   520  		LabelNames:  []string{"repo"},
   521  		LabelValues: []string{repo},
   522  	}
   523  	ms.SendMetric(uCounter)
   524  }
   525  
   526  func SetStorageUsage(ms MetricServer, rootDir, repo string) {
   527  	dir := path.Join(rootDir, repo)
   528  
   529  	repoSize, err := GetDirSize(dir)
   530  	if err != nil {
   531  		ms.(*metricServer).log.Error().Err(err).Msg("failed to set storage usage")
   532  	}
   533  
   534  	storage := GaugeValue{
   535  		Name:        repoStorageBytes,
   536  		Value:       float64(repoSize),
   537  		LabelNames:  []string{"repo"},
   538  		LabelValues: []string{repo},
   539  	}
   540  	ms.ForceSendMetric(storage)
   541  }
   542  
   543  func SetServerInfo(ms MetricServer, lvs ...string) {
   544  	info := GaugeValue{
   545  		Name:        serverInfo,
   546  		Value:       0,
   547  		LabelNames:  []string{"commit", "binaryType", "goVersion", "version"},
   548  		LabelValues: lvs,
   549  	}
   550  	// This metric is set once at zot startup (set it regardless of metrics enabled)
   551  	ms.ForceSendMetric(info)
   552  }
   553  
   554  func ObserveStorageLockLatency(ms MetricServer, latency time.Duration, storageName, lockType string) {
   555  	h := HistogramValue{
   556  		Name:        storageLockLatencySeconds,
   557  		Sum:         latency.Seconds(), // convenient temporary store for Histogram latency value
   558  		LabelNames:  []string{"storageName", "lockType"},
   559  		LabelValues: []string{storageName, lockType},
   560  	}
   561  	ms.SendMetric(h)
   562  }
   563  
   564  func GetMaxIdleScrapeInterval() time.Duration {
   565  	return metricsScrapeTimeout + metricsScrapeCheckInterval
   566  }
   567  
   568  func GetBuckets(metricName string) []float64 {
   569  	switch metricName {
   570  	case storageLockLatencySeconds:
   571  		return GetStorageLatencyBuckets()
   572  	default:
   573  		return GetDefaultBuckets()
   574  	}
   575  }
   576  
   577  func SetSchedulerNumWorkers(ms MetricServer, workers int) {
   578  	numWorkers := GaugeValue{
   579  		Name:  schedulerNumWorkers,
   580  		Value: float64(workers),
   581  	}
   582  	ms.ForceSendMetric(numWorkers)
   583  }
   584  
   585  func IncSchedulerGenerators(ms MetricServer) {
   586  	genCounter := CounterValue{
   587  		Name: schedulerGenerators,
   588  	}
   589  	ms.ForceSendMetric(genCounter)
   590  }
   591  
   592  func ObserveWorkersTasksDuration(ms MetricServer, taskName string, duration time.Duration) {
   593  	h := HistogramValue{
   594  		Name:        workersTasksDuration,
   595  		Sum:         duration.Seconds(), // convenient temporary store for Histogram latency value
   596  		LabelNames:  []string{"name"},
   597  		LabelValues: []string{taskName},
   598  	}
   599  	ms.SendMetric(h)
   600  }
   601  
   602  func SetSchedulerGenerators(ms MetricServer, gen map[string]map[string]uint64) {
   603  	for priority, states := range gen {
   604  		for state, value := range states {
   605  			generator := GaugeValue{
   606  				Name:        schedulerGeneratorsStatus,
   607  				Value:       float64(value),
   608  				LabelNames:  []string{"priority", "state"},
   609  				LabelValues: []string{priority, state},
   610  			}
   611  			ms.SendMetric(generator)
   612  		}
   613  	}
   614  }
   615  
   616  func SetSchedulerTasksQueue(ms MetricServer, tq map[string]int) {
   617  	for priority, value := range tq {
   618  		tasks := GaugeValue{
   619  			Name:        schedulerTasksQueue,
   620  			Value:       float64(value),
   621  			LabelNames:  []string{"priority"},
   622  			LabelValues: []string{priority},
   623  		}
   624  		ms.SendMetric(tasks)
   625  	}
   626  }
   627  
   628  func SetSchedulerWorkers(ms MetricServer, w map[string]int) {
   629  	for state, value := range w {
   630  		workers := GaugeValue{
   631  			Name:        schedulerWorkers,
   632  			Value:       float64(value),
   633  			LabelNames:  []string{"state"},
   634  			LabelValues: []string{state},
   635  		}
   636  		ms.SendMetric(workers)
   637  	}
   638  }