zotregistry.dev/zot@v1.4.4-0.20240314164342-eec277e14d20/pkg/extensions/monitoring/minimal.go (about) 1 //go:build !metrics 2 // +build !metrics 3 4 //nolint:varnamelen,forcetypeassert 5 package monitoring 6 7 import ( 8 "fmt" 9 "math" 10 "path" 11 "strconv" 12 "sync" 13 "time" 14 15 "zotregistry.dev/zot/pkg/log" 16 ) 17 18 const ( 19 metricsNamespace = "zot" 20 // Counters. 21 httpConnRequests = metricsNamespace + ".http.requests" 22 repoDownloads = metricsNamespace + ".repo.downloads" 23 repoUploads = metricsNamespace + ".repo.uploads" 24 schedulerGenerators = metricsNamespace + ".scheduler.generators" 25 // Gauge. 26 repoStorageBytes = metricsNamespace + ".repo.storage.bytes" 27 serverInfo = metricsNamespace + ".info" 28 schedulerNumWorkers = metricsNamespace + ".scheduler.workers.total" 29 schedulerWorkers = metricsNamespace + ".scheduler.workers" 30 schedulerGeneratorsStatus = metricsNamespace + ".scheduler.generators.status" 31 schedulerTasksQueue = metricsNamespace + ".scheduler.tasksqueue.length" 32 // Summary. 33 httpRepoLatencySeconds = metricsNamespace + ".http.repo.latency.seconds" 34 // Histogram. 35 httpMethodLatencySeconds = metricsNamespace + ".http.method.latency.seconds" 36 storageLockLatencySeconds = metricsNamespace + ".storage.lock.latency.seconds" 37 workersTasksDuration = metricsNamespace + ".scheduler.workers.tasks.duration.seconds" 38 39 metricsScrapeTimeout = 2 * time.Minute 40 metricsScrapeCheckInterval = 30 * time.Second 41 ) 42 43 type metricServer struct { 44 enabled bool 45 lastCheck time.Time 46 reqChan chan interface{} 47 cache *MetricsInfo 48 cacheChan chan MetricsCopy 49 bucketsF2S map[float64]string // float64 to string conversion of buckets label 50 log log.Logger 51 lock *sync.RWMutex 52 } 53 54 type MetricsInfo struct { 55 Counters []*CounterValue 56 Gauges []*GaugeValue 57 Summaries []*SummaryValue 58 Histograms []*HistogramValue 59 } 60 type MetricsCopy struct { 61 Counters []CounterValue 62 Gauges []GaugeValue 63 Summaries []SummaryValue 64 Histograms []HistogramValue 65 } 66 67 // CounterValue stores info about a metric that is incremented over time, 68 // such as the number of requests to an HTTP endpoint. 69 type CounterValue struct { 70 Name string 71 Count int 72 LabelNames []string 73 LabelValues []string 74 } 75 76 // GaugeValue stores one value that is updated as time goes on, such as 77 // the amount of memory allocated. 78 type GaugeValue struct { 79 Name string 80 Value float64 81 LabelNames []string 82 LabelValues []string 83 } 84 85 // SummaryValue stores info about a metric that is incremented over time, 86 // such as the number of requests to an HTTP endpoint. 87 type SummaryValue struct { 88 Name string 89 Count int 90 Sum float64 91 LabelNames []string 92 LabelValues []string 93 } 94 95 type HistogramValue struct { 96 Name string 97 Count int 98 Sum float64 99 Buckets map[string]int 100 LabelNames []string 101 LabelValues []string 102 } 103 104 func GetDefaultBuckets() []float64 { 105 return []float64{.05, .5, 1, 5, 30, 60, 600, math.MaxFloat64} 106 } 107 108 func GetStorageLatencyBuckets() []float64 { 109 return []float64{.001, .01, 0.1, 1, 5, 10, 15, 30, 60, math.MaxFloat64} 110 } 111 112 // implements the MetricServer interface. 113 func (ms *metricServer) SendMetric(metric interface{}) { 114 ms.lock.RLock() 115 if ms.enabled { 116 ms.lock.RUnlock() 117 ms.reqChan <- metric 118 } else { 119 ms.lock.RUnlock() 120 } 121 } 122 123 func (ms *metricServer) ForceSendMetric(metric interface{}) { 124 ms.reqChan <- metric 125 } 126 127 func (ms *metricServer) ReceiveMetrics() interface{} { 128 ms.lock.Lock() 129 if !ms.enabled { 130 ms.enabled = true 131 } 132 ms.lock.Unlock() 133 ms.cacheChan <- MetricsCopy{} 134 135 return <-ms.cacheChan 136 } 137 138 func (ms *metricServer) IsEnabled() bool { 139 ms.lock.RLock() 140 defer ms.lock.RUnlock() 141 142 return ms.enabled 143 } 144 145 func (ms *metricServer) Run() { 146 sendAfter := make(chan time.Duration, 1) 147 // periodically send a notification to the metric server to check if we can disable metrics 148 go func() { 149 for { 150 t := metricsScrapeCheckInterval 151 time.Sleep(t) 152 sendAfter <- t 153 } 154 }() 155 156 for { 157 select { 158 case <-ms.cacheChan: 159 ms.lastCheck = time.Now() 160 // make a copy of cache values to prevent data race 161 metrics := MetricsCopy{ 162 Counters: make([]CounterValue, len(ms.cache.Counters)), 163 Gauges: make([]GaugeValue, len(ms.cache.Gauges)), 164 Summaries: make([]SummaryValue, len(ms.cache.Summaries)), 165 Histograms: make([]HistogramValue, len(ms.cache.Histograms)), 166 } 167 for i, cv := range ms.cache.Counters { 168 metrics.Counters[i] = *cv 169 } 170 171 for i, gv := range ms.cache.Gauges { 172 metrics.Gauges[i] = *gv 173 } 174 175 for i, sv := range ms.cache.Summaries { 176 metrics.Summaries[i] = *sv 177 } 178 179 for i, hv := range ms.cache.Histograms { 180 metrics.Histograms[i] = *hv 181 } 182 ms.cacheChan <- metrics 183 case m := <-ms.reqChan: 184 switch v := m.(type) { 185 case CounterValue: 186 cv := m.(CounterValue) 187 ms.CounterInc(&cv) 188 case GaugeValue: 189 gv := m.(GaugeValue) 190 ms.GaugeSet(&gv) 191 case SummaryValue: 192 sv := m.(SummaryValue) 193 ms.SummaryObserve(&sv) 194 case HistogramValue: 195 hv := m.(HistogramValue) 196 ms.HistogramObserve(&hv) 197 default: 198 ms.log.Error().Str("type", fmt.Sprintf("%T", v)).Msg("unexpected type") 199 } 200 case <-sendAfter: 201 // Check if we didn't receive a metrics scrape in a while and if so, 202 // disable metrics (possible node exporter down/crashed) 203 ms.lock.Lock() 204 if ms.enabled { 205 lastCheckInterval := time.Since(ms.lastCheck) 206 if lastCheckInterval > metricsScrapeTimeout { 207 ms.enabled = false 208 } 209 } 210 ms.lock.Unlock() 211 } 212 } 213 } 214 215 func NewMetricsServer(enabled bool, log log.Logger) MetricServer { 216 mi := &MetricsInfo{ 217 Counters: make([]*CounterValue, 0), 218 Gauges: make([]*GaugeValue, 0), 219 Summaries: make([]*SummaryValue, 0), 220 Histograms: make([]*HistogramValue, 0), 221 } 222 // convert to a map for returning easily the string corresponding to a bucket 223 bucketsFloat2String := map[float64]string{} 224 225 for _, fvalue := range append(GetDefaultBuckets(), GetStorageLatencyBuckets()...) { 226 if fvalue == math.MaxFloat64 { 227 bucketsFloat2String[fvalue] = "+Inf" 228 } else { 229 s := strconv.FormatFloat(fvalue, 'f', -1, 64) 230 bucketsFloat2String[fvalue] = s 231 } 232 } 233 234 ms := &metricServer{ 235 enabled: enabled, 236 reqChan: make(chan interface{}), 237 cacheChan: make(chan MetricsCopy), 238 cache: mi, 239 bucketsF2S: bucketsFloat2String, 240 log: log, 241 lock: &sync.RWMutex{}, 242 } 243 244 go ms.Run() 245 246 return ms 247 } 248 249 // contains a map with key=CounterName and value=CounterLabels. 250 func GetCounters() map[string][]string { 251 return map[string][]string{ 252 httpConnRequests: {"method", "code"}, 253 repoDownloads: {"repo"}, 254 repoUploads: {"repo"}, 255 schedulerGenerators: {}, 256 } 257 } 258 259 func GetGauges() map[string][]string { 260 return map[string][]string{ 261 repoStorageBytes: {"repo"}, 262 serverInfo: {"commit", "binaryType", "goVersion", "version"}, 263 schedulerNumWorkers: {}, 264 schedulerGeneratorsStatus: {"priority", "state"}, 265 schedulerTasksQueue: {"priority"}, 266 schedulerWorkers: {"state"}, 267 } 268 } 269 270 func GetSummaries() map[string][]string { 271 return map[string][]string{ 272 httpRepoLatencySeconds: {"repo"}, 273 } 274 } 275 276 func GetHistograms() map[string][]string { 277 return map[string][]string{ 278 httpMethodLatencySeconds: {"method"}, 279 storageLockLatencySeconds: {"storageName", "lockType"}, 280 workersTasksDuration: {"name"}, 281 } 282 } 283 284 // return true if a metric does not have any labels or if the label 285 // values for searched metric corresponds to the one in the cached slice. 286 func isMetricMatch(lValues, metricValues []string) bool { 287 if len(lValues) == len(metricValues) { 288 for i, v := range metricValues { 289 if v != lValues[i] { 290 return false 291 } 292 } 293 } 294 295 return true 296 } 297 298 // returns {-1, false} in case metric was not found in the slice. 299 func findCounterValueIndex(metricSlice []*CounterValue, name string, labelValues []string) (int, bool) { 300 for i, m := range metricSlice { 301 if m.Name == name { 302 if isMetricMatch(labelValues, m.LabelValues) { 303 return i, true 304 } 305 } 306 } 307 308 return -1, false 309 } 310 311 // returns {-1, false} in case metric was not found in the slice. 312 func findGaugeValueIndex(metricSlice []*GaugeValue, name string, labelValues []string) (int, bool) { 313 for i, m := range metricSlice { 314 if m.Name == name { 315 if isMetricMatch(labelValues, m.LabelValues) { 316 return i, true 317 } 318 } 319 } 320 321 return -1, false 322 } 323 324 // returns {-1, false} in case metric was not found in the slice. 325 func findSummaryValueIndex(metricSlice []*SummaryValue, name string, labelValues []string) (int, bool) { 326 for i, m := range metricSlice { 327 if m.Name == name { 328 if isMetricMatch(labelValues, m.LabelValues) { 329 return i, true 330 } 331 } 332 } 333 334 return -1, false 335 } 336 337 // returns {-1, false} in case metric was not found in the slice. 338 func findHistogramValueIndex(metricSlice []*HistogramValue, name string, labelValues []string) (int, bool) { 339 for i, m := range metricSlice { 340 if m.Name == name { 341 if isMetricMatch(labelValues, m.LabelValues) { 342 return i, true 343 } 344 } 345 } 346 347 return -1, false 348 } 349 350 func (ms *metricServer) CounterInc(cv *CounterValue) { 351 labels, ok := GetCounters()[cv.Name] // known label names for the 'name' counter 352 353 err := sanityChecks(cv.Name, labels, ok, cv.LabelNames, cv.LabelValues) 354 if err != nil { 355 // The last thing we want is to panic/stop the server due to instrumentation 356 // thus log a message (should be detected during development of new metrics) 357 ms.log.Error().Err(err).Msg("failed due to instrumentation error") 358 359 return 360 } 361 362 index, ok := findCounterValueIndex(ms.cache.Counters, cv.Name, cv.LabelValues) 363 if !ok { 364 // cv not found in cache: add it 365 cv.Count = 1 366 ms.cache.Counters = append(ms.cache.Counters, cv) 367 } else { 368 ms.cache.Counters[index].Count++ 369 } 370 } 371 372 func (ms *metricServer) GaugeSet(gv *GaugeValue) { 373 labels, ok := GetGauges()[gv.Name] // known label names for the 'name' counter 374 375 err := sanityChecks(gv.Name, labels, ok, gv.LabelNames, gv.LabelValues) 376 if err != nil { 377 ms.log.Error().Err(err).Msg("failed due to instrumentation error") 378 379 return 380 } 381 382 index, ok := findGaugeValueIndex(ms.cache.Gauges, gv.Name, gv.LabelValues) 383 if !ok { 384 // gv not found in cache: add it 385 ms.cache.Gauges = append(ms.cache.Gauges, gv) 386 } else { 387 ms.cache.Gauges[index].Value = gv.Value 388 } 389 } 390 391 func (ms *metricServer) SummaryObserve(sv *SummaryValue) { 392 labels, ok := GetSummaries()[sv.Name] // known label names for the 'name' summary 393 394 err := sanityChecks(sv.Name, labels, ok, sv.LabelNames, sv.LabelValues) 395 if err != nil { 396 ms.log.Error().Err(err).Msg("failed due to instrumentation error") 397 398 return 399 } 400 401 index, ok := findSummaryValueIndex(ms.cache.Summaries, sv.Name, sv.LabelValues) 402 if !ok { 403 // The SampledValue not found: add it 404 sv.Count = 1 // First value, no need to increment 405 ms.cache.Summaries = append(ms.cache.Summaries, sv) 406 } else { 407 ms.cache.Summaries[index].Count++ 408 ms.cache.Summaries[index].Sum += sv.Sum 409 } 410 } 411 412 func (ms *metricServer) HistogramObserve(hv *HistogramValue) { 413 labels, ok := GetHistograms()[hv.Name] // known label names for the 'name' counter 414 415 err := sanityChecks(hv.Name, labels, ok, hv.LabelNames, hv.LabelValues) 416 if err != nil { 417 ms.log.Error().Err(err).Msg("failed due to instrumentation error") 418 419 return 420 } 421 422 index, ok := findHistogramValueIndex(ms.cache.Histograms, hv.Name, hv.LabelValues) 423 if !ok { 424 // The HistogramValue not found: add it 425 buckets := make(map[string]int) 426 427 for _, fvalue := range GetBuckets(hv.Name) { 428 if hv.Sum <= fvalue { 429 buckets[ms.bucketsF2S[fvalue]] = 1 430 } else { 431 buckets[ms.bucketsF2S[fvalue]] = 0 432 } 433 } 434 435 hv.Count = 1 // First value, no need to increment 436 hv.Buckets = buckets 437 ms.cache.Histograms = append(ms.cache.Histograms, hv) 438 } else { 439 cachedH := ms.cache.Histograms[index] 440 cachedH.Count++ 441 cachedH.Sum += hv.Sum 442 for _, fvalue := range GetBuckets(hv.Name) { 443 if hv.Sum <= fvalue { 444 cachedH.Buckets[ms.bucketsF2S[fvalue]]++ 445 } 446 } 447 } 448 } 449 450 //nolint:goerr113 451 func sanityChecks(name string, knownLabels []string, found bool, labelNames, labelValues []string) error { 452 if !found { 453 return fmt.Errorf("metric %s: not found", name) 454 } 455 456 if len(labelNames) != len(labelValues) || 457 len(labelNames) != len(knownLabels) { 458 return fmt.Errorf("metric %s: label size mismatch", name) 459 } 460 // The list of label names defined in init() for the counter must match what was provided in labelNames 461 for i, label := range labelNames { 462 if label != knownLabels[i] { 463 return fmt.Errorf("metric %s: label size mismatch", name) 464 } 465 } 466 467 return nil 468 } 469 470 func IncHTTPConnRequests(ms MetricServer, lvs ...string) { 471 req := CounterValue{ 472 Name: httpConnRequests, 473 LabelNames: []string{"method", "code"}, 474 LabelValues: lvs, 475 } 476 ms.SendMetric(req) 477 } 478 479 func ObserveHTTPRepoLatency(ms MetricServer, path string, latency time.Duration) { 480 var lvs []string 481 match := re.FindStringSubmatch(path) 482 483 if len(match) > 1 { 484 lvs = []string{match[1]} 485 } else { 486 lvs = []string{"N/A"} 487 } 488 489 sv := SummaryValue{ 490 Name: httpRepoLatencySeconds, 491 Sum: latency.Seconds(), 492 LabelNames: []string{"repo"}, 493 LabelValues: lvs, 494 } 495 ms.SendMetric(sv) 496 } 497 498 func ObserveHTTPMethodLatency(ms MetricServer, method string, latency time.Duration) { 499 h := HistogramValue{ 500 Name: httpMethodLatencySeconds, 501 Sum: latency.Seconds(), // convenient temporary store for Histogram latency value 502 LabelNames: []string{"method"}, 503 LabelValues: []string{method}, 504 } 505 ms.SendMetric(h) 506 } 507 508 func IncDownloadCounter(ms MetricServer, repo string) { 509 dCounter := CounterValue{ 510 Name: repoDownloads, 511 LabelNames: []string{"repo"}, 512 LabelValues: []string{repo}, 513 } 514 ms.SendMetric(dCounter) 515 } 516 517 func IncUploadCounter(ms MetricServer, repo string) { 518 uCounter := CounterValue{ 519 Name: repoUploads, 520 LabelNames: []string{"repo"}, 521 LabelValues: []string{repo}, 522 } 523 ms.SendMetric(uCounter) 524 } 525 526 func SetStorageUsage(ms MetricServer, rootDir, repo string) { 527 dir := path.Join(rootDir, repo) 528 529 repoSize, err := GetDirSize(dir) 530 if err != nil { 531 ms.(*metricServer).log.Error().Err(err).Msg("failed to set storage usage") 532 } 533 534 storage := GaugeValue{ 535 Name: repoStorageBytes, 536 Value: float64(repoSize), 537 LabelNames: []string{"repo"}, 538 LabelValues: []string{repo}, 539 } 540 ms.ForceSendMetric(storage) 541 } 542 543 func SetServerInfo(ms MetricServer, lvs ...string) { 544 info := GaugeValue{ 545 Name: serverInfo, 546 Value: 0, 547 LabelNames: []string{"commit", "binaryType", "goVersion", "version"}, 548 LabelValues: lvs, 549 } 550 // This metric is set once at zot startup (set it regardless of metrics enabled) 551 ms.ForceSendMetric(info) 552 } 553 554 func ObserveStorageLockLatency(ms MetricServer, latency time.Duration, storageName, lockType string) { 555 h := HistogramValue{ 556 Name: storageLockLatencySeconds, 557 Sum: latency.Seconds(), // convenient temporary store for Histogram latency value 558 LabelNames: []string{"storageName", "lockType"}, 559 LabelValues: []string{storageName, lockType}, 560 } 561 ms.SendMetric(h) 562 } 563 564 func GetMaxIdleScrapeInterval() time.Duration { 565 return metricsScrapeTimeout + metricsScrapeCheckInterval 566 } 567 568 func GetBuckets(metricName string) []float64 { 569 switch metricName { 570 case storageLockLatencySeconds: 571 return GetStorageLatencyBuckets() 572 default: 573 return GetDefaultBuckets() 574 } 575 } 576 577 func SetSchedulerNumWorkers(ms MetricServer, workers int) { 578 numWorkers := GaugeValue{ 579 Name: schedulerNumWorkers, 580 Value: float64(workers), 581 } 582 ms.ForceSendMetric(numWorkers) 583 } 584 585 func IncSchedulerGenerators(ms MetricServer) { 586 genCounter := CounterValue{ 587 Name: schedulerGenerators, 588 } 589 ms.ForceSendMetric(genCounter) 590 } 591 592 func ObserveWorkersTasksDuration(ms MetricServer, taskName string, duration time.Duration) { 593 h := HistogramValue{ 594 Name: workersTasksDuration, 595 Sum: duration.Seconds(), // convenient temporary store for Histogram latency value 596 LabelNames: []string{"name"}, 597 LabelValues: []string{taskName}, 598 } 599 ms.SendMetric(h) 600 } 601 602 func SetSchedulerGenerators(ms MetricServer, gen map[string]map[string]uint64) { 603 for priority, states := range gen { 604 for state, value := range states { 605 generator := GaugeValue{ 606 Name: schedulerGeneratorsStatus, 607 Value: float64(value), 608 LabelNames: []string{"priority", "state"}, 609 LabelValues: []string{priority, state}, 610 } 611 ms.SendMetric(generator) 612 } 613 } 614 } 615 616 func SetSchedulerTasksQueue(ms MetricServer, tq map[string]int) { 617 for priority, value := range tq { 618 tasks := GaugeValue{ 619 Name: schedulerTasksQueue, 620 Value: float64(value), 621 LabelNames: []string{"priority"}, 622 LabelValues: []string{priority}, 623 } 624 ms.SendMetric(tasks) 625 } 626 } 627 628 func SetSchedulerWorkers(ms MetricServer, w map[string]int) { 629 for state, value := range w { 630 workers := GaugeValue{ 631 Name: schedulerWorkers, 632 Value: float64(value), 633 LabelNames: []string{"state"}, 634 LabelValues: []string{state}, 635 } 636 ms.SendMetric(workers) 637 } 638 }