zotregistry.io/zot@v1.4.4-0.20231124084042-02a8ed785457/pkg/extensions/monitoring/minimal.go (about) 1 //go:build !metrics 2 // +build !metrics 3 4 //nolint:varnamelen,forcetypeassert 5 package monitoring 6 7 import ( 8 "fmt" 9 "math" 10 "path" 11 "strconv" 12 "sync" 13 "time" 14 15 "zotregistry.io/zot/pkg/log" 16 ) 17 18 const ( 19 metricsNamespace = "zot" 20 // Counters. 21 httpConnRequests = metricsNamespace + ".http.requests" 22 repoDownloads = metricsNamespace + ".repo.downloads" 23 repoUploads = metricsNamespace + ".repo.uploads" 24 // Gauge. 25 repoStorageBytes = metricsNamespace + ".repo.storage.bytes" 26 serverInfo = metricsNamespace + ".info" 27 // Summary. 28 httpRepoLatencySeconds = metricsNamespace + ".http.repo.latency.seconds" 29 // Histogram. 30 httpMethodLatencySeconds = metricsNamespace + ".http.method.latency.seconds" 31 storageLockLatencySeconds = metricsNamespace + ".storage.lock.latency.seconds" 32 33 metricsScrapeTimeout = 2 * time.Minute 34 metricsScrapeCheckInterval = 30 * time.Second 35 ) 36 37 type metricServer struct { 38 enabled bool 39 lastCheck time.Time 40 reqChan chan interface{} 41 cache *MetricsInfo 42 cacheChan chan *MetricsInfo 43 bucketsF2S map[float64]string // float64 to string conversion of buckets label 44 log log.Logger 45 lock *sync.RWMutex 46 } 47 48 type MetricsInfo struct { 49 Counters []*CounterValue 50 Gauges []*GaugeValue 51 Summaries []*SummaryValue 52 Histograms []*HistogramValue 53 } 54 55 // CounterValue stores info about a metric that is incremented over time, 56 // such as the number of requests to an HTTP endpoint. 57 type CounterValue struct { 58 Name string 59 Count int 60 LabelNames []string 61 LabelValues []string 62 } 63 64 // GaugeValue stores one value that is updated as time goes on, such as 65 // the amount of memory allocated. 66 type GaugeValue struct { 67 Name string 68 Value float64 69 LabelNames []string 70 LabelValues []string 71 } 72 73 // SummaryValue stores info about a metric that is incremented over time, 74 // such as the number of requests to an HTTP endpoint. 75 type SummaryValue struct { 76 Name string 77 Count int 78 Sum float64 79 LabelNames []string 80 LabelValues []string 81 } 82 83 type HistogramValue struct { 84 Name string 85 Count int 86 Sum float64 87 Buckets map[string]int 88 LabelNames []string 89 LabelValues []string 90 } 91 92 func GetDefaultBuckets() []float64 { 93 return []float64{.05, .5, 1, 5, 30, 60, 600, math.MaxFloat64} 94 } 95 96 func GetStorageLatencyBuckets() []float64 { 97 return []float64{.001, .01, 0.1, 1, 5, 10, 15, 30, 60, math.MaxFloat64} 98 } 99 100 // implements the MetricServer interface. 101 func (ms *metricServer) SendMetric(metric interface{}) { 102 ms.lock.RLock() 103 if ms.enabled { 104 ms.lock.RUnlock() 105 ms.reqChan <- metric 106 } else { 107 ms.lock.RUnlock() 108 } 109 } 110 111 func (ms *metricServer) ForceSendMetric(metric interface{}) { 112 ms.reqChan <- metric 113 } 114 115 func (ms *metricServer) ReceiveMetrics() interface{} { 116 ms.lock.Lock() 117 if !ms.enabled { 118 ms.enabled = true 119 } 120 ms.lock.Unlock() 121 ms.cacheChan <- &MetricsInfo{} 122 123 return <-ms.cacheChan 124 } 125 126 func (ms *metricServer) IsEnabled() bool { 127 ms.lock.RLock() 128 defer ms.lock.RUnlock() 129 130 return ms.enabled 131 } 132 133 func (ms *metricServer) Run() { 134 sendAfter := make(chan time.Duration, 1) 135 // periodically send a notification to the metric server to check if we can disable metrics 136 go func() { 137 for { 138 t := metricsScrapeCheckInterval 139 time.Sleep(t) 140 sendAfter <- t 141 } 142 }() 143 144 for { 145 select { 146 case <-ms.cacheChan: 147 ms.lastCheck = time.Now() 148 ms.cacheChan <- ms.cache 149 case m := <-ms.reqChan: 150 switch v := m.(type) { 151 case CounterValue: 152 cv := m.(CounterValue) 153 ms.CounterInc(&cv) 154 case GaugeValue: 155 gv := m.(GaugeValue) 156 ms.GaugeSet(&gv) 157 case SummaryValue: 158 sv := m.(SummaryValue) 159 ms.SummaryObserve(&sv) 160 case HistogramValue: 161 hv := m.(HistogramValue) 162 ms.HistogramObserve(&hv) 163 default: 164 ms.log.Error().Str("type", fmt.Sprintf("%T", v)).Msg("unexpected type") 165 } 166 case <-sendAfter: 167 // Check if we didn't receive a metrics scrape in a while and if so, 168 // disable metrics (possible node exporter down/crashed) 169 ms.lock.Lock() 170 if ms.enabled { 171 lastCheckInterval := time.Since(ms.lastCheck) 172 if lastCheckInterval > metricsScrapeTimeout { 173 ms.enabled = false 174 } 175 } 176 ms.lock.Unlock() 177 } 178 } 179 } 180 181 func NewMetricsServer(enabled bool, log log.Logger) MetricServer { 182 mi := &MetricsInfo{ 183 Counters: make([]*CounterValue, 0), 184 Gauges: make([]*GaugeValue, 0), 185 Summaries: make([]*SummaryValue, 0), 186 Histograms: make([]*HistogramValue, 0), 187 } 188 // convert to a map for returning easily the string corresponding to a bucket 189 bucketsFloat2String := map[float64]string{} 190 191 for _, fvalue := range append(GetDefaultBuckets(), GetStorageLatencyBuckets()...) { 192 if fvalue == math.MaxFloat64 { 193 bucketsFloat2String[fvalue] = "+Inf" 194 } else { 195 s := strconv.FormatFloat(fvalue, 'f', -1, 64) 196 bucketsFloat2String[fvalue] = s 197 } 198 } 199 200 ms := &metricServer{ 201 enabled: enabled, 202 reqChan: make(chan interface{}), 203 cacheChan: make(chan *MetricsInfo), 204 cache: mi, 205 bucketsF2S: bucketsFloat2String, 206 log: log, 207 lock: &sync.RWMutex{}, 208 } 209 210 go ms.Run() 211 212 return ms 213 } 214 215 // contains a map with key=CounterName and value=CounterLabels. 216 func GetCounters() map[string][]string { 217 return map[string][]string{ 218 httpConnRequests: {"method", "code"}, 219 repoDownloads: {"repo"}, 220 repoUploads: {"repo"}, 221 } 222 } 223 224 func GetGauges() map[string][]string { 225 return map[string][]string{ 226 repoStorageBytes: {"repo"}, 227 serverInfo: {"commit", "binaryType", "goVersion", "version"}, 228 } 229 } 230 231 func GetSummaries() map[string][]string { 232 return map[string][]string{ 233 httpRepoLatencySeconds: {"repo"}, 234 } 235 } 236 237 func GetHistograms() map[string][]string { 238 return map[string][]string{ 239 httpMethodLatencySeconds: {"method"}, 240 storageLockLatencySeconds: {"storageName", "lockType"}, 241 } 242 } 243 244 // return true if a metric does not have any labels or if the label 245 // values for searched metric corresponds to the one in the cached slice. 246 func isMetricMatch(lValues, metricValues []string) bool { 247 if len(lValues) == len(metricValues) { 248 for i, v := range metricValues { 249 if v != lValues[i] { 250 return false 251 } 252 } 253 } 254 255 return true 256 } 257 258 // returns {-1, false} in case metric was not found in the slice. 259 func findCounterValueIndex(metricSlice []*CounterValue, name string, labelValues []string) (int, bool) { 260 for i, m := range metricSlice { 261 if m.Name == name { 262 if isMetricMatch(labelValues, m.LabelValues) { 263 return i, true 264 } 265 } 266 } 267 268 return -1, false 269 } 270 271 // returns {-1, false} in case metric was not found in the slice. 272 func findGaugeValueIndex(metricSlice []*GaugeValue, name string, labelValues []string) (int, bool) { 273 for i, m := range metricSlice { 274 if m.Name == name { 275 if isMetricMatch(labelValues, m.LabelValues) { 276 return i, true 277 } 278 } 279 } 280 281 return -1, false 282 } 283 284 // returns {-1, false} in case metric was not found in the slice. 285 func findSummaryValueIndex(metricSlice []*SummaryValue, name string, labelValues []string) (int, bool) { 286 for i, m := range metricSlice { 287 if m.Name == name { 288 if isMetricMatch(labelValues, m.LabelValues) { 289 return i, true 290 } 291 } 292 } 293 294 return -1, false 295 } 296 297 // returns {-1, false} in case metric was not found in the slice. 298 func findHistogramValueIndex(metricSlice []*HistogramValue, name string, labelValues []string) (int, bool) { 299 for i, m := range metricSlice { 300 if m.Name == name { 301 if isMetricMatch(labelValues, m.LabelValues) { 302 return i, true 303 } 304 } 305 } 306 307 return -1, false 308 } 309 310 func (ms *metricServer) CounterInc(cv *CounterValue) { 311 labels, ok := GetCounters()[cv.Name] // known label names for the 'name' counter 312 313 err := sanityChecks(cv.Name, labels, ok, cv.LabelNames, cv.LabelValues) 314 if err != nil { 315 // The last thing we want is to panic/stop the server due to instrumentation 316 // thus log a message (should be detected during development of new metrics) 317 ms.log.Error().Err(err).Msg("Instrumentation error") 318 319 return 320 } 321 322 index, ok := findCounterValueIndex(ms.cache.Counters, cv.Name, cv.LabelValues) 323 if !ok { 324 // cv not found in cache: add it 325 cv.Count = 1 326 ms.cache.Counters = append(ms.cache.Counters, cv) 327 } else { 328 ms.cache.Counters[index].Count++ 329 } 330 } 331 332 func (ms *metricServer) GaugeSet(gv *GaugeValue) { 333 labels, ok := GetGauges()[gv.Name] // known label names for the 'name' counter 334 335 err := sanityChecks(gv.Name, labels, ok, gv.LabelNames, gv.LabelValues) 336 if err != nil { 337 ms.log.Error().Err(err).Msg("Instrumentation error") 338 339 return 340 } 341 342 index, ok := findGaugeValueIndex(ms.cache.Gauges, gv.Name, gv.LabelValues) 343 if !ok { 344 // gv not found in cache: add it 345 ms.cache.Gauges = append(ms.cache.Gauges, gv) 346 } else { 347 ms.cache.Gauges[index].Value = gv.Value 348 } 349 } 350 351 func (ms *metricServer) SummaryObserve(sv *SummaryValue) { 352 labels, ok := GetSummaries()[sv.Name] // known label names for the 'name' summary 353 354 err := sanityChecks(sv.Name, labels, ok, sv.LabelNames, sv.LabelValues) 355 if err != nil { 356 ms.log.Error().Err(err).Msg("Instrumentation error") 357 358 return 359 } 360 361 index, ok := findSummaryValueIndex(ms.cache.Summaries, sv.Name, sv.LabelValues) 362 if !ok { 363 // The SampledValue not found: add it 364 sv.Count = 1 // First value, no need to increment 365 ms.cache.Summaries = append(ms.cache.Summaries, sv) 366 } else { 367 ms.cache.Summaries[index].Count++ 368 ms.cache.Summaries[index].Sum += sv.Sum 369 } 370 } 371 372 func (ms *metricServer) HistogramObserve(hv *HistogramValue) { 373 labels, ok := GetHistograms()[hv.Name] // known label names for the 'name' counter 374 375 err := sanityChecks(hv.Name, labels, ok, hv.LabelNames, hv.LabelValues) 376 if err != nil { 377 ms.log.Error().Err(err).Msg("Instrumentation error") 378 379 return 380 } 381 382 index, ok := findHistogramValueIndex(ms.cache.Histograms, hv.Name, hv.LabelValues) 383 if !ok { 384 // The HistogramValue not found: add it 385 buckets := make(map[string]int) 386 387 for _, fvalue := range GetBuckets(hv.Name) { 388 if hv.Sum <= fvalue { 389 buckets[ms.bucketsF2S[fvalue]] = 1 390 } else { 391 buckets[ms.bucketsF2S[fvalue]] = 0 392 } 393 } 394 395 hv.Count = 1 // First value, no need to increment 396 hv.Buckets = buckets 397 ms.cache.Histograms = append(ms.cache.Histograms, hv) 398 } else { 399 cachedH := ms.cache.Histograms[index] 400 cachedH.Count++ 401 cachedH.Sum += hv.Sum 402 for _, fvalue := range GetBuckets(hv.Name) { 403 if hv.Sum <= fvalue { 404 cachedH.Buckets[ms.bucketsF2S[fvalue]]++ 405 } 406 } 407 } 408 } 409 410 //nolint:goerr113 411 func sanityChecks(name string, knownLabels []string, found bool, labelNames, labelValues []string) error { 412 if !found { 413 return fmt.Errorf("metric %s: not found", name) 414 } 415 416 if len(labelNames) != len(labelValues) || 417 len(labelNames) != len(knownLabels) { 418 return fmt.Errorf("metric %s: label size mismatch", name) 419 } 420 // The list of label names defined in init() for the counter must match what was provided in labelNames 421 for i, label := range labelNames { 422 if label != knownLabels[i] { 423 return fmt.Errorf("metric %s: label size mismatch", name) 424 } 425 } 426 427 return nil 428 } 429 430 func IncHTTPConnRequests(ms MetricServer, lvs ...string) { 431 req := CounterValue{ 432 Name: httpConnRequests, 433 LabelNames: []string{"method", "code"}, 434 LabelValues: lvs, 435 } 436 ms.SendMetric(req) 437 } 438 439 func ObserveHTTPRepoLatency(ms MetricServer, path string, latency time.Duration) { 440 var lvs []string 441 match := re.FindStringSubmatch(path) 442 443 if len(match) > 1 { 444 lvs = []string{match[1]} 445 } else { 446 lvs = []string{"N/A"} 447 } 448 449 sv := SummaryValue{ 450 Name: httpRepoLatencySeconds, 451 Sum: latency.Seconds(), 452 LabelNames: []string{"repo"}, 453 LabelValues: lvs, 454 } 455 ms.SendMetric(sv) 456 } 457 458 func ObserveHTTPMethodLatency(ms MetricServer, method string, latency time.Duration) { 459 h := HistogramValue{ 460 Name: httpMethodLatencySeconds, 461 Sum: latency.Seconds(), // convenient temporary store for Histogram latency value 462 LabelNames: []string{"method"}, 463 LabelValues: []string{method}, 464 } 465 ms.SendMetric(h) 466 } 467 468 func IncDownloadCounter(ms MetricServer, repo string) { 469 dCounter := CounterValue{ 470 Name: repoDownloads, 471 LabelNames: []string{"repo"}, 472 LabelValues: []string{repo}, 473 } 474 ms.SendMetric(dCounter) 475 } 476 477 func IncUploadCounter(ms MetricServer, repo string) { 478 uCounter := CounterValue{ 479 Name: repoUploads, 480 LabelNames: []string{"repo"}, 481 LabelValues: []string{repo}, 482 } 483 ms.SendMetric(uCounter) 484 } 485 486 func SetStorageUsage(ms MetricServer, rootDir, repo string) { 487 dir := path.Join(rootDir, repo) 488 489 repoSize, err := GetDirSize(dir) 490 if err != nil { 491 ms.(*metricServer).log.Error().Err(err).Msg("failed to set storage usage") 492 } 493 494 storage := GaugeValue{ 495 Name: repoStorageBytes, 496 Value: float64(repoSize), 497 LabelNames: []string{"repo"}, 498 LabelValues: []string{repo}, 499 } 500 ms.ForceSendMetric(storage) 501 } 502 503 func SetServerInfo(ms MetricServer, lvs ...string) { 504 info := GaugeValue{ 505 Name: serverInfo, 506 Value: 0, 507 LabelNames: []string{"commit", "binaryType", "goVersion", "version"}, 508 LabelValues: lvs, 509 } 510 // This metric is set once at zot startup (set it regardless of metrics enabled) 511 ms.ForceSendMetric(info) 512 } 513 514 func ObserveStorageLockLatency(ms MetricServer, latency time.Duration, storageName, lockType string) { 515 h := HistogramValue{ 516 Name: storageLockLatencySeconds, 517 Sum: latency.Seconds(), // convenient temporary store for Histogram latency value 518 LabelNames: []string{"storageName", "lockType"}, 519 LabelValues: []string{storageName, lockType}, 520 } 521 ms.SendMetric(h) 522 } 523 524 func GetMaxIdleScrapeInterval() time.Duration { 525 return metricsScrapeTimeout + metricsScrapeCheckInterval 526 } 527 528 func GetBuckets(metricName string) []float64 { 529 switch metricName { 530 case storageLockLatencySeconds: 531 return GetStorageLatencyBuckets() 532 default: 533 return GetDefaultBuckets() 534 } 535 }