github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/metrics/herocache.go

package metrics

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/network"
)

const subsystemHeroCache = "hero_cache"

var _ module.HeroCacheMetrics = (*HeroCacheCollector)(nil)

type HeroCacheCollector struct {
	histogramNormalizedBucketSlotAvailable prometheus.Histogram

	countKeyGetSuccess prometheus.Counter
	countKeyGetFailure prometheus.Counter

	countKeyPutSuccess      prometheus.Counter
	countKeyPutDrop         prometheus.Counter
	countKeyPutDeduplicated prometheus.Counter
	countKeyPutAttempt      prometheus.Counter
	countKeyRemoved         prometheus.Counter

	size prometheus.Gauge

	countKeyEjectionDueToFullCapacity prometheus.Counter
	countKeyEjectionDueToEmergency    prometheus.Counter
}

type HeroCacheMetricsRegistrationFunc func(uint64) module.HeroCacheMetrics

// HeroCacheMetricsFactory is a factory method to create a new HeroCacheCollector for a specific cache
// with a specific namespace and a specific name.
// Args:
// - namespace: the namespace of the cache
// - cacheName: the name of the cache
type HeroCacheMetricsFactory func(namespace string, cacheName string) module.HeroCacheMetrics

// NewHeroCacheMetricsFactory creates a new HeroCacheMetricsFactory for the given registrar. It allows deferring the
// registration of the metrics to the point where the cache is created, without exposing the registrar to the cache.
// Args:
// - registrar: the prometheus registrar to register the metrics with
// Returns:
// - a HeroCacheMetricsFactory that can be used to create a new HeroCacheCollector for a specific cache
func NewHeroCacheMetricsFactory(registrar prometheus.Registerer) HeroCacheMetricsFactory {
	return func(namespace string, cacheName string) module.HeroCacheMetrics {
		return NewHeroCacheCollector(namespace, cacheName, registrar)
	}
}

// NewNoopHeroCacheMetricsFactory creates a new HeroCacheMetricsFactory that returns a noop collector.
// This is useful for tests that don't want to register metrics.
// Args:
// - none
// Returns:
// - a HeroCacheMetricsFactory that returns a noop collector
func NewNoopHeroCacheMetricsFactory() HeroCacheMetricsFactory {
	return func(string, string) module.HeroCacheMetrics {
		return NewNoopCollector()
	}
}

func NetworkReceiveCacheMetricsFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingReceiveCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func NewSubscriptionRecordCacheMetricsFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingSubscriptionRecordsCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}
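
// A minimal usage sketch of the factory pattern above: a node builder creates one HeroCacheMetricsFactory
// against its Prometheus registerer and passes it to the cache constructors, while tests can substitute the
// noop factory. The use of prometheus.DefaultRegisterer here is only illustrative; any prometheus.Registerer works.
//
//	factory := NewHeroCacheMetricsFactory(prometheus.DefaultRegisterer)
//	collector := NetworkReceiveCacheMetricsFactory(factory, network.PublicNetwork)
//
//	// in tests, nothing is registered:
//	noop := NetworkReceiveCacheMetricsFactory(NewNoopHeroCacheMetricsFactory(), network.PublicNetwork)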

// NewGossipSubApplicationSpecificScoreCacheMetrics is the factory method for creating a new HeroCacheCollector for the
// application specific score cache of the GossipSub peer scoring module. The application specific score cache is used
// to keep track of the application specific score of peers in GossipSub.
// Args:
// - f: the HeroCacheMetricsFactory to create the collector
// - networkingType: the networking type of the cache, i.e., whether it is used for the public or the private network
// Returns:
// - a HeroCacheMetrics for the application specific score cache
func NewGossipSubApplicationSpecificScoreCacheMetrics(f HeroCacheMetricsFactory, networkingType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingGossipSubApplicationSpecificScoreCache
	if networkingType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

// DisallowListCacheMetricsFactory is the factory method for creating a new HeroCacheCollector for the disallow list cache.
// The disallow-list cache is used to keep track of peers that are disallow-listed and the reasons for it.
// Args:
// - f: the HeroCacheMetricsFactory to create the collector
// - networkingType: the networking type of the cache, i.e., whether it is used for the public or the private network
// Returns:
// - a HeroCacheMetrics for the disallow list cache
func DisallowListCacheMetricsFactory(f HeroCacheMetricsFactory, networkingType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingDisallowListCache
	if networkingType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

// GossipSubSpamRecordCacheMetricsFactory is the factory method for creating a new HeroCacheCollector for the spam record cache.
// The spam record cache is used to keep track of peers that are spamming the network and the reasons for it.
func GossipSubSpamRecordCacheMetricsFactory(f HeroCacheMetricsFactory, networkingType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingGossipSubSpamRecordCache
	if networkingType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func NetworkDnsTxtCacheMetricsFactory(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceNetwork, ResourceNetworkingDnsTxtCache, registrar)
}

func NetworkDnsIpCacheMetricsFactory(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceNetwork, ResourceNetworkingDnsIpCache, registrar)
}

func ChunkDataPackRequestQueueMetricsFactory(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceExecution, ResourceChunkDataPackRequests, registrar)
}

func ReceiptRequestsQueueMetricFactory(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceExecution, ResourceReceipt, registrar)
}

func CollectionRequestsQueueMetricFactory(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceCollection, ResourceCollection, registrar)
}

func DisallowListNotificationQueueMetricFactory(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceNetwork, ResourceNetworkingDisallowListNotificationQueue, registrar)
}

func ApplicationLayerSpamRecordCacheMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingApplicationLayerSpamRecordCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}

	return f(namespaceNetwork, r)
}
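
// All of the cache-metrics factories in this file follow the same pattern: pick the resource name of the
// cache, prepend the "public" prefix when the cache serves the public network, and delegate to the injected
// HeroCacheMetricsFactory under the network namespace. A brief sketch of the effect (assuming the network
// package's PrivateNetwork constant as the counterpart of PublicNetwork):
//
//	f := NewHeroCacheMetricsFactory(prometheus.DefaultRegisterer)
//	_ = DisallowListCacheMetricsFactory(f, network.PrivateNetwork) // metrics named after ResourceNetworkingDisallowListCache
//	_ = DisallowListCacheMetricsFactory(f, network.PublicNetwork)  // cache name becomes "public_" + ResourceNetworkingDisallowListCache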

func DialConfigCacheMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingUnicastDialConfigCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func ApplicationLayerSpamRecordQueueMetricsFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingApplicationLayerSpamReportQueue
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func GossipSubRPCInspectorQueueMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingRpcValidationInspectorQueue
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func GossipSubDuplicateMessageTrackerCacheMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingGossipsubDuplicateMessagesTrackerCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func GossipSubRPCSentTrackerMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingRPCSentTrackerCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func GossipSubRPCSentTrackerQueueMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingRPCSentTrackerQueue
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func RpcInspectorNotificationQueueMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingRpcInspectorNotificationQueue
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func GossipSubRPCInspectorClusterPrefixedCacheMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingRpcClusterPrefixReceivedCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

// GossipSubAppSpecificScoreUpdateQueueMetricFactory is the factory method for creating a new HeroCacheCollector for the
// app-specific score update queue of the GossipSub peer scoring module. The app-specific score update queue is used to
// queue the update requests for the app-specific score of peers. The update requests are queued in a worker pool and
// processed asynchronously.
// Args:
// - f: the HeroCacheMetricsFactory to create the collector
// - networkingType: the networking type of the cache, i.e., whether it is used for the public or the private network
// Returns:
// - a HeroCacheMetrics for the app-specific score update queue.
func GossipSubAppSpecificScoreUpdateQueueMetricFactory(f HeroCacheMetricsFactory, networkingType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingAppSpecificScoreUpdateQueue
	if networkingType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func CollectionNodeTransactionsCacheMetrics(registrar prometheus.Registerer, epoch uint64) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceCollection, fmt.Sprintf("%s_%d", ResourceTransaction, epoch), registrar)
}

func FollowerCacheMetrics(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceFollowerEngine, ResourceFollowerPendingBlocksCache, registrar)
}

func AccessNodeExecutionDataCacheMetrics(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceAccess, ResourceExecutionDataCache, registrar)
}

// PrependPublicPrefix prepends the string "public" to the given string.
// This is used to distinguish between public and private metrics.
// Args:
// - str: the string to prepend to, example: "my_metric"
// Returns:
// - the prefixed string, example: "public_my_metric"
func PrependPublicPrefix(str string) string {
	return fmt.Sprintf("%s_%s", "public", str)
}
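
// For orientation, the final Prometheus metric names combine the namespace, the hero_cache subsystem, the
// (possibly prefixed) cache name, and the per-metric suffix defined in NewHeroCacheCollector below. A sketch,
// with "my_metric" as in the example above and a hypothetical namespace value of "network":
//
//	PrependPublicPrefix("my_metric") // -> "public_my_metric"
//	// resulting gauge name: network_hero_cache_public_my_metric_items_total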

func NewHeroCacheCollector(nameSpace string, cacheName string, registrar prometheus.Registerer) *HeroCacheCollector {

	histogramNormalizedBucketSlotAvailable := prometheus.NewHistogram(prometheus.HistogramOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,

		// Note that the notion of "bucket" in HeroCache differs from Prometheus.
		// A HeroCache "bucket" is used to group the keys of the entities.
		// A Prometheus "bucket" is used to group collected data points within a range.
		// This metric represents the histogram of normalized available slots in buckets, where
		// a data point of 1 represents a bucket with all slots available (i.e., a fully empty bucket),
		// and a data point of 0 represents a bucket with no available slots (i.e., a completely full bucket).
		//
		// We generally set the total number of slots per HeroCache bucket to 16. Hence:
		// Prometheus bucket 1 counts the HeroCache buckets with at most 16 available slots.
		// Prometheus bucket 0.75 counts the HeroCache buckets with at most 12 available slots.
		// Prometheus bucket 0.5 counts the HeroCache buckets with at most 8 available slots.
		// Prometheus bucket 0.25 counts the HeroCache buckets with at most 4 available slots.
		// Prometheus bucket 0.1 counts the HeroCache buckets with at most 1 available slot.
		// Prometheus bucket 0 counts the HeroCache buckets with no (i.e., zero) available slots.
		Buckets: []float64{0, 0.1, 0.25, 0.5, 0.75, 1},
		Name:    cacheName + "_" + "normalized_bucket_available_slot_count",
		Help:    "normalized histogram of available slots across all buckets",
	})

	size := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "items_total",
		Help:      "total number of items in the cache",
	})

	countKeyGetSuccess := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "successful_read_count_total",
		Help:      "total number of successful read queries",
	})

	countKeyGetFailure := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "unsuccessful_read_count_total",
		Help:      "total number of unsuccessful read queries",
	})

	countKeyPutAttempt := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "write_attempt_count_total",
		Help:      "total number of put queries",
	})

	countKeyPutDrop := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "write_drop_count_total",
		Help:      "total number of put queries dropped due to full capacity",
	})

	countKeyPutSuccess := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "successful_write_count_total",
		Help:      "total number of successful write queries",
	})

	countKeyPutDeduplicated := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "unsuccessful_write_count_total",
		Help:      "total number of queries writing an already existing (duplicate) entity to the cache",
	})

	countKeyRemoved := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "removed_count_total",
		Help:      "total number of entities removed from the cache",
	})

	countKeyEjectionDueToFullCapacity := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "full_capacity_entity_ejection_total",
		Help:      "total number of entities ejected when writing new entities at full capacity",
	})

	countKeyEjectionDueToEmergency := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "emergency_key_ejection_total",
		Help:      "total number of emergency key ejections at bucket level",
	})

	registrar.MustRegister(
		// available slot distribution
		histogramNormalizedBucketSlotAvailable,

		// size
		size,

		// read
		countKeyGetSuccess,
		countKeyGetFailure,

		// write
		countKeyPutSuccess,
		countKeyPutDeduplicated,
		countKeyPutDrop,
		countKeyPutAttempt,

		// remove
		countKeyRemoved,

		// ejection
		countKeyEjectionDueToFullCapacity,
		countKeyEjectionDueToEmergency)

	return &HeroCacheCollector{
		histogramNormalizedBucketSlotAvailable: histogramNormalizedBucketSlotAvailable,
		size:                                   size,
		countKeyGetSuccess:                     countKeyGetSuccess,
		countKeyGetFailure:                     countKeyGetFailure,

		countKeyPutAttempt:      countKeyPutAttempt,
		countKeyPutSuccess:      countKeyPutSuccess,
		countKeyPutDeduplicated: countKeyPutDeduplicated,
		countKeyPutDrop:         countKeyPutDrop,

		countKeyRemoved: countKeyRemoved,

		countKeyEjectionDueToFullCapacity: countKeyEjectionDueToFullCapacity,
		countKeyEjectionDueToEmergency:    countKeyEjectionDueToEmergency,
	}
}

// BucketAvailableSlots keeps track of the number of available slots in the buckets of the cache.
func (h *HeroCacheCollector) BucketAvailableSlots(availableSlots uint64, totalSlots uint64) {
	normalizedAvailableSlots := float64(availableSlots) / float64(totalSlots)
	h.histogramNormalizedBucketSlotAvailable.Observe(normalizedAvailableSlots)
}
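
// A worked example of the normalization above: with the usual 16 slots per HeroCache bucket, reporting
// 4 available slots observes 4/16 = 0.25, which falls into the 0.25 Prometheus bucket; a completely full
// HeroCache bucket observes 0 and falls into the 0 bucket.
//
//	h.BucketAvailableSlots(4, 16) // Observe(0.25)
//	h.BucketAvailableSlots(0, 16) // Observe(0.0)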

// OnKeyPutSuccess is called whenever a new (key, entity) pair is successfully added to the cache.
// The size parameter is the current size of the cache after the insertion.
func (h *HeroCacheCollector) OnKeyPutSuccess(size uint32) {
	h.countKeyPutSuccess.Inc()
	h.size.Set(float64(size))
}

// OnKeyPutDeduplicated tracks the total number of unsuccessful writes caused by adding a duplicate key to the cache.
// A duplicate key is dropped by the cache when it is written to the cache.
// Note: in the context of HeroCache, the key corresponds to the identifier of its entity. Hence, a duplicate key corresponds to
// a duplicate entity.
func (h *HeroCacheCollector) OnKeyPutDeduplicated() {
	h.countKeyPutDeduplicated.Inc()
}

// OnKeyGetSuccess tracks the total number of successful read queries.
// A read query is successful if the entity corresponding to its key is available in the cache.
// Note: in the context of HeroCache, the key corresponds to the identifier of its entity.
func (h *HeroCacheCollector) OnKeyGetSuccess() {
	h.countKeyGetSuccess.Inc()
}

// OnKeyGetFailure tracks the total number of unsuccessful read queries.
// A read query is unsuccessful if the entity corresponding to its key is not available in the cache.
// Note: in the context of HeroCache, the key corresponds to the identifier of its entity.
func (h *HeroCacheCollector) OnKeyGetFailure() {
	h.countKeyGetFailure.Inc()
}

// OnKeyPutAttempt is called whenever a new (key, value) pair is attempted to be put in the cache.
// It does not reflect whether the put was successful or not.
// A (key, value) pair put attempt may fail if the cache is full or the key already exists.
// The size parameter is the current size of the cache prior to the put attempt.
func (h *HeroCacheCollector) OnKeyPutAttempt(size uint32) {
	h.countKeyPutAttempt.Inc()
	h.size.Set(float64(size))
}

// OnKeyPutDrop is called whenever a new (key, entity) pair is dropped from the cache because the cache is full.
func (h *HeroCacheCollector) OnKeyPutDrop() {
	h.countKeyPutDrop.Inc()
}

// OnKeyRemoved is called whenever a (key, entity) pair is removed from the cache.
// The size parameter is the current size of the cache.
func (h *HeroCacheCollector) OnKeyRemoved(size uint32) {
	h.countKeyRemoved.Inc()
	h.size.Set(float64(size))
}

// OnEntityEjectionDueToFullCapacity is called whenever adding a new (key, entity) pair to the cache results in the ejection of another (key', entity') pair.
// This normally happens -- and is expected -- when the cache is full.
// Note: in the context of HeroCache, the key corresponds to the identifier of its entity.
func (h *HeroCacheCollector) OnEntityEjectionDueToFullCapacity() {
	h.countKeyEjectionDueToFullCapacity.Inc()
}

// OnEntityEjectionDueToEmergency is called whenever a bucket is found full and all of its keys are valid, i.e.,
// each key belongs to an existing (key, entity) pair.
// Hence, adding a new key to that bucket will replace the oldest valid key inside that bucket.
// Note: in the context of HeroCache, the key corresponds to the identifier of its entity.
func (h *HeroCacheCollector) OnEntityEjectionDueToEmergency() {
	h.countKeyEjectionDueToEmergency.Inc()
}
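
// A sketch of how a cache is expected to drive this collector for a single write, pieced together from the
// method documentation above (the exact call order inside HeroCache is an assumption here):
//
//	m.OnKeyPutAttempt(sizeBefore)         // every write attempt is counted first
//	// then, depending on the outcome, one of:
//	m.OnKeyPutSuccess(sizeAfter)          // the entity was stored
//	m.OnKeyPutDeduplicated()              // the key (i.e., entity identifier) already existed
//	m.OnKeyPutDrop()                      // the new entity was dropped because the cache is full
//	// and, when storing the new entity ejects another one:
//	m.OnEntityEjectionDueToFullCapacity() // or OnEntityEjectionDueToEmergency() at the bucket level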