github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/metrics/herocache.go

package metrics

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/network"
)

const subsystemHeroCache = "hero_cache"

var _ module.HeroCacheMetrics = (*HeroCacheCollector)(nil)

type HeroCacheCollector struct {
	histogramNormalizedBucketSlotAvailable prometheus.Histogram

	countKeyGetSuccess prometheus.Counter
	countKeyGetFailure prometheus.Counter

	countKeyPutSuccess      prometheus.Counter
	countKeyPutDrop         prometheus.Counter
	countKeyPutDeduplicated prometheus.Counter
	countKeyPutAttempt      prometheus.Counter
	countKeyRemoved         prometheus.Counter

	size prometheus.Gauge

	countKeyEjectionDueToFullCapacity prometheus.Counter
	countKeyEjectionDueToEmergency    prometheus.Counter
}

// HeroCacheMetricsRegistrationFunc is a constructor for a HeroCacheMetrics instance, parameterized by a uint64
// (for example, an epoch counter; compare CollectionNodeTransactionsCacheMetrics below).
type HeroCacheMetricsRegistrationFunc func(uint64) module.HeroCacheMetrics

// HeroCacheMetricsFactory is a factory function type that creates a new HeroCacheCollector for a specific cache
// with a specific namespace and a specific name.
// Args:
// - namespace: the namespace of the cache
// - cacheName: the name of the cache
type HeroCacheMetricsFactory func(namespace string, cacheName string) module.HeroCacheMetrics

// NewHeroCacheMetricsFactory creates a new HeroCacheMetricsFactory for the given registrar. It allows deferring the
// registration of the metrics to the point where the cache is created, without exposing the registrar to the cache.
// Args:
// - registrar: the prometheus registrar to register the metrics with
// Returns:
// - a HeroCacheMetricsFactory that can be used to create a new HeroCacheCollector for a specific cache
func NewHeroCacheMetricsFactory(registrar prometheus.Registerer) HeroCacheMetricsFactory {
	return func(namespace string, cacheName string) module.HeroCacheMetrics {
		return NewHeroCacheCollector(namespace, cacheName, registrar)
	}
}
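
// A minimal usage sketch (illustrative only; the cache name is hypothetical):
//
//	factory := NewHeroCacheMetricsFactory(prometheus.DefaultRegisterer)
//	// ... later, at cache-construction time, without access to the registrar:
//	collector := factory(namespaceNetwork, "example_cache")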

// NewNoopHeroCacheMetricsFactory creates a new HeroCacheMetricsFactory that returns a noop collector.
// This is useful for tests that don't want to register metrics.
// Args:
// - none
// Returns:
// - a HeroCacheMetricsFactory that returns a noop collector
func NewNoopHeroCacheMetricsFactory() HeroCacheMetricsFactory {
	return func(string, string) module.HeroCacheMetrics {
		return NewNoopCollector()
	}
}
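
// For example (illustrative only; the arguments are hypothetical), a test can construct caches without touching a
// Prometheus registry:
//
//	f := NewNoopHeroCacheMetricsFactory()
//	m := f("test", "example_cache") // returns a no-op collector; nothing is registered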

func NetworkReceiveCacheMetricsFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingReceiveCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func NewSubscriptionRecordCacheMetricsFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingSubscriptionRecordsCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

// NewGossipSubApplicationSpecificScoreCacheMetrics is the factory method for creating a new HeroCacheCollector for the
// application specific score cache of the GossipSub peer scoring module. The application specific score cache is used
// to keep track of the application specific score of peers in GossipSub.
// Args:
// - f: the HeroCacheMetricsFactory to create the collector
// - networkingType: the networking type of the cache, i.e., whether it is used for the public or the private network
// Returns:
// - a HeroCacheMetrics for the application specific score cache
func NewGossipSubApplicationSpecificScoreCacheMetrics(f HeroCacheMetricsFactory, networkingType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingGossipSubApplicationSpecificScoreCache
	if networkingType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

// DisallowListCacheMetricsFactory is the factory method for creating a new HeroCacheCollector for the disallow list cache.
// The disallow-list cache is used to keep track of peers that are disallow-listed and the reasons for it.
// Args:
// - f: the HeroCacheMetricsFactory to create the collector
// - networkingType: the networking type of the cache, i.e., whether it is used for the public or the private network
// Returns:
// - a HeroCacheMetrics for the disallow list cache
func DisallowListCacheMetricsFactory(f HeroCacheMetricsFactory, networkingType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingDisallowListCache
	if networkingType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

// GossipSubSpamRecordCacheMetricsFactory is the factory method for creating a new HeroCacheCollector for the spam record cache.
// The spam record cache is used to keep track of peers that are spamming the network and the reasons for it.
func GossipSubSpamRecordCacheMetricsFactory(f HeroCacheMetricsFactory, networkingType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingGossipSubSpamRecordCache
	if networkingType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func NetworkDnsTxtCacheMetricsFactory(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceNetwork, ResourceNetworkingDnsTxtCache, registrar)
}

func NetworkDnsIpCacheMetricsFactory(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceNetwork, ResourceNetworkingDnsIpCache, registrar)
}

func ChunkDataPackRequestQueueMetricsFactory(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceExecution, ResourceChunkDataPackRequests, registrar)
}

func ReceiptRequestsQueueMetricFactory(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceExecution, ResourceReceipt, registrar)
}

func CollectionRequestsQueueMetricFactory(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceCollection, ResourceCollection, registrar)
}

func DisallowListNotificationQueueMetricFactory(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceNetwork, ResourceNetworkingDisallowListNotificationQueue, registrar)
}

func ApplicationLayerSpamRecordCacheMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingApplicationLayerSpamRecordCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}

	return f(namespaceNetwork, r)
}

func DialConfigCacheMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingUnicastDialConfigCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func ApplicationLayerSpamRecordQueueMetricsFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingApplicationLayerSpamReportQueue
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func GossipSubRPCInspectorQueueMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingRpcValidationInspectorQueue
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func GossipSubDuplicateMessageTrackerCacheMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingGossipsubDuplicateMessagesTrackerCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func GossipSubRPCSentTrackerMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingRPCSentTrackerCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func GossipSubRPCSentTrackerQueueMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingRPCSentTrackerQueue
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func RpcInspectorNotificationQueueMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingRpcInspectorNotificationQueue
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

func GossipSubRPCInspectorClusterPrefixedCacheMetricFactory(f HeroCacheMetricsFactory, networkType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingRpcClusterPrefixReceivedCache
	if networkType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

// GossipSubAppSpecificScoreUpdateQueueMetricFactory is the factory method for creating a new HeroCacheCollector for the
// app-specific score update queue of the GossipSub peer scoring module. The app-specific score update queue is used to
// queue the update requests for the app-specific score of peers. The update requests are queued in a worker pool and
// processed asynchronously.
// Args:
// - f: the HeroCacheMetricsFactory to create the collector
// - networkingType: the networking type of the cache, i.e., whether it is used for the public or the private network
// Returns:
// - a HeroCacheMetrics for the app-specific score update queue.
func GossipSubAppSpecificScoreUpdateQueueMetricFactory(f HeroCacheMetricsFactory, networkingType network.NetworkingType) module.HeroCacheMetrics {
	r := ResourceNetworkingAppSpecificScoreUpdateQueue
	if networkingType == network.PublicNetwork {
		r = PrependPublicPrefix(r)
	}
	return f(namespaceNetwork, r)
}

// CollectionNodeTransactionsCacheMetrics returns a HeroCacheCollector for the collection node's transactions cache.
// The epoch counter is appended to the cache name, so each epoch gets its own set of metrics.
func CollectionNodeTransactionsCacheMetrics(registrar prometheus.Registerer, epoch uint64) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceCollection, fmt.Sprintf("%s_%d", ResourceTransaction, epoch), registrar)
}

func FollowerCacheMetrics(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceFollowerEngine, ResourceFollowerPendingBlocksCache, registrar)
}

func AccessNodeExecutionDataCacheMetrics(registrar prometheus.Registerer) *HeroCacheCollector {
	return NewHeroCacheCollector(namespaceAccess, ResourceExecutionDataCache, registrar)
}

// PrependPublicPrefix prepends the prefix "public_" to the given string.
// This is used to distinguish between public and private metrics.
// Args:
// - str: the string to prepend, example: "my_metric"
// Returns:
// - the prepended string, example: "public_my_metric"
func PrependPublicPrefix(str string) string {
	return fmt.Sprintf("%s_%s", "public", str)
}

func NewHeroCacheCollector(nameSpace string, cacheName string, registrar prometheus.Registerer) *HeroCacheCollector {

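	// Each metric below is registered under the fully-qualified name <nameSpace>_hero_cache_<cacheName>_<suffix>,
	// following Prometheus' namespace/subsystem/name composition. For example (the cache name is hypothetical),
	// with nameSpace "network" and cacheName "example_cache", the items_total gauge below is exposed as
	// network_hero_cache_example_cache_items_total.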
	histogramNormalizedBucketSlotAvailable := prometheus.NewHistogram(prometheus.HistogramOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,

		// Note that the notion of "bucket" in HeroCache differs from Prometheus.
		// A HeroCache "bucket" is used to group the keys of the entities.
		// A Prometheus "bucket" is used to group collected data points within a range.
		// This metric represents the histogram of normalized available slots in buckets, where
		// a data point of 1 represents a bucket with all slots available (i.e., a fully empty bucket),
		// and a data point of 0 represents a bucket with no available slots (i.e., a completely full bucket).
		//
		// We generally set the total number of slots per bucket in HeroCache to 16. Hence:
		// Prometheus bucket 1 represents the total number of HeroCache buckets with at most 16 available slots.
		// Prometheus bucket 0.75 represents the total number of HeroCache buckets with at most 12 available slots.
		// Prometheus bucket 0.5 represents the total number of HeroCache buckets with at most 8 available slots.
		// Prometheus bucket 0.25 represents the total number of HeroCache buckets with at most 4 available slots.
		// Prometheus bucket 0.1 represents the total number of HeroCache buckets with at most 1 available slot.
		// Prometheus bucket 0 represents the total number of HeroCache buckets with no (i.e., zero) available slots.
		Buckets: []float64{0, 0.1, 0.25, 0.5, 0.75, 1},
		Name:    cacheName + "_" + "normalized_bucket_available_slot_count",
		Help:    "normalized histogram of available slots across all buckets",
	})

	size := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "items_total",
		Help:      "total number of items in the cache",
	})

	countKeyGetSuccess := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "successful_read_count_total",
		Help:      "total number of successful read queries",
	})

	countKeyGetFailure := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "unsuccessful_read_count_total",
		Help:      "total number of unsuccessful read queries",
	})

	countKeyPutAttempt := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "write_attempt_count_total",
		Help:      "total number of put queries",
	})

	countKeyPutDrop := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "write_drop_count_total",
		Help:      "total number of put queries dropped due to full capacity",
	})

	countKeyPutSuccess := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "successful_write_count_total",
		Help:      "total number of successful write queries",
	})

	countKeyPutDeduplicated := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "unsuccessful_write_count_total",
		Help:      "total number of queries writing an already existing (duplicate) entity to the cache",
	})

	countKeyRemoved := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "removed_count_total",
		Help:      "total number of entities removed from the cache",
	})

	countKeyEjectionDueToFullCapacity := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "full_capacity_entity_ejection_total",
		Help:      "total number of entities ejected when writing new entities at full capacity",
	})

	countKeyEjectionDueToEmergency := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: nameSpace,
		Subsystem: subsystemHeroCache,
		Name:      cacheName + "_" + "emergency_key_ejection_total",
		Help:      "total number of emergency key ejections at bucket level",
	})

	registrar.MustRegister(
		// available slot distribution
		histogramNormalizedBucketSlotAvailable,

		// size
		size,

		// read
		countKeyGetSuccess,
		countKeyGetFailure,

		// write
		countKeyPutSuccess,
		countKeyPutDeduplicated,
		countKeyPutDrop,
		countKeyPutAttempt,

		// remove
		countKeyRemoved,

		// ejection
		countKeyEjectionDueToFullCapacity,
		countKeyEjectionDueToEmergency)

	return &HeroCacheCollector{
		histogramNormalizedBucketSlotAvailable: histogramNormalizedBucketSlotAvailable,
		size:                                   size,
		countKeyGetSuccess:                     countKeyGetSuccess,
		countKeyGetFailure:                     countKeyGetFailure,

		countKeyPutAttempt:      countKeyPutAttempt,
		countKeyPutSuccess:      countKeyPutSuccess,
		countKeyPutDeduplicated: countKeyPutDeduplicated,
		countKeyPutDrop:         countKeyPutDrop,

		countKeyRemoved: countKeyRemoved,

		countKeyEjectionDueToFullCapacity: countKeyEjectionDueToFullCapacity,
		countKeyEjectionDueToEmergency:    countKeyEjectionDueToEmergency,
	}
}
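
// A minimal end-to-end sketch (illustrative only; the registry and cache name are hypothetical):
//
//	reg := prometheus.NewRegistry()
//	collector := NewHeroCacheCollector(namespaceNetwork, "example_cache", reg)
//	collector.OnKeyPutAttempt(0) // a put attempt on an empty cache
//	collector.OnKeyPutSuccess(1) // the items_total gauge is now 1
//	// reg now exposes, e.g., network_hero_cache_example_cache_items_total = 1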

// BucketAvailableSlots keeps track of the number of available slots in the buckets of the cache.
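// As a worked example (assuming the typical bucket size of 16 slots noted in NewHeroCacheCollector): a bucket with
// 4 of its 16 slots free is observed as 4.0/16.0 = 0.25, which falls into the 0.25 Prometheus bucket of the histogram.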
func (h *HeroCacheCollector) BucketAvailableSlots(availableSlots uint64, totalSlots uint64) {
	normalizedAvailableSlots := float64(availableSlots) / float64(totalSlots)
	h.histogramNormalizedBucketSlotAvailable.Observe(normalizedAvailableSlots)
}

// OnKeyPutSuccess is called whenever a new (key, entity) pair is successfully added to the cache.
// The size parameter is the current size of the cache after the insertion.
func (h *HeroCacheCollector) OnKeyPutSuccess(size uint32) {
	h.countKeyPutSuccess.Inc()
	h.size.Set(float64(size))
}

// OnKeyPutDeduplicated tracks the total number of unsuccessful writes caused by adding a duplicate key to the cache.
// A duplicate key is dropped by the cache when it is written to the cache.
// Note: in the context of HeroCache, the key corresponds to the identifier of its entity. Hence, a duplicate key corresponds to
// a duplicate entity.
func (h *HeroCacheCollector) OnKeyPutDeduplicated() {
	h.countKeyPutDeduplicated.Inc()
}

// OnKeyGetSuccess tracks the total number of successful read queries.
// A read query is successful if the entity corresponding to its key is available in the cache.
// Note: in the context of HeroCache, the key corresponds to the identifier of its entity.
func (h *HeroCacheCollector) OnKeyGetSuccess() {
	h.countKeyGetSuccess.Inc()
}

// OnKeyGetFailure tracks the total number of unsuccessful read queries.
// A read query is unsuccessful if the entity corresponding to its key is not available in the cache.
// Note: in the context of HeroCache, the key corresponds to the identifier of its entity.
func (h *HeroCacheCollector) OnKeyGetFailure() {
	h.countKeyGetFailure.Inc()
}

// OnKeyPutAttempt is called whenever an attempt is made to put a new (key, value) pair into the cache.
// It does not reflect whether the put was successful or not.
// A (key, value) pair put attempt may fail if the cache is full, or the key already exists.
// The size parameter is the current size of the cache prior to the put attempt.
func (h *HeroCacheCollector) OnKeyPutAttempt(size uint32) {
	h.countKeyPutAttempt.Inc()
	h.size.Set(float64(size))
}

// OnKeyPutDrop is called whenever a new (key, entity) pair is dropped from the cache because the cache is full.
func (h *HeroCacheCollector) OnKeyPutDrop() {
	h.countKeyPutDrop.Inc()
}

// OnKeyRemoved is called whenever a (key, entity) pair is removed from the cache.
// The size parameter is the current size of the cache.
func (h *HeroCacheCollector) OnKeyRemoved(size uint32) {
	h.countKeyRemoved.Inc()
	h.size.Set(float64(size))
}

// OnEntityEjectionDueToFullCapacity is called whenever adding a new (key, entity) pair to the cache results in ejection of another (key', entity') pair.
// This normally happens -- and is expected -- when the cache is full.
// Note: in the context of HeroCache, the key corresponds to the identifier of its entity.
func (h *HeroCacheCollector) OnEntityEjectionDueToFullCapacity() {
	h.countKeyEjectionDueToFullCapacity.Inc()
}

// OnEntityEjectionDueToEmergency is called whenever a bucket is found full and all of its keys are valid, i.e.,
// each key belongs to an existing (key, entity) pair.
// Hence, adding a new key to that bucket will replace the oldest valid key inside that bucket.
// Note: in the context of HeroCache, the key corresponds to the identifier of its entity.
func (h *HeroCacheCollector) OnEntityEjectionDueToEmergency() {
	h.countKeyEjectionDueToEmergency.Inc()
}