github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/metrics.go (about) 1 package module 2 3 import ( 4 "context" 5 "time" 6 7 "github.com/libp2p/go-libp2p/core/peer" 8 rcmgr "github.com/libp2p/go-libp2p/p2p/host/resource-manager" 9 httpmetrics "github.com/slok/go-http-metrics/metrics" 10 11 "github.com/onflow/flow-go/model/chainsync" 12 "github.com/onflow/flow-go/model/cluster" 13 "github.com/onflow/flow-go/model/flow" 14 "github.com/onflow/flow-go/network/channels" 15 p2pmsg "github.com/onflow/flow-go/network/p2p/message" 16 ) 17 18 type EntriesFunc func() uint 19 20 // ResolverMetrics encapsulates the metrics collectors for dns resolver module of the networking layer. 21 type ResolverMetrics interface { 22 // DNSLookupDuration tracks the time spent to resolve a DNS address. 23 DNSLookupDuration(duration time.Duration) 24 25 // OnDNSCacheMiss tracks the total number of dns requests resolved through looking up the network. 26 OnDNSCacheMiss() 27 28 // OnDNSCacheHit tracks the total number of dns requests resolved through the cache without 29 // looking up the network. 30 OnDNSCacheHit() 31 32 // OnDNSCacheInvalidated is called whenever dns cache is invalidated for an entry 33 OnDNSCacheInvalidated() 34 35 // OnDNSLookupRequestDropped tracks the number of dns lookup requests that are dropped due to a full queue 36 OnDNSLookupRequestDropped() 37 } 38 39 // NetworkSecurityMetrics metrics related to network protection. 40 type NetworkSecurityMetrics interface { 41 // OnUnauthorizedMessage tracks the number of unauthorized messages seen on the network. 42 OnUnauthorizedMessage(role, msgType, topic, offense string) 43 44 // OnRateLimitedPeer tracks the number of rate limited unicast messages seen on the network. 45 OnRateLimitedPeer(pid peer.ID, role, msgType, topic, reason string) 46 47 // OnViolationReportSkipped tracks the number of slashing violations consumer violations that were not 48 // reported for misbehavior when the identity of the sender not known. 49 OnViolationReportSkipped() 50 } 51 52 // GossipSubRpcInspectorMetrics encapsulates the metrics collectors for GossipSub RPC Inspector module of the networking layer. 53 // The RPC inspector is the entry point of the GossipSub protocol. It inspects the incoming RPC messages and decides 54 // whether to accept, prune, or reject the RPC message. 55 // The GossipSubRpcInspectorMetrics tracks the number of RPC messages received by the local node from other nodes over 56 // the GossipSub protocol. It also tracks the number of control messages included in the RPC messages, i.e., IHAVE, IWANT, 57 // GRAFT, PRUNE. It also tracks the number of actual messages included in the RPC messages. 58 // The GossipSubRpcInspectorMetrics differs from LocalGossipSubRouterMetrics in that the former tracks the messages 59 // received by the local node from other nodes over the GossipSub protocol but may not all be accepted by the local node, 60 // e.g., due to RPC pruning or throttling; while the latter tracks the local node's view of the GossipSub protocol, i.e., entirely 61 // containing the messages that are accepted by the local node (either as whole RPC or only for the control messages). 62 // Having this distinction is useful for debugging and troubleshooting the GossipSub protocol, for example, the number of 63 // messages received by the local node from other nodes over the GossipSub protocol may be much higher than the number 64 // of messages accepted by the local node, which may indicate that the local node is throttling the incoming messages. 65 type GossipSubRpcInspectorMetrics interface { 66 // OnIWantMessageIDsReceived tracks the number of message ids received by the node from other nodes on an RPC. 67 // Note: this function is called on each IWANT message received by the node, not on each message id received. 68 OnIWantMessageIDsReceived(msgIdCount int) 69 70 // OnIHaveMessageIDsReceived tracks the number of message ids received by the node from other nodes on an iHave message. 71 // This function is called on each iHave message received by the node. 72 // Args: 73 // - channel: the channel on which the iHave message was received. 74 // - msgIdCount: the number of message ids received on the iHave message. 75 OnIHaveMessageIDsReceived(channel string, msgIdCount int) 76 77 // OnIncomingRpcReceived tracks the number of RPC messages received by the node. 78 // Args: 79 // iHaveCount: the number of iHAVE messages included in the RPC. 80 // iWantCount: the number of iWANT messages included in the RPC. 81 // graftCount: the number of GRAFT messages included in the RPC. 82 // pruneCount: the number of PRUNE messages included in the RPC. 83 // msgCount: the number of publish messages included in the RPC. 84 OnIncomingRpcReceived(iHaveCount, iWantCount, graftCount, pruneCount, msgCount int) 85 } 86 87 // GossipSubScoringRegistryMetrics encapsulates the metrics collectors for collecting metrics related to the Gossipsub scoring registry. 88 // GossipSubScoringRegistryMetrics encapsulates various metrics collectors offering insights into penalties and 89 // other factors used by the scoring registry to compute the application-specific score. It focuses on tracking internal 90 // aspects of the application-specific score, distinguishing itself from GossipSubScoringMetrics. 91 type GossipSubScoringRegistryMetrics interface { 92 // DuplicateMessagePenalties tracks the duplicate message penalty for a node. 93 DuplicateMessagePenalties(penalty float64) 94 // DuplicateMessagesCounts tracks the duplicate message count for a node. 95 DuplicateMessagesCounts(count float64) 96 } 97 98 // LocalGossipSubRouterMetrics encapsulates the metrics collectors for GossipSub router of the local node. 99 // It gives a lens into the local GossipSub node's view of the GossipSub protocol. 100 // LocalGossipSubRouterMetrics differs from GossipSubRpcInspectorMetrics in that the former tracks the local node's view 101 // of the GossipSub protocol, while the latter tracks the messages received by the local node from other nodes over the 102 // GossipSub protocol but may not all be accepted by the local node, e.g., due to RPC pruning or throttling. 103 // Having this distinction is useful for debugging and troubleshooting the GossipSub protocol, for example, the number of 104 // messages received by the local node from other nodes over the GossipSub protocol may be much higher than the number 105 // of messages accepted by the local node, which may indicate that the local node is throttling the incoming messages. 106 type LocalGossipSubRouterMetrics interface { 107 // OnLocalMeshSizeUpdated tracks the size of the local mesh for a topic. 108 OnLocalMeshSizeUpdated(topic string, size int) 109 110 // OnPeerAddedToProtocol is called when the local node receives a stream from a peer on a gossipsub-related protocol. 111 // Args: 112 // protocol: the protocol name that the peer is connected to. 113 OnPeerAddedToProtocol(protocol string) 114 115 // OnPeerRemovedFromProtocol is called when the local considers a remote peer blacklisted or unavailable. 116 OnPeerRemovedFromProtocol() 117 118 // OnLocalPeerJoinedTopic is called when the local node subscribes to a gossipsub topic. 119 OnLocalPeerJoinedTopic() 120 121 // OnLocalPeerLeftTopic is called when the local node unsubscribes from a gossipsub topic. 122 OnLocalPeerLeftTopic() 123 124 // OnPeerGraftTopic is called when the local node receives a GRAFT message from a remote peer on a topic. 125 // Note: the received GRAFT at this point is considered passed the RPC inspection, and is accepted by the local node. 126 OnPeerGraftTopic(topic string) 127 128 // OnPeerPruneTopic is called when the local node receives a PRUNE message from a remote peer on a topic. 129 // Note: the received PRUNE at this point is considered passed the RPC inspection, and is accepted by the local node. 130 OnPeerPruneTopic(topic string) 131 132 // OnMessageEnteredValidation is called when a received pubsub message enters the validation pipeline. It is the 133 // internal validation pipeline of GossipSub protocol. The message may be rejected or accepted by the validation 134 // pipeline. 135 OnMessageEnteredValidation(size int) 136 137 // OnMessageRejected is called when a received pubsub message is rejected by the validation pipeline. 138 // Args: 139 // 140 // reason: the reason for rejection. 141 // size: the size of the message in bytes. 142 OnMessageRejected(size int, reason string) 143 144 // OnMessageDuplicate is called when a received pubsub message is a duplicate of a previously received message, and 145 // is dropped. 146 // Args: 147 // size: the size of the message in bytes. 148 OnMessageDuplicate(size int) 149 150 // OnPeerThrottled is called when a peer is throttled by the local node, i.e., the local node is not accepting any 151 // pubsub message from the peer but may still accept control messages. 152 OnPeerThrottled() 153 154 // OnRpcReceived is called when an RPC message is received by the local node. The received RPC is considered 155 // passed the RPC inspection, and is accepted by the local node. 156 // Args: 157 // msgCount: the number of messages included in the RPC. 158 // iHaveCount: the number of iHAVE messages included in the RPC. 159 // iWantCount: the number of iWANT messages included in the RPC. 160 // graftCount: the number of GRAFT messages included in the RPC. 161 // pruneCount: the number of PRUNE messages included in the RPC. 162 OnRpcReceived(msgCount int, iHaveCount int, iWantCount int, graftCount int, pruneCount int) 163 164 // OnRpcSent is called when an RPC message is sent by the local node. 165 // Note: the sent RPC is considered passed the RPC inspection, and is accepted by the local node. 166 // Args: 167 // msgCount: the number of messages included in the RPC. 168 // iHaveCount: the number of iHAVE messages included in the RPC. 169 // iWantCount: the number of iWANT messages included in the RPC. 170 // graftCount: the number of GRAFT messages included in the RPC. 171 // pruneCount: the number of PRUNE messages included in the RPC. 172 OnRpcSent(msgCount int, iHaveCount int, iWantCount int, graftCount int, pruneCount int) 173 174 // OnOutboundRpcDropped is called when an outbound RPC message is dropped by the local node, typically because the local node 175 // outbound message queue is full; or the RPC is big and the local node cannot fragment it. 176 OnOutboundRpcDropped() 177 178 // OnUndeliveredMessage is called when a message is not delivered at least one subscriber of the topic, for example when 179 // the subscriber is too slow to process the message. 180 OnUndeliveredMessage() 181 182 // OnMessageDeliveredToAllSubscribers is called when a message is delivered to all subscribers of the topic. 183 OnMessageDeliveredToAllSubscribers(size int) 184 } 185 186 // UnicastManagerMetrics unicast manager metrics. 187 type UnicastManagerMetrics interface { 188 // OnStreamCreated tracks the overall time it takes to create a stream successfully and the number of retry attempts. 189 OnStreamCreated(duration time.Duration, attempts int) 190 // OnStreamCreationFailure tracks the amount of time taken and number of retry attempts used when the unicast manager fails to create a stream. 191 OnStreamCreationFailure(duration time.Duration, attempts int) 192 // OnPeerDialed tracks the time it takes to dial a peer during stream creation and the number of retry attempts before a peer 193 // is dialed successfully. 194 OnPeerDialed(duration time.Duration, attempts int) 195 // OnPeerDialFailure tracks the amount of time taken and number of retry attempts used when the unicast manager cannot dial a peer 196 // to establish the initial connection between the two. 197 OnPeerDialFailure(duration time.Duration, attempts int) 198 // OnStreamEstablished tracks the time it takes to create a stream successfully on the available open connection during stream 199 // creation and the number of retry attempts. 200 OnStreamEstablished(duration time.Duration, attempts int) 201 // OnEstablishStreamFailure tracks the amount of time taken and number of retry attempts used when the unicast manager cannot establish 202 // a stream on the open connection between two peers. 203 OnEstablishStreamFailure(duration time.Duration, attempts int) 204 205 // OnDialRetryBudgetUpdated tracks the history of the dial retry budget updates. 206 OnDialRetryBudgetUpdated(budget uint64) 207 208 // OnStreamCreationRetryBudgetUpdated tracks the history of the stream creation retry budget updates. 209 OnStreamCreationRetryBudgetUpdated(budget uint64) 210 211 // OnDialRetryBudgetResetToDefault tracks the number of times the dial retry budget is reset to default. 212 OnDialRetryBudgetResetToDefault() 213 214 // OnStreamCreationRetryBudgetResetToDefault tracks the number of times the stream creation retry budget is reset to default. 215 OnStreamCreationRetryBudgetResetToDefault() 216 } 217 218 type GossipSubMetrics interface { 219 GossipSubScoringMetrics 220 GossipSubRpcInspectorMetrics 221 LocalGossipSubRouterMetrics 222 GossipSubRpcValidationInspectorMetrics 223 } 224 225 type LibP2PMetrics interface { 226 GossipSubMetrics 227 ResolverMetrics 228 DHTMetrics 229 rcmgr.MetricsReporter 230 LibP2PConnectionMetrics 231 UnicastManagerMetrics 232 GossipSubScoringRegistryMetrics 233 } 234 235 // GossipSubScoringMetrics encapsulates the metrics collectors for the peer scoring module of GossipSub protocol. 236 // It tracks the scores of the peers in the local mesh and the different factors that contribute to the score of a peer. 237 // It also tracks the scores of the topics in the local mesh and the different factors that contribute to the score of a topic. 238 type GossipSubScoringMetrics interface { 239 // OnOverallPeerScoreUpdated tracks the overall score of peers in the local mesh. 240 OnOverallPeerScoreUpdated(float64) 241 // OnAppSpecificScoreUpdated tracks the application specific score of peers in the local mesh. 242 OnAppSpecificScoreUpdated(float64) 243 // OnIPColocationFactorUpdated tracks the IP colocation factor of peers in the local mesh. 244 OnIPColocationFactorUpdated(float64) 245 // OnBehaviourPenaltyUpdated tracks the behaviour penalty of peers in the local mesh. 246 OnBehaviourPenaltyUpdated(float64) 247 // OnTimeInMeshUpdated tracks the time in mesh factor of peers in the local mesh for a given topic. 248 OnTimeInMeshUpdated(channels.Topic, time.Duration) 249 // OnFirstMessageDeliveredUpdated tracks the first message delivered factor of peers in the local mesh for a given topic. 250 OnFirstMessageDeliveredUpdated(channels.Topic, float64) 251 // OnMeshMessageDeliveredUpdated tracks the mesh message delivered factor of peers in the local mesh for a given topic. 252 OnMeshMessageDeliveredUpdated(channels.Topic, float64) 253 // OnInvalidMessageDeliveredUpdated tracks the invalid message delivered factor of peers in the local mesh for a given topic. 254 OnInvalidMessageDeliveredUpdated(channels.Topic, float64) 255 // SetWarningStateCount tracks the warning score state of peers in the local mesh. It updates the total number of 256 // peers in the local mesh that are in the warning state based on their score. 257 SetWarningStateCount(uint) 258 } 259 260 // GossipSubRpcValidationInspectorMetrics encapsulates the metrics collectors for the gossipsub rpc validation control message inspectors. 261 type GossipSubRpcValidationInspectorMetrics interface { 262 GossipSubRpcInspectorMetrics 263 264 // AsyncProcessingStarted increments the metric tracking the number of inspect message request being processed by workers in the rpc validator worker pool. 265 AsyncProcessingStarted() 266 // AsyncProcessingFinished tracks the time spent by a rpc validation inspector worker to process an inspect message request asynchronously and decrements the metric tracking 267 // the number of inspect message requests being processed asynchronously by the rpc validation inspector workers. 268 AsyncProcessingFinished(duration time.Duration) 269 270 // OnIHaveControlMessageIdsTruncated tracks the number of times message ids on an iHave message were truncated. 271 // Note that this function is called only when the message ids are truncated from an iHave message, not when the iHave message itself is truncated. 272 // This is different from the OnControlMessagesTruncated function which is called when a slice of control messages truncated from an RPC with all their message ids. 273 // Args: 274 // 275 // diff: the number of actual messages truncated. 276 OnIHaveControlMessageIdsTruncated(diff int) 277 278 // OnIWantControlMessageIdsTruncated tracks the number of times message ids on an iWant message were truncated. 279 // Note that this function is called only when the message ids are truncated from an iWant message, not when the iWant message itself is truncated. 280 // This is different from the OnControlMessagesTruncated function which is called when a slice of control messages truncated from an RPC with all their message ids. 281 // Args: 282 // diff: the number of actual messages truncated. 283 OnIWantControlMessageIdsTruncated(diff int) 284 285 // OnControlMessagesTruncated tracks the number of times a slice of control messages is truncated from an RPC with all their included message ids. 286 // Args: 287 // 288 // messageType: the type of the control message that was truncated 289 // diff: the number of control messages truncated. 290 OnControlMessagesTruncated(messageType p2pmsg.ControlMessageType, diff int) 291 292 // OnIWantMessagesInspected tracks the number of duplicate and cache miss message ids received by the node on iWant messages at the end of the async inspection iWants 293 // across one RPC, regardless of the result of the inspection. 294 // 295 // duplicateCount: the total number of duplicate message ids received by the node on the iWant messages at the end of the async inspection of the RPC. 296 // cacheMissCount: the total number of cache miss message ids received by the node on the iWant message at the end of the async inspection of the RPC. 297 OnIWantMessagesInspected(duplicateCount int, cacheMissCount int) 298 299 // OnIWantDuplicateMessageIdsExceedThreshold tracks the number of times that async inspection of iWant messages failed due to the total number of duplicate message ids 300 // received by the node on the iWant messages of a single RPC exceeding the threshold, which results in a misbehaviour report. 301 OnIWantDuplicateMessageIdsExceedThreshold() 302 303 // OnIWantCacheMissMessageIdsExceedThreshold tracks the number of times that async inspection of iWant messages failed due to the total 304 // number of cache miss message ids received by the node on the iWant messages of a single RPC exceeding the threshold, which results in a misbehaviour report. 305 OnIWantCacheMissMessageIdsExceedThreshold() 306 307 // OnIHaveMessagesInspected is called at the end of the async inspection of iHave messages of a single RPC, regardless of the result of the inspection. 308 // It tracks the number of duplicate topic ids and duplicate message ids received by the node on the iHave messages of that single RPC at the end of the async inspection iHaves. 309 // Args: 310 // 311 // duplicateTopicIds: the total number of duplicate topic ids received by the node on the iHave messages at the end of the async inspection of the RPC. 312 // duplicateMessageIds: the number of duplicate message ids received by the node on the iHave messages at the end of the async inspection of the RPC. 313 // invalidTopicIds: the number of invalid message ids received by the node on the iHave messages at the end of the async inspection of the RPC. 314 OnIHaveMessagesInspected(duplicateTopicIds int, duplicateMessageIds, invalidTopicIds int) 315 316 // OnIHaveDuplicateTopicIdsExceedThreshold tracks the number of times that the async inspection of iHave messages of a single RPC failed due to the total number of duplicate topic ids 317 // received by the node on the iHave messages of that RPC exceeding the threshold, which results in a misbehaviour report. 318 OnIHaveDuplicateTopicIdsExceedThreshold() 319 320 // OnIHaveInvalidTopicIdsExceedThreshold tracks the number of times that the async inspection of iHave messages of a single RPC failed due to the total number of invalid topic ids 321 // received by the node on the iHave messages of that RPC exceeding the threshold, which results in a misbehaviour report. 322 OnIHaveInvalidTopicIdsExceedThreshold() 323 324 // OnIHaveDuplicateMessageIdsExceedThreshold tracks the number of times that the async inspection of iHave messages of a single RPC failed due to the total number of duplicate message ids 325 // received by the node on an iHave message exceeding the threshold, which results in a misbehaviour report. 326 OnIHaveDuplicateMessageIdsExceedThreshold() 327 328 // OnInvalidTopicIdDetectedForControlMessage tracks the number of times that the async inspection of a control message type on a single RPC failed due to an invalid topic id. 329 // Args: 330 // - messageType: the type of the control message that was truncated. 331 OnInvalidTopicIdDetectedForControlMessage(messageType p2pmsg.ControlMessageType) 332 333 // OnActiveClusterIDsNotSetErr tracks the number of times that the async inspection of a control message type on a single RPC failed due to active cluster ids not set inspection failure. 334 // This is not causing a misbehaviour report. 335 OnActiveClusterIDsNotSetErr() 336 337 // OnUnstakedPeerInspectionFailed tracks the number of times that the async inspection of a control message type on a single RPC failed due to unstaked peer inspection failure. 338 // This is not causing a misbehaviour report. 339 OnUnstakedPeerInspectionFailed() 340 341 // OnInvalidControlMessageNotificationSent tracks the number of times that the async inspection of a control message failed and resulted in dissemination of an invalid control message was sent. 342 OnInvalidControlMessageNotificationSent() 343 344 // OnRpcRejectedFromUnknownSender tracks the number of rpc's rejected from unstaked nodes. 345 OnRpcRejectedFromUnknownSender() 346 347 // OnPublishMessagesInspectionErrorExceedsThreshold tracks the number of times that async inspection of publish messages failed due to the number of errors. 348 OnPublishMessagesInspectionErrorExceedsThreshold() 349 350 // OnPruneDuplicateTopicIdsExceedThreshold tracks the number of times that the async inspection of prune messages for an RPC failed due to the number of duplicate topic ids 351 // received by the node on prune messages of the same RPC excesses threshold, which results in a misbehaviour report. 352 OnPruneDuplicateTopicIdsExceedThreshold() 353 354 // OnPruneInvalidTopicIdsExceedThreshold tracks the number of times that the async inspection of prune messages for an RPC failed due to the number of invalid topic ids 355 // received by the node on prune messages of the same RPC excesses threshold, which results in a misbehaviour report. 356 OnPruneInvalidTopicIdsExceedThreshold() 357 358 // OnPruneMessageInspected is called at the end of the async inspection of prune messages of the RPC, regardless of the result of the inspection. 359 // Args: 360 // duplicateTopicIds: the number of duplicate topic ids received by the node on the prune messages of the RPC at the end of the async inspection prunes. 361 // invalidTopicIds: the number of invalid topic ids received by the node on the prune messages at the end of the async inspection of a single RPC. 362 OnPruneMessageInspected(duplicateTopicIds, invalidTopicIds int) 363 364 // OnGraftDuplicateTopicIdsExceedThreshold tracks the number of times that the async inspection of the graft messages of a single RPC failed due to the number of duplicate topic ids 365 // received by the node on graft messages of the same RPC excesses threshold, which results in a misbehaviour report. 366 OnGraftDuplicateTopicIdsExceedThreshold() 367 368 // OnGraftInvalidTopicIdsExceedThreshold tracks the number of times that the async inspection of the graft messages of a single RPC failed due to the number of invalid topic ids 369 // received by the node on graft messages of the same RPC excesses threshold, which results in a misbehaviour report. 370 OnGraftInvalidTopicIdsExceedThreshold() 371 372 // OnGraftMessageInspected is called at the end of the async inspection of graft messages of a single RPC, regardless of the result of the inspection. 373 // Args: 374 // duplicateTopicIds: the number of duplicate topic ids received by the node on the graft messages at the end of the async inspection of a single RPC. 375 // invalidTopicIds: the number of invalid topic ids received by the node on the graft messages at the end of the async inspection of a single RPC. 376 OnGraftMessageInspected(duplicateTopicIds, invalidTopicIds int) 377 378 // OnPublishMessageInspected is called at the end of the async inspection of publish messages of a single RPC, regardless of the result of the inspection. 379 // It tracks the total number of errors detected during the async inspection of the rpc together with their individual breakdown. 380 // Args: 381 // - errCount: the number of errors that occurred during the async inspection of publish messages. 382 // - invalidTopicIdsCount: the number of times that an invalid topic id was detected during the async inspection of publish messages. 383 // - invalidSubscriptionsCount: the number of times that an invalid subscription was detected during the async inspection of publish messages. 384 // - invalidSendersCount: the number of times that an invalid sender was detected during the async inspection of publish messages. 385 OnPublishMessageInspected(totalErrCount int, invalidTopicIdsCount int, invalidSubscriptionsCount int, invalidSendersCount int) 386 } 387 388 // NetworkInboundQueueMetrics encapsulates the metrics collectors for the inbound queue of the networking layer. 389 type NetworkInboundQueueMetrics interface { 390 391 // MessageAdded increments the metric tracking the number of messages in the queue with the given priority 392 MessageAdded(priority int) 393 394 // MessageRemoved decrements the metric tracking the number of messages in the queue with the given priority 395 MessageRemoved(priority int) 396 397 // QueueDuration tracks the time spent by a message with the given priority in the queue 398 QueueDuration(duration time.Duration, priority int) 399 } 400 401 // NetworkCoreMetrics encapsulates the metrics collectors for the core networking layer functionality. 402 type NetworkCoreMetrics interface { 403 NetworkInboundQueueMetrics 404 AlspMetrics 405 NetworkSecurityMetrics 406 407 // OutboundMessageSent collects metrics related to a message sent by the node. 408 OutboundMessageSent(sizeBytes int, topic string, protocol string, messageType string) 409 // InboundMessageReceived collects metrics related to a message received by the node. 410 InboundMessageReceived(sizeBytes int, topic string, protocol string, messageType string) 411 // DuplicateInboundMessagesDropped increments the metric tracking the number of duplicate messages dropped by the node. 412 DuplicateInboundMessagesDropped(topic string, protocol string, messageType string) 413 // UnicastMessageSendingStarted increments the metric tracking the number of unicast messages sent by the node. 414 UnicastMessageSendingStarted(topic string) 415 // UnicastMessageSendingCompleted decrements the metric tracking the number of unicast messages sent by the node. 416 UnicastMessageSendingCompleted(topic string) 417 // MessageProcessingStarted increments the metric tracking the number of messages being processed by the node. 418 MessageProcessingStarted(topic string) 419 // MessageProcessingFinished tracks the time spent by the node to process a message and decrements the metric tracking 420 // the number of messages being processed by the node. 421 MessageProcessingFinished(topic string, duration time.Duration) 422 } 423 424 // LibP2PConnectionMetrics encapsulates the metrics collectors for the connection manager of the libp2p node. 425 type LibP2PConnectionMetrics interface { 426 // OutboundConnections updates the metric tracking the number of outbound connections of this node 427 OutboundConnections(connectionCount uint) 428 429 // InboundConnections updates the metric tracking the number of inbound connections of this node 430 InboundConnections(connectionCount uint) 431 } 432 433 // AlspMetrics encapsulates the metrics collectors for the Application Layer Spam Prevention (ALSP) module, which 434 // is part of the networking layer. ALSP is responsible to prevent spam attacks on the application layer messages that 435 // appear to be valid for the networking layer but carry on a malicious intent on the application layer (i.e., Flow protocols). 436 type AlspMetrics interface { 437 // OnMisbehaviorReported is called when a misbehavior is reported by the application layer to ALSP. 438 // An engine detecting a spamming-related misbehavior reports it to the ALSP module. 439 // Args: 440 // - channel: the channel on which the misbehavior was reported 441 // - misbehaviorType: the type of misbehavior reported 442 OnMisbehaviorReported(channel string, misbehaviorType string) 443 } 444 445 // NetworkMetrics is the blanket abstraction that encapsulates the metrics collectors for the networking layer. 446 type NetworkMetrics interface { 447 LibP2PMetrics 448 NetworkCoreMetrics 449 } 450 451 // EngineMetrics is a generic metrics consumer for node-internal data processing 452 // components (aka engines). Implementations must be non-blocking and concurrency safe. 453 type EngineMetrics interface { 454 // MessageSent reports that the engine transmitted the message over the network. 455 // Unicasts, broadcasts, and multicasts are all reported once. 456 MessageSent(engine string, message string) 457 // MessageReceived reports that the engine received the message over the network. 458 MessageReceived(engine string, message string) 459 // MessageHandled reports that the engine has finished processing the message. 460 // Both invalid and valid messages should be reported. 461 // A message must be reported as either handled or dropped, not both. 462 MessageHandled(engine string, messages string) 463 // InboundMessageDropped reports that the engine has dropped inbound message without processing it. 464 // Inbound messages must be reported as either handled or dropped, not both. 465 InboundMessageDropped(engine string, messages string) 466 // OutboundMessageDropped reports that the engine has dropped outbound message without processing it. 467 // Outbound messages must be reported as either sent or dropped, not both. 468 OutboundMessageDropped(engine string, messages string) 469 } 470 471 type ComplianceMetrics interface { 472 FinalizedHeight(height uint64) 473 EpochTransitionHeight(height uint64) 474 SealedHeight(height uint64) 475 BlockFinalized(*flow.Block) 476 BlockSealed(*flow.Block) 477 CurrentEpochCounter(counter uint64) 478 CurrentEpochPhase(phase flow.EpochPhase) 479 CurrentEpochFinalView(view uint64) 480 CurrentDKGPhase1FinalView(view uint64) 481 CurrentDKGPhase2FinalView(view uint64) 482 CurrentDKGPhase3FinalView(view uint64) 483 EpochEmergencyFallbackTriggered() 484 } 485 486 type CleanerMetrics interface { 487 RanGC(took time.Duration) 488 } 489 490 type CacheMetrics interface { 491 // CacheEntries report the total number of cached items 492 CacheEntries(resource string, entries uint) 493 // CacheHit report the number of times the queried item is found in the cache 494 CacheHit(resource string) 495 // CacheNotFound records the number of times the queried item was not found in either cache or database. 496 CacheNotFound(resource string) 497 // CacheMiss report the number of times the queried item is not found in the cache, but found in the database. 498 CacheMiss(resource string) 499 } 500 501 type MempoolMetrics interface { 502 MempoolEntries(resource string, entries uint) 503 Register(resource string, entriesFunc EntriesFunc) error 504 } 505 506 type HotstuffMetrics interface { 507 // HotStuffBusyDuration reports Metrics C6 HotStuff Busy Duration 508 HotStuffBusyDuration(duration time.Duration, event string) 509 510 // HotStuffIdleDuration reports Metrics C6 HotStuff Idle Duration 511 HotStuffIdleDuration(duration time.Duration) 512 513 // HotStuffWaitDuration reports Metrics C6 HotStuff Idle Duration - the time between receiving and 514 // enqueueing a message to beginning to process that message. 515 HotStuffWaitDuration(duration time.Duration, event string) 516 517 // SetCurView reports Metrics C8: Current View maintained by Pacemaker. 518 SetCurView(view uint64) 519 520 // SetQCView reports Metrics C9: View of the newest QC known to Pacemaker. 521 SetQCView(view uint64) 522 523 // SetTCView reports last TC known to Pacemaker. 524 SetTCView(view uint64) 525 526 // CountSkipped counts the number of skips we did. 527 CountSkipped() 528 529 // CountTimeout tracks the number of views that this replica left due to observing a TC. 530 CountTimeout() 531 532 // SetTimeout sets the current timeout duration 533 SetTimeout(duration time.Duration) 534 535 // BlockProcessingDuration measures the time which the compliance engine 536 // spends to process one block proposal. 537 BlockProcessingDuration(duration time.Duration) 538 539 // VoteProcessingDuration measures the time which the hotstuff.VoteAggregator 540 // spends to process one vote. 541 VoteProcessingDuration(duration time.Duration) 542 543 // TimeoutObjectProcessingDuration measures the time which the hotstuff.TimeoutAggregator 544 // spends to process one timeout object. 545 TimeoutObjectProcessingDuration(duration time.Duration) 546 547 // CommitteeProcessingDuration measures the time which the HotStuff's core logic 548 // spends in the hotstuff.Replicas component, i.e. the time determining consensus 549 // committee relations. 550 CommitteeProcessingDuration(duration time.Duration) 551 552 // SignerProcessingDuration measures the time which the HotStuff's core logic 553 // spends in the hotstuff.Signer component, i.e. the with crypto-related operations. 554 SignerProcessingDuration(duration time.Duration) 555 556 // ValidatorProcessingDuration measures the time which the HotStuff's core logic 557 // spends in the hotstuff.Validator component, i.e. the with verifying 558 // consensus messages. 559 ValidatorProcessingDuration(duration time.Duration) 560 561 // PayloadProductionDuration measures the time which the HotStuff's core logic 562 // spends in the module.Builder component, i.e. the with generating block payloads. 563 PayloadProductionDuration(duration time.Duration) 564 565 // TimeoutCollectorsRange collects information from the node's `TimeoutAggregator` component. 566 // Specifically, it measurers the number of views for which we are currently collecting timeouts 567 // (i.e. the number of `TimeoutCollector` instances we are maintaining) and their lowest/highest view. 568 TimeoutCollectorsRange(lowestRetainedView uint64, newestViewCreatedCollector uint64, activeCollectors int) 569 } 570 571 type CruiseCtlMetrics interface { 572 573 // PIDError measures the current error values for the proportional, integration, 574 // and derivative terms of the PID controller. 575 PIDError(p, i, d float64) 576 577 // TargetProposalDuration measures the current value of the Block Time Controller output: 578 // the target duration from parent to child proposal. 579 TargetProposalDuration(duration time.Duration) 580 581 // ControllerOutput measures the output of the cruise control PID controller. 582 // Concretely, this is the quantity to subtract from the baseline view duration. 583 ControllerOutput(duration time.Duration) 584 585 // ProposalPublicationDelay measures the effective delay the controller imposes on publishing 586 // the node's own proposals, with all limits of authority applied. 587 // Note: Technically, our metrics capture the publication delay relative to when the publication delay was 588 // last requested. Currently, only the EventHandler requests a publication delay, exactly once per proposal. 589 ProposalPublicationDelay(duration time.Duration) 590 } 591 592 type CollectionMetrics interface { 593 // TransactionIngested is called when a new transaction is ingested by the 594 // node. It increments the total count of ingested transactions and starts 595 // a tx->col span for the transaction. 596 TransactionIngested(txID flow.Identifier) 597 598 // ClusterBlockProposed is called when a new collection is proposed by us or 599 // any other node in the cluster. 600 ClusterBlockProposed(block *cluster.Block) 601 602 // ClusterBlockFinalized is called when a collection is finalized. 603 ClusterBlockFinalized(block *cluster.Block) 604 } 605 606 type ConsensusMetrics interface { 607 // StartCollectionToFinalized reports Metrics C1: Collection Received by CCL→ Collection Included in Finalized Block 608 StartCollectionToFinalized(collectionID flow.Identifier) 609 610 // FinishCollectionToFinalized reports Metrics C1: Collection Received by CCL→ Collection Included in Finalized Block 611 FinishCollectionToFinalized(collectionID flow.Identifier) 612 613 // StartBlockToSeal reports Metrics C4: Block Received by CCL → Block Seal in finalized block 614 StartBlockToSeal(blockID flow.Identifier) 615 616 // FinishBlockToSeal reports Metrics C4: Block Received by CCL → Block Seal in finalized block 617 FinishBlockToSeal(blockID flow.Identifier) 618 619 // EmergencySeal increments the number of seals that were created in emergency mode 620 EmergencySeal() 621 622 // OnReceiptProcessingDuration records the number of seconds spent processing a receipt 623 OnReceiptProcessingDuration(duration time.Duration) 624 625 // OnApprovalProcessingDuration records the number of seconds spent processing an approval 626 OnApprovalProcessingDuration(duration time.Duration) 627 628 // CheckSealingDuration records absolute time for the full sealing check by the consensus match engine 629 CheckSealingDuration(duration time.Duration) 630 } 631 632 type VerificationMetrics interface { 633 // OnBlockConsumerJobDone is invoked by block consumer whenever it is notified a job is done by a worker. It 634 // sets the last processed block job index. 635 OnBlockConsumerJobDone(uint64) 636 // OnChunkConsumerJobDone is invoked by chunk consumer whenever it is notified a job is done by a worker. It 637 // sets the last processed chunk job index. 638 OnChunkConsumerJobDone(uint64) 639 // OnExecutionResultReceivedAtAssignerEngine is called whenever a new execution result arrives 640 // at Assigner engine. It increments total number of received execution results. 641 OnExecutionResultReceivedAtAssignerEngine() 642 643 // OnVerifiableChunkReceivedAtVerifierEngine increments a counter that keeps track of number of verifiable chunks received at 644 // verifier engine from fetcher engine. 645 OnVerifiableChunkReceivedAtVerifierEngine() 646 647 // OnFinalizedBlockArrivedAtAssigner sets a gauge that keeps track of number of the latest block height arrives 648 // at assigner engine. Note that it assumes blocks are coming to assigner engine in strictly increasing order of their height. 649 OnFinalizedBlockArrivedAtAssigner(height uint64) 650 651 // OnChunksAssignmentDoneAtAssigner increments a counter that keeps track of the total number of assigned chunks to 652 // the verification node. 653 OnChunksAssignmentDoneAtAssigner(chunks int) 654 655 // OnAssignedChunkProcessedAtAssigner increments a counter that keeps track of the total number of assigned chunks pushed by 656 // assigner engine to the fetcher engine. 657 OnAssignedChunkProcessedAtAssigner() 658 659 // OnAssignedChunkReceivedAtFetcher increments a counter that keeps track of number of assigned chunks arrive at fetcher engine. 660 OnAssignedChunkReceivedAtFetcher() 661 662 // OnChunkDataPackRequestSentByFetcher increments a counter that keeps track of number of chunk data pack requests that fetcher engine 663 // sends to requester engine. 664 OnChunkDataPackRequestSentByFetcher() 665 666 // OnChunkDataPackRequestReceivedByRequester increments a counter that keeps track of number of chunk data pack requests 667 // arrive at the requester engine from the fetcher engine. 668 OnChunkDataPackRequestReceivedByRequester() 669 670 // OnChunkDataPackRequestDispatchedInNetwork increments a counter that keeps track of number of chunk data pack requests that the 671 // requester engine dispatches in the network (to the execution nodes). 672 OnChunkDataPackRequestDispatchedInNetworkByRequester() 673 674 // OnChunkDataPackResponseReceivedFromNetwork increments a counter that keeps track of number of chunk data pack responses that the 675 // requester engine receives from execution nodes (through network). 676 OnChunkDataPackResponseReceivedFromNetworkByRequester() 677 678 // SetMaxChunkDataPackAttemptsForNextUnsealedHeightAtRequester is invoked when a cycle of requesting chunk data packs is done by requester engine. 679 // It updates the maximum number of attempts made by requester engine for requesting the chunk data packs of the next unsealed height. 680 // The maximum is taken over the history of all chunk data packs requested during that cycle that belong to the next unsealed height. 681 SetMaxChunkDataPackAttemptsForNextUnsealedHeightAtRequester(attempts uint64) 682 683 // OnChunkDataPackSentToFetcher increments a counter that keeps track of number of chunk data packs sent to the fetcher engine from 684 // requester engine. 685 OnChunkDataPackSentToFetcher() 686 687 // OnChunkDataPackArrivedAtFetcher increments a counter that keeps track of number of chunk data packs arrived at fetcher engine from 688 // requester engine. 689 OnChunkDataPackArrivedAtFetcher() 690 691 // OnVerifiableChunkSentToVerifier increments a counter that keeps track of number of verifiable chunks fetcher engine sent to verifier engine. 692 OnVerifiableChunkSentToVerifier() 693 694 // OnResultApprovalDispatchedInNetwork increments a counter that keeps track of number of result approvals dispatched in the network 695 // by verifier engine. 696 OnResultApprovalDispatchedInNetworkByVerifier() 697 } 698 699 // LedgerMetrics provides an interface to record Ledger Storage metrics. 700 // Ledger storage is non-linear (fork-aware) so certain metrics are averaged 701 // and computed before emitting for better visibility 702 type LedgerMetrics interface { 703 // ForestApproxMemorySize records approximate memory usage of forest (all in-memory trees) 704 ForestApproxMemorySize(bytes uint64) 705 706 // ForestNumberOfTrees current number of trees in a forest (in memory) 707 ForestNumberOfTrees(number uint64) 708 709 // LatestTrieRegCount records the number of unique register allocated (the latest created trie) 710 LatestTrieRegCount(number uint64) 711 712 // LatestTrieRegCountDiff records the difference between the number of unique register allocated of the latest created trie and parent trie 713 LatestTrieRegCountDiff(number int64) 714 715 // LatestTrieRegSize records the size of unique register allocated (the latest created trie) 716 LatestTrieRegSize(size uint64) 717 718 // LatestTrieRegSizeDiff records the difference between the size of unique register allocated of the latest created trie and parent trie 719 LatestTrieRegSizeDiff(size int64) 720 721 // LatestTrieMaxDepthTouched records the maximum depth touched of the lastest created trie 722 LatestTrieMaxDepthTouched(maxDepth uint16) 723 724 // UpdateCount increase a counter of performed updates 725 UpdateCount() 726 727 // ProofSize records a proof size 728 ProofSize(bytes uint32) 729 730 // UpdateValuesNumber accumulates number of updated values 731 UpdateValuesNumber(number uint64) 732 733 // UpdateValuesSize total size (in bytes) of updates values 734 UpdateValuesSize(byte uint64) 735 736 // UpdateDuration records absolute time for the update of a trie 737 UpdateDuration(duration time.Duration) 738 739 // UpdateDurationPerItem records update time for single value (total duration / number of updated values) 740 UpdateDurationPerItem(duration time.Duration) 741 742 // ReadValuesNumber accumulates number of read values 743 ReadValuesNumber(number uint64) 744 745 // ReadValuesSize total size (in bytes) of read values 746 ReadValuesSize(byte uint64) 747 748 // ReadDuration records absolute time for the read from a trie 749 ReadDuration(duration time.Duration) 750 751 // ReadDurationPerItem records read time for single value (total duration / number of read values) 752 ReadDurationPerItem(duration time.Duration) 753 } 754 755 type WALMetrics interface { 756 // ExecutionCheckpointSize reports the size of a checkpoint in bytes 757 ExecutionCheckpointSize(bytes uint64) 758 } 759 760 type RateLimitedBlockstoreMetrics interface { 761 BytesRead(int) 762 } 763 764 type BitswapMetrics interface { 765 Peers(prefix string, n int) 766 Wantlist(prefix string, n int) 767 BlobsReceived(prefix string, n uint64) 768 DataReceived(prefix string, n uint64) 769 BlobsSent(prefix string, n uint64) 770 DataSent(prefix string, n uint64) 771 DupBlobsReceived(prefix string, n uint64) 772 DupDataReceived(prefix string, n uint64) 773 MessagesReceived(prefix string, n uint64) 774 } 775 776 type ExecutionDataRequesterMetrics interface { 777 // ExecutionDataFetchStarted records an in-progress download 778 ExecutionDataFetchStarted() 779 780 // ExecutionDataFetchFinished records a completed download 781 ExecutionDataFetchFinished(duration time.Duration, success bool, height uint64) 782 783 // NotificationSent reports that ExecutionData received notifications were sent for a block height 784 NotificationSent(height uint64) 785 786 // FetchRetried reports that a download retry was processed 787 FetchRetried() 788 } 789 790 type ExecutionStateIndexerMetrics interface { 791 // BlockIndexed records metrics from indexing execution data from a single block. 792 BlockIndexed(height uint64, duration time.Duration, events, registers, transactionResults int) 793 794 // BlockReindexed records that a previously indexed block was indexed again. 795 BlockReindexed() 796 797 // InitializeLatestHeight records the latest height that has been indexed. 798 // This should only be used during startup. After startup, use BlockIndexed to record newly 799 // indexed heights. 800 InitializeLatestHeight(height uint64) 801 } 802 803 type RuntimeMetrics interface { 804 // RuntimeTransactionParsed reports the time spent parsing a single transaction 805 RuntimeTransactionParsed(dur time.Duration) 806 807 // RuntimeTransactionChecked reports the time spent checking a single transaction 808 RuntimeTransactionChecked(dur time.Duration) 809 810 // RuntimeTransactionInterpreted reports the time spent interpreting a single transaction 811 RuntimeTransactionInterpreted(dur time.Duration) 812 813 // RuntimeSetNumberOfAccounts Sets the total number of accounts on the network 814 RuntimeSetNumberOfAccounts(count uint64) 815 816 // RuntimeTransactionProgramsCacheMiss reports a programs cache miss 817 // during transaction execution 818 RuntimeTransactionProgramsCacheMiss() 819 820 // RuntimeTransactionProgramsCacheHit reports a programs cache hit 821 // during transaction execution 822 RuntimeTransactionProgramsCacheHit() 823 } 824 825 type ProviderMetrics interface { 826 // ChunkDataPackRequestProcessed is executed every time a chunk data pack request is picked up for processing at execution node. 827 // It increases the request processed counter by one. 828 ChunkDataPackRequestProcessed() 829 } 830 831 type ExecutionDataProviderMetrics interface { 832 RootIDComputed(duration time.Duration, numberOfChunks int) 833 AddBlobsSucceeded(duration time.Duration, totalSize uint64) 834 AddBlobsFailed() 835 } 836 837 type ExecutionDataRequesterV2Metrics interface { 838 FulfilledHeight(blockHeight uint64) 839 ReceiptSkipped() 840 RequestSucceeded(blockHeight uint64, duration time.Duration, totalSize uint64, numberOfAttempts int) 841 RequestFailed(duration time.Duration, retryable bool) 842 RequestCanceled() 843 ResponseDropped() 844 } 845 846 type ExecutionDataPrunerMetrics interface { 847 Pruned(height uint64, duration time.Duration) 848 } 849 850 type RestMetrics interface { 851 // Example recorder taken from: 852 // https://github.com/slok/go-http-metrics/blob/master/metrics/prometheus/prometheus.go 853 httpmetrics.Recorder 854 AddTotalRequests(ctx context.Context, method string, routeName string) 855 } 856 857 type GRPCConnectionPoolMetrics interface { 858 // TotalConnectionsInPool updates the number connections to collection/execution nodes stored in the pool, and the size of the pool 859 TotalConnectionsInPool(connectionCount uint, connectionPoolSize uint) 860 861 // ConnectionFromPoolReused tracks the number of times a connection to a collection/execution node is reused from the connection pool 862 ConnectionFromPoolReused() 863 864 // ConnectionAddedToPool tracks the number of times a collection/execution node is added to the connection pool 865 ConnectionAddedToPool() 866 867 // NewConnectionEstablished tracks the number of times a new grpc connection is established 868 NewConnectionEstablished() 869 870 // ConnectionFromPoolInvalidated tracks the number of times a cached grpc connection is invalidated and closed 871 ConnectionFromPoolInvalidated() 872 873 // ConnectionFromPoolUpdated tracks the number of times a cached connection is updated 874 ConnectionFromPoolUpdated() 875 876 // ConnectionFromPoolEvicted tracks the number of times a cached connection is evicted from the cache 877 ConnectionFromPoolEvicted() 878 } 879 880 type AccessMetrics interface { 881 RestMetrics 882 GRPCConnectionPoolMetrics 883 TransactionMetrics 884 BackendScriptsMetrics 885 886 // UpdateExecutionReceiptMaxHeight is called whenever we store an execution receipt from a block from a newer height 887 UpdateExecutionReceiptMaxHeight(height uint64) 888 889 // UpdateLastFullBlockHeight tracks the height of the last block for which all collections were received 890 UpdateLastFullBlockHeight(height uint64) 891 } 892 893 type ExecutionResultStats struct { 894 ComputationUsed uint64 895 MemoryUsed uint64 896 EventCounts int 897 EventSize int 898 NumberOfRegistersTouched int 899 NumberOfBytesWrittenToRegisters int 900 NumberOfCollections int 901 NumberOfTransactions int 902 } 903 904 func (stats *ExecutionResultStats) Merge(other ExecutionResultStats) { 905 stats.ComputationUsed += other.ComputationUsed 906 stats.MemoryUsed += other.MemoryUsed 907 stats.EventCounts += other.EventCounts 908 stats.EventSize += other.EventSize 909 stats.NumberOfRegistersTouched += other.NumberOfRegistersTouched 910 stats.NumberOfBytesWrittenToRegisters += other.NumberOfBytesWrittenToRegisters 911 stats.NumberOfCollections += other.NumberOfCollections 912 stats.NumberOfTransactions += other.NumberOfTransactions 913 } 914 915 type ExecutionMetrics interface { 916 LedgerMetrics 917 RuntimeMetrics 918 ProviderMetrics 919 WALMetrics 920 921 // StartBlockReceivedToExecuted starts a span to trace the duration of a block 922 // from being received for execution to execution being finished 923 StartBlockReceivedToExecuted(blockID flow.Identifier) 924 925 // FinishBlockReceivedToExecuted finishes a span to trace the duration of a block 926 // from being received for execution to execution being finished 927 FinishBlockReceivedToExecuted(blockID flow.Identifier) 928 929 // ExecutionStorageStateCommitment reports the storage size of a state commitment in bytes 930 ExecutionStorageStateCommitment(bytes int64) 931 932 // ExecutionLastExecutedBlockHeight reports last executed block height 933 ExecutionLastExecutedBlockHeight(height uint64) 934 935 // ExecutionLastFinalizedExecutedBlockHeight reports last finalized and executed block height 936 ExecutionLastFinalizedExecutedBlockHeight(height uint64) 937 938 // ExecutionBlockExecuted reports the total time and computation spent on executing a block 939 ExecutionBlockExecuted(dur time.Duration, stats ExecutionResultStats) 940 941 // ExecutionBlockExecutionEffortVectorComponent reports the unweighted effort of given ComputationKind at block level 942 ExecutionBlockExecutionEffortVectorComponent(string, uint) 943 944 // ExecutionBlockCachedPrograms reports the number of cached programs at the end of a block 945 ExecutionBlockCachedPrograms(programs int) 946 947 // ExecutionCollectionExecuted reports the total time and computation spent on executing a collection 948 ExecutionCollectionExecuted(dur time.Duration, stats ExecutionResultStats) 949 950 // ExecutionTransactionExecuted reports stats on executing a single transaction 951 ExecutionTransactionExecuted( 952 dur time.Duration, 953 numTxnConflictRetries int, 954 compUsed uint64, 955 memoryUsed uint64, 956 eventCounts int, 957 eventSize int, 958 failed bool) 959 960 // ExecutionChunkDataPackGenerated reports stats on chunk data pack generation 961 ExecutionChunkDataPackGenerated(proofSize, numberOfTransactions int) 962 963 // ExecutionScriptExecuted reports the time and memory spent on executing an script 964 ExecutionScriptExecuted(dur time.Duration, compUsed, memoryUsed, memoryEstimate uint64) 965 966 // ExecutionCollectionRequestSent reports when a request for a collection is sent to a collection node 967 ExecutionCollectionRequestSent() 968 969 // Unused 970 ExecutionCollectionRequestRetried() 971 972 // ExecutionSync reports when the state syncing is triggered or stopped. 973 ExecutionSync(syncing bool) 974 975 // Upload metrics 976 ExecutionBlockDataUploadStarted() 977 ExecutionBlockDataUploadFinished(dur time.Duration) 978 ExecutionComputationResultUploaded() 979 ExecutionComputationResultUploadRetried() 980 981 UpdateCollectionMaxHeight(height uint64) 982 } 983 984 type BackendScriptsMetrics interface { 985 // ScriptExecuted records the round trip time while executing a script 986 ScriptExecuted(dur time.Duration, size int) 987 988 // ScriptExecutionErrorLocal records script execution failures from local execution 989 ScriptExecutionErrorLocal() 990 991 // ScriptExecutionErrorOnExecutionNode records script execution failures on Execution Nodes 992 ScriptExecutionErrorOnExecutionNode() 993 994 // ScriptExecutionResultMismatch records script execution result mismatches between local and 995 // execution nodes 996 ScriptExecutionResultMismatch() 997 998 // ScriptExecutionResultMatch records script execution result matches between local and 999 // execution nodes 1000 ScriptExecutionResultMatch() 1001 1002 // ScriptExecutionErrorMismatch records script execution error mismatches between local and 1003 // execution nodes 1004 ScriptExecutionErrorMismatch() 1005 1006 // ScriptExecutionErrorMatch records script execution error matches between local and 1007 // execution nodes 1008 ScriptExecutionErrorMatch() 1009 1010 // ScriptExecutionNotIndexed records script execution matches where data for the block is not 1011 // indexed locally yet 1012 ScriptExecutionNotIndexed() 1013 } 1014 1015 type TransactionMetrics interface { 1016 // Record the round trip time while getting a transaction result 1017 TransactionResultFetched(dur time.Duration, size int) 1018 1019 // TransactionReceived starts tracking of transaction execution/finalization/sealing 1020 TransactionReceived(txID flow.Identifier, when time.Time) 1021 1022 // TransactionFinalized reports the time spent between the transaction being received and finalized. Reporting only 1023 // works if the transaction was earlier added as received. 1024 TransactionFinalized(txID flow.Identifier, when time.Time) 1025 1026 // TransactionExecuted reports the time spent between the transaction being received and executed. Reporting only 1027 // works if the transaction was earlier added as received. 1028 TransactionExecuted(txID flow.Identifier, when time.Time) 1029 1030 // TransactionExpired tracks number of expired transactions 1031 TransactionExpired(txID flow.Identifier) 1032 1033 // TransactionSubmissionFailed should be called whenever we try to submit a transaction and it fails 1034 TransactionSubmissionFailed() 1035 } 1036 1037 type PingMetrics interface { 1038 // NodeReachable tracks the round trip time in milliseconds taken to ping a node 1039 // The nodeInfo provides additional information about the node such as the name of the node operator 1040 NodeReachable(node *flow.Identity, nodeInfo string, rtt time.Duration) 1041 1042 // NodeInfo tracks the software version, sealed height and hotstuff view of a node 1043 NodeInfo(node *flow.Identity, nodeInfo string, version string, sealedHeight uint64, hotstuffCurView uint64) 1044 } 1045 1046 type HeroCacheMetrics interface { 1047 // BucketAvailableSlots keeps track of number of available slots in buckets of cache. 1048 BucketAvailableSlots(uint64, uint64) 1049 1050 // OnKeyPutAttempt is called whenever a new (key, value) pair is attempted to be put in cache. 1051 // It does not reflect whether the put was successful or not. 1052 // A (key, value) pair put attempt may fail if the cache is full, or the key already exists. 1053 OnKeyPutAttempt(size uint32) 1054 1055 // OnKeyPutSuccess is called whenever a new (key, entity) pair is successfully added to the cache. 1056 OnKeyPutSuccess(size uint32) 1057 1058 // OnKeyPutDrop is called whenever a new (key, entity) pair is dropped from the cache due to full cache. 1059 OnKeyPutDrop() 1060 1061 // OnKeyPutDeduplicated is tracking the total number of unsuccessful writes caused by adding a duplicate key to the cache. 1062 // A duplicate key is dropped by the cache when it is written to the cache. 1063 // Note: in context of HeroCache, the key corresponds to the identifier of its entity. Hence, a duplicate key corresponds to 1064 // a duplicate entity. 1065 OnKeyPutDeduplicated() 1066 1067 // OnKeyRemoved is called whenever a (key, entity) pair is removed from the cache. 1068 OnKeyRemoved(size uint32) 1069 1070 // OnKeyGetSuccess tracks total number of successful read queries. 1071 // A read query is successful if the entity corresponding to its key is available in the cache. 1072 // Note: in context of HeroCache, the key corresponds to the identifier of its entity. 1073 OnKeyGetSuccess() 1074 1075 // OnKeyGetFailure tracks total number of unsuccessful read queries. 1076 // A read query is unsuccessful if the entity corresponding to its key is not available in the cache. 1077 // Note: in context of HeroCache, the key corresponds to the identifier of its entity. 1078 OnKeyGetFailure() 1079 1080 // OnEntityEjectionDueToFullCapacity is called whenever adding a new (key, entity) to the cache results in ejection of another (key', entity') pair. 1081 // This normally happens -- and is expected -- when the cache is full. 1082 // Note: in context of HeroCache, the key corresponds to the identifier of its entity. 1083 OnEntityEjectionDueToFullCapacity() 1084 1085 // OnEntityEjectionDueToEmergency is called whenever a bucket is found full and all of its keys are valid, i.e., 1086 // each key belongs to an existing (key, entity) pair. 1087 // Hence, adding a new key to that bucket will replace the oldest valid key inside that bucket. 1088 // Note: in context of HeroCache, the key corresponds to the identifier of its entity. 1089 OnEntityEjectionDueToEmergency() 1090 } 1091 1092 type ChainSyncMetrics interface { 1093 // record pruned blocks. requested and received times might be zero values 1094 PrunedBlockById(status *chainsync.Status) 1095 1096 PrunedBlockByHeight(status *chainsync.Status) 1097 1098 // totalByHeight and totalById are the number of blocks pruned for blocks requested by height and by id 1099 // storedByHeight and storedById are the number of blocks still stored by height and id 1100 PrunedBlocks(totalByHeight, totalById, storedByHeight, storedById int) 1101 1102 RangeRequested(ran chainsync.Range) 1103 1104 BatchRequested(batch chainsync.Batch) 1105 } 1106 1107 type DHTMetrics interface { 1108 RoutingTablePeerAdded() 1109 RoutingTablePeerRemoved() 1110 } 1111 1112 type CollectionExecutedMetric interface { 1113 CollectionFinalized(light flow.LightCollection) 1114 CollectionExecuted(light flow.LightCollection) 1115 BlockFinalized(block *flow.Block) 1116 ExecutionReceiptReceived(r *flow.ExecutionReceipt) 1117 UpdateLastFullBlockHeight(height uint64) 1118 } 1119 1120 type MachineAccountMetrics interface { 1121 // AccountBalance reports the current balance of the machine account. 1122 AccountBalance(bal float64) 1123 // RecommendedMinBalance reports the recommended minimum balance. If the actual balance 1124 // falls below this level, it must be refilled. 1125 // NOTE: Operators should alert on `AccountBalance < RecommendedMinBalance` 1126 RecommendedMinBalance(bal float64) 1127 // IsMisconfigured reports whether a critical misconfiguration has been detected. 1128 // NOTE Operators should alert on non-zero values reported here. 1129 IsMisconfigured(misconfigured bool) 1130 }