github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/metrics/hotstuff.go (about)

     1  package metrics
     2  
     3  import (
     4  	"time"
     5  
     6  	"github.com/prometheus/client_golang/prometheus"
     7  	"github.com/prometheus/client_golang/prometheus/promauto"
     8  
     9  	"github.com/onflow/flow-go/model/flow"
    10  	"github.com/onflow/flow-go/module"
    11  )
    12  
// HotStuff Metrics
//
// Names of the event types distinguished by the HotStuff metrics.
// NOTE(review): presumably these are the values passed as the `event` argument
// (i.e. the "event_type" label) of HotStuffBusyDuration and HotStuffWaitDuration;
// confirm against the callers.
const (
	HotstuffEventTypeLocalTimeout = "localtimeout"
	HotstuffEventTypeOnProposal   = "onproposal"
	HotstuffEventTypeOnQC         = "onqc"
	HotstuffEventTypeOnTC         = "ontc"
	HotstuffEventTypeOnPartialTc  = "onpartialtc"
)
    21  
// HotstuffCollector implements only the metrics emitted by the HotStuff core logic.
// We have multiple instances of HotStuff running within Flow: Consensus Nodes form
// the main consensus committee. In addition, each Collector node cluster runs its
// own HotStuff instance. Depending on the node role, the name space is different. Furthermore,
// even within the `collection` name space, we need to separate metrics between the different
// clusters. We do this by adding the label `committeeID` to the HotStuff metrics and
// allowing for configurable name space.
//
// NOTE(review): the constructor in this file registers every metric under
// namespaceConsensus with a single constant chain label; neither a `committeeID`
// label nor a configurable name space is set up there. Confirm whether the
// paragraph above still describes the intended design.
type HotstuffCollector struct {
	busyDuration                  *prometheus.HistogramVec // how long the event loop was busy with one event; labelled by "event_type"
	idleDuration                  prometheus.Histogram     // how long the event loop was idle without processing any event
	waitDuration                  *prometheus.HistogramVec // how long an event waited in the event loop queue; labelled by "event_type"
	curView                       prometheus.Gauge         // current view that the event handler has entered
	qcView                        prometheus.Gauge         // view of the newest known QC
	tcView                        prometheus.Gauge         // view of the newest known TC
	skips                         prometheus.Counter       // number of times some views were skipped ahead
	timeouts                      prometheus.Counter       // number of views left due to observing a TC
	timeoutDuration               prometheus.Gauge         // current length of the timeout, in seconds
	voteProcessingDuration        prometheus.Histogram     // how long VoteAggregator processes one message
	timeoutProcessingDuration     prometheus.Histogram     // how long TimeoutAggregator processes one message
	blockProcessingDuration       prometheus.Histogram     // how long the compliance engine processes one block
	committeeComputationsDuration prometheus.Histogram     // time spent computing consensus committee relations
	signerComputationsDuration    prometheus.Histogram     // time spent with crypto-related operations
	validatorComputationsDuration prometheus.Histogram     // time spent with message validation
	payloadProductionDuration     prometheus.Histogram     // time spent with payload production
	timeoutCollectorsRange        *prometheus.GaugeVec     // lowest and highest views with TimeoutCollectors; labelled by "prefix"
	numberOfActiveCollectors      prometheus.Gauge         // number of active TimeoutCollectors
}

// Compile-time check that *HotstuffCollector satisfies module.HotstuffMetrics.
var _ module.HotstuffMetrics = (*HotstuffCollector)(nil)
    51  
    52  func NewHotstuffCollector(chain flow.ChainID) *HotstuffCollector {
    53  
    54  	hc := &HotstuffCollector{
    55  
    56  		busyDuration: promauto.NewHistogramVec(prometheus.HistogramOpts{
    57  			Name:        "busy_duration_seconds",
    58  			Namespace:   namespaceConsensus,
    59  			Subsystem:   subsystemHotstuff,
    60  			Help:        "duration [seconds; measured with float64 precision] of how long HotStuff's event loop has been busy processing one event",
    61  			Buckets:     []float64{0.05, 0.2, 0.5, 1, 2, 5},
    62  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
    63  		}, []string{"event_type"}),
    64  
    65  		idleDuration: promauto.NewHistogram(prometheus.HistogramOpts{
    66  			Name:        "idle_duration_seconds",
    67  			Namespace:   namespaceConsensus,
    68  			Subsystem:   subsystemHotstuff,
    69  			Help:        "duration [seconds; measured with float64 precision] of how long HotStuff's event loop has been idle without processing any event",
    70  			Buckets:     []float64{0.05, 0.2, 0.5, 1, 2, 5},
    71  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
    72  		}),
    73  
    74  		waitDuration: promauto.NewHistogramVec(prometheus.HistogramOpts{
    75  			Name:        "wait_duration_seconds",
    76  			Namespace:   namespaceConsensus,
    77  			Subsystem:   subsystemHotstuff,
    78  			Help:        "duration [seconds; measured with float64 precision] of how long an event has been waited in the HotStuff event loop queue before being processed.",
    79  			Buckets:     []float64{0.05, 0.2, 0.5, 1, 2, 5},
    80  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
    81  		}, []string{"event_type"}),
    82  
    83  		curView: promauto.NewGauge(prometheus.GaugeOpts{
    84  			Name:        "cur_view",
    85  			Namespace:   namespaceConsensus,
    86  			Subsystem:   subsystemHotstuff,
    87  			Help:        "the current view that the event handler has entered",
    88  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
    89  		}),
    90  
    91  		qcView: promauto.NewGauge(prometheus.GaugeOpts{
    92  			Name:        "qc_view",
    93  			Namespace:   namespaceConsensus,
    94  			Subsystem:   subsystemHotstuff,
    95  			Help:        "The view of the newest known QC from HotStuff",
    96  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
    97  		}),
    98  
    99  		tcView: promauto.NewGauge(prometheus.GaugeOpts{
   100  			Name:        "tc_view",
   101  			Namespace:   namespaceConsensus,
   102  			Subsystem:   subsystemHotstuff,
   103  			Help:        "The view of the newest known TC from HotStuff",
   104  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
   105  		}),
   106  
   107  		skips: promauto.NewCounter(prometheus.CounterOpts{
   108  			Name:        "skips_total",
   109  			Namespace:   namespaceConsensus,
   110  			Subsystem:   subsystemHotstuff,
   111  			Help:        "The number of times we skipped ahead some views",
   112  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
   113  		}),
   114  
   115  		timeouts: promauto.NewCounter(prometheus.CounterOpts{
   116  			Name:        "timeouts_total",
   117  			Namespace:   namespaceConsensus,
   118  			Subsystem:   subsystemHotstuff,
   119  			Help:        "The number of views that this replica left due to observing a TC",
   120  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
   121  		}),
   122  
   123  		timeoutDuration: promauto.NewGauge(prometheus.GaugeOpts{
   124  			Name:        "timeout_seconds",
   125  			Namespace:   namespaceConsensus,
   126  			Subsystem:   subsystemHotstuff,
   127  			Help:        "The current length of the timeout",
   128  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
   129  		}),
   130  
   131  		committeeComputationsDuration: promauto.NewHistogram(prometheus.HistogramOpts{
   132  			Name:        "committee_computations_seconds",
   133  			Namespace:   namespaceConsensus,
   134  			Subsystem:   subsystemHotstuff,
   135  			Help:        "duration [seconds; measured with float64 precision] of how long HotStuff sends computing consensus committee relations",
   136  			Buckets:     []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2},
   137  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
   138  		}),
   139  
   140  		signerComputationsDuration: promauto.NewHistogram(prometheus.HistogramOpts{
   141  			Name:        "crypto_computations_seconds",
   142  			Namespace:   namespaceConsensus,
   143  			Subsystem:   subsystemHotstuff,
   144  			Help:        "duration [seconds; measured with float64 precision] of how long HotStuff sends with crypto-related operations",
   145  			Buckets:     []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2},
   146  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
   147  		}),
   148  
   149  		validatorComputationsDuration: promauto.NewHistogram(prometheus.HistogramOpts{
   150  			Name:        "message_validation_seconds",
   151  			Namespace:   namespaceConsensus,
   152  			Subsystem:   subsystemHotstuff,
   153  			Help:        "duration [seconds; measured with float64 precision] of how long HotStuff sends with message-validation",
   154  			Buckets:     []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2},
   155  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
   156  		}),
   157  
   158  		payloadProductionDuration: promauto.NewHistogram(prometheus.HistogramOpts{
   159  			Name:        "payload_production_seconds",
   160  			Namespace:   namespaceConsensus,
   161  			Subsystem:   subsystemHotstuff,
   162  			Help:        "duration [seconds; measured with float64 precision] of how long HotStuff sends with payload production",
   163  			Buckets:     []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2},
   164  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
   165  		}),
   166  		blockProcessingDuration: promauto.NewHistogram(prometheus.HistogramOpts{
   167  			Name:        "block_processing_seconds",
   168  			Namespace:   namespaceConsensus,
   169  			Subsystem:   subsystemHotstuff,
   170  			Help:        "duration [seconds; measured with float64 precision] of how long compliance engine processes one block",
   171  			Buckets:     []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2},
   172  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
   173  		}),
   174  		voteProcessingDuration: promauto.NewHistogram(prometheus.HistogramOpts{
   175  			Name:        "vote_processing_seconds",
   176  			Namespace:   namespaceConsensus,
   177  			Subsystem:   subsystemHotstuff,
   178  			Help:        "duration [seconds; measured with float64 precision] of how long VoteAggregator processes one message",
   179  			Buckets:     []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2},
   180  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
   181  		}),
   182  		timeoutProcessingDuration: promauto.NewHistogram(prometheus.HistogramOpts{
   183  			Name:        "timeout_object_processing_seconds",
   184  			Namespace:   namespaceConsensus,
   185  			Subsystem:   subsystemHotstuff,
   186  			Help:        "duration [seconds; measured with float64 precision] of how long TimeoutAggregator processes one message",
   187  			Buckets:     []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2},
   188  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
   189  		}),
   190  		timeoutCollectorsRange: promauto.NewGaugeVec(prometheus.GaugeOpts{
   191  			Name:        "timeout_collectors_range",
   192  			Namespace:   namespaceConsensus,
   193  			Subsystem:   subsystemHotstuff,
   194  			Help:        "lowest and highest views that we are maintaining TimeoutCollectors for",
   195  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
   196  		}, []string{"prefix"}),
   197  		numberOfActiveCollectors: promauto.NewGauge(prometheus.GaugeOpts{
   198  			Name:        "active_collectors",
   199  			Namespace:   namespaceConsensus,
   200  			Subsystem:   subsystemHotstuff,
   201  			Help:        "number of active TimeoutCollectors that the TimeoutAggregator component currently maintains",
   202  			ConstLabels: prometheus.Labels{LabelChain: chain.String()},
   203  		}),
   204  	}
   205  
   206  	return hc
   207  }
   208  
   209  // HotStuffBusyDuration reports Metrics C6 HotStuff Busy Duration
   210  func (hc *HotstuffCollector) HotStuffBusyDuration(duration time.Duration, event string) {
   211  	hc.busyDuration.WithLabelValues(event).Observe(duration.Seconds()) // unit: seconds; with float64 precision
   212  }
   213  
   214  // HotStuffIdleDuration reports Metrics C6 HotStuff Idle Duration
   215  func (hc *HotstuffCollector) HotStuffIdleDuration(duration time.Duration) {
   216  	hc.idleDuration.Observe(duration.Seconds()) // unit: seconds; with float64 precision
   217  }
   218  
   219  // HotStuffWaitDuration reports Metrics C6 HotStuff Idle Duration - the time between receiving and
   220  // enqueueing a message to beginning to process that message.
   221  func (hc *HotstuffCollector) HotStuffWaitDuration(duration time.Duration, event string) {
   222  	hc.waitDuration.WithLabelValues(event).Observe(duration.Seconds()) // unit: seconds; with float64 precision
   223  }
   224  
   225  // CountSkipped counts the number of skips we did.
   226  func (hc *HotstuffCollector) CountSkipped() {
   227  	hc.skips.Inc()
   228  }
   229  
   230  // CountTimeout tracks the number of views that this replica left due to observing a TC.
   231  func (hc *HotstuffCollector) CountTimeout() {
   232  	hc.timeouts.Inc()
   233  }
   234  
   235  // SetCurView reports Metrics C8: Current View
   236  func (hc *HotstuffCollector) SetCurView(view uint64) {
   237  	hc.curView.Set(float64(view))
   238  }
   239  
   240  // SetQCView reports Metrics C9: View of Newest Known QC
   241  func (hc *HotstuffCollector) SetQCView(view uint64) {
   242  	hc.qcView.Set(float64(view))
   243  }
   244  
   245  // SetTCView reports the view of the newest known TC
   246  func (hc *HotstuffCollector) SetTCView(view uint64) {
   247  	hc.tcView.Set(float64(view))
   248  }
   249  
   250  // BlockProcessingDuration measures the time which the compliance engine
   251  // spends to process one block proposal.
   252  func (hc *HotstuffCollector) BlockProcessingDuration(duration time.Duration) {
   253  	hc.blockProcessingDuration.Observe(duration.Seconds())
   254  }
   255  
   256  // VoteProcessingDuration reports the processing time for a single vote
   257  func (hc *HotstuffCollector) VoteProcessingDuration(duration time.Duration) {
   258  	hc.voteProcessingDuration.Observe(duration.Seconds())
   259  }
   260  
   261  // TimeoutObjectProcessingDuration reports the processing time for a TimeoutObject
   262  func (hc *HotstuffCollector) TimeoutObjectProcessingDuration(duration time.Duration) {
   263  	hc.timeoutProcessingDuration.Observe(duration.Seconds())
   264  }
   265  
   266  // SetTimeout sets the current timeout duration.
   267  func (hc *HotstuffCollector) SetTimeout(duration time.Duration) {
   268  	hc.timeoutDuration.Set(duration.Seconds()) // unit: seconds; with float64 precision
   269  }
   270  
   271  // CommitteeProcessingDuration measures the time which the HotStuff's core logic
   272  // spends in the hotstuff.Committee component, i.e. the time determining consensus
   273  // committee relations.
   274  func (hc *HotstuffCollector) CommitteeProcessingDuration(duration time.Duration) {
   275  	hc.committeeComputationsDuration.Observe(duration.Seconds()) // unit: seconds; with float64 precision
   276  }
   277  
   278  // SignerProcessingDuration reports the time which the HotStuff's core logic
   279  // spends in the hotstuff.Signer component, i.e. the with crypto-related operations.
   280  func (hc *HotstuffCollector) SignerProcessingDuration(duration time.Duration) {
   281  	hc.signerComputationsDuration.Observe(duration.Seconds()) // unit: seconds; with float64 precision
   282  }
   283  
   284  // ValidatorProcessingDuration reports the time which the HotStuff's core logic
   285  // spends in the hotstuff.Validator component, i.e. the with verifying higher-level
   286  // consensus messages.
   287  func (hc *HotstuffCollector) ValidatorProcessingDuration(duration time.Duration) {
   288  	hc.validatorComputationsDuration.Observe(duration.Seconds()) // unit: seconds; with float64 precision
   289  }
   290  
   291  // PayloadProductionDuration reports the time which the HotStuff's core logic
   292  // spends in the module.Builder component, i.e. the with generating block payloads
   293  func (hc *HotstuffCollector) PayloadProductionDuration(duration time.Duration) {
   294  	hc.payloadProductionDuration.Observe(duration.Seconds()) // unit: seconds; with float64 precision
   295  }
   296  
   297  // TimeoutCollectorsRange collects information from the node's `TimeoutAggregator` component.
   298  // Specifically, it measurers the number of views for which we are currently collecting timeouts
   299  // (i.e. the number of `TimeoutCollector` instances we are maintaining) and their lowest/highest view.
   300  func (hc *HotstuffCollector) TimeoutCollectorsRange(lowestRetainedView uint64, newestViewCreatedCollector uint64, activeCollectors int) {
   301  	hc.timeoutCollectorsRange.WithLabelValues("lowest_view_of_active_timeout_collectors").Set(float64(lowestRetainedView))
   302  	hc.timeoutCollectorsRange.WithLabelValues("newest_view_of_active_timeout_collectors").Set(float64(newestViewCreatedCollector))
   303  	hc.numberOfActiveCollectors.Set(float64(activeCollectors))
   304  }