// Listing of github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/metrics/hotstuff.go

package metrics

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"

	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/module"
)

// HotStuff Metrics
//
// The HotstuffEventType* constants are string identifiers for kinds of
// HotStuff events.
// NOTE(review): presumably these are the values passed as the `event`
// argument of HotStuffBusyDuration / HotStuffWaitDuration (and therefore end
// up as the "event_type" label value) — confirm against the callers.
const (
	HotstuffEventTypeLocalTimeout = "localtimeout"
	HotstuffEventTypeOnProposal   = "onproposal"
	HotstuffEventTypeOnQC         = "onqc"
	HotstuffEventTypeOnTC         = "ontc"
	HotstuffEventTypeOnPartialTc  = "onpartialtc"
)

// HotstuffCollector implements only the metrics emitted by the HotStuff core logic.
// We have multiple instances of HotStuff running within Flow: Consensus Nodes form
// the main consensus committee. In addition, each Collector node cluster runs its
// own HotStuff instance. Depending on the node role, the name space is different. Furthermore,
// even within the `collection` name space, we need to separate metrics between the different
// clusters. We do this by adding the label `committeeID` to the HotStuff metrics and
// allowing for configurable name space.
//
// NOTE(review): the constructor in this file registers every metric under
// namespaceConsensus with a single constant chain label; neither a
// `committeeID` label nor a configurable name space is visible here —
// confirm whether the paragraph above is still accurate.
29 type HotstuffCollector struct { 30 busyDuration *prometheus.HistogramVec 31 idleDuration prometheus.Histogram 32 waitDuration *prometheus.HistogramVec 33 curView prometheus.Gauge 34 qcView prometheus.Gauge 35 tcView prometheus.Gauge 36 skips prometheus.Counter 37 timeouts prometheus.Counter 38 timeoutDuration prometheus.Gauge 39 voteProcessingDuration prometheus.Histogram 40 timeoutProcessingDuration prometheus.Histogram 41 blockProcessingDuration prometheus.Histogram 42 committeeComputationsDuration prometheus.Histogram 43 signerComputationsDuration prometheus.Histogram 44 validatorComputationsDuration prometheus.Histogram 45 payloadProductionDuration prometheus.Histogram 46 timeoutCollectorsRange *prometheus.GaugeVec 47 numberOfActiveCollectors prometheus.Gauge 48 } 49 50 var _ module.HotstuffMetrics = (*HotstuffCollector)(nil) 51 52 func NewHotstuffCollector(chain flow.ChainID) *HotstuffCollector { 53 54 hc := &HotstuffCollector{ 55 56 busyDuration: promauto.NewHistogramVec(prometheus.HistogramOpts{ 57 Name: "busy_duration_seconds", 58 Namespace: namespaceConsensus, 59 Subsystem: subsystemHotstuff, 60 Help: "duration [seconds; measured with float64 precision] of how long HotStuff's event loop has been busy processing one event", 61 Buckets: []float64{0.05, 0.2, 0.5, 1, 2, 5}, 62 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 63 }, []string{"event_type"}), 64 65 idleDuration: promauto.NewHistogram(prometheus.HistogramOpts{ 66 Name: "idle_duration_seconds", 67 Namespace: namespaceConsensus, 68 Subsystem: subsystemHotstuff, 69 Help: "duration [seconds; measured with float64 precision] of how long HotStuff's event loop has been idle without processing any event", 70 Buckets: []float64{0.05, 0.2, 0.5, 1, 2, 5}, 71 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 72 }), 73 74 waitDuration: promauto.NewHistogramVec(prometheus.HistogramOpts{ 75 Name: "wait_duration_seconds", 76 Namespace: namespaceConsensus, 77 Subsystem: subsystemHotstuff, 78 
Help: "duration [seconds; measured with float64 precision] of how long an event has been waited in the HotStuff event loop queue before being processed.", 79 Buckets: []float64{0.05, 0.2, 0.5, 1, 2, 5}, 80 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 81 }, []string{"event_type"}), 82 83 curView: promauto.NewGauge(prometheus.GaugeOpts{ 84 Name: "cur_view", 85 Namespace: namespaceConsensus, 86 Subsystem: subsystemHotstuff, 87 Help: "the current view that the event handler has entered", 88 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 89 }), 90 91 qcView: promauto.NewGauge(prometheus.GaugeOpts{ 92 Name: "qc_view", 93 Namespace: namespaceConsensus, 94 Subsystem: subsystemHotstuff, 95 Help: "The view of the newest known QC from HotStuff", 96 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 97 }), 98 99 tcView: promauto.NewGauge(prometheus.GaugeOpts{ 100 Name: "tc_view", 101 Namespace: namespaceConsensus, 102 Subsystem: subsystemHotstuff, 103 Help: "The view of the newest known TC from HotStuff", 104 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 105 }), 106 107 skips: promauto.NewCounter(prometheus.CounterOpts{ 108 Name: "skips_total", 109 Namespace: namespaceConsensus, 110 Subsystem: subsystemHotstuff, 111 Help: "The number of times we skipped ahead some views", 112 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 113 }), 114 115 timeouts: promauto.NewCounter(prometheus.CounterOpts{ 116 Name: "timeouts_total", 117 Namespace: namespaceConsensus, 118 Subsystem: subsystemHotstuff, 119 Help: "The number of views that this replica left due to observing a TC", 120 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 121 }), 122 123 timeoutDuration: promauto.NewGauge(prometheus.GaugeOpts{ 124 Name: "timeout_seconds", 125 Namespace: namespaceConsensus, 126 Subsystem: subsystemHotstuff, 127 Help: "The current length of the timeout", 128 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 129 }), 130 
131 committeeComputationsDuration: promauto.NewHistogram(prometheus.HistogramOpts{ 132 Name: "committee_computations_seconds", 133 Namespace: namespaceConsensus, 134 Subsystem: subsystemHotstuff, 135 Help: "duration [seconds; measured with float64 precision] of how long HotStuff sends computing consensus committee relations", 136 Buckets: []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2}, 137 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 138 }), 139 140 signerComputationsDuration: promauto.NewHistogram(prometheus.HistogramOpts{ 141 Name: "crypto_computations_seconds", 142 Namespace: namespaceConsensus, 143 Subsystem: subsystemHotstuff, 144 Help: "duration [seconds; measured with float64 precision] of how long HotStuff sends with crypto-related operations", 145 Buckets: []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2}, 146 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 147 }), 148 149 validatorComputationsDuration: promauto.NewHistogram(prometheus.HistogramOpts{ 150 Name: "message_validation_seconds", 151 Namespace: namespaceConsensus, 152 Subsystem: subsystemHotstuff, 153 Help: "duration [seconds; measured with float64 precision] of how long HotStuff sends with message-validation", 154 Buckets: []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2}, 155 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 156 }), 157 158 payloadProductionDuration: promauto.NewHistogram(prometheus.HistogramOpts{ 159 Name: "payload_production_seconds", 160 Namespace: namespaceConsensus, 161 Subsystem: subsystemHotstuff, 162 Help: "duration [seconds; measured with float64 precision] of how long HotStuff sends with payload production", 163 Buckets: []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2}, 164 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 165 }), 166 blockProcessingDuration: promauto.NewHistogram(prometheus.HistogramOpts{ 167 Name: "block_processing_seconds", 168 Namespace: namespaceConsensus, 169 Subsystem: subsystemHotstuff, 170 Help: "duration [seconds; 
measured with float64 precision] of how long compliance engine processes one block", 171 Buckets: []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2}, 172 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 173 }), 174 voteProcessingDuration: promauto.NewHistogram(prometheus.HistogramOpts{ 175 Name: "vote_processing_seconds", 176 Namespace: namespaceConsensus, 177 Subsystem: subsystemHotstuff, 178 Help: "duration [seconds; measured with float64 precision] of how long VoteAggregator processes one message", 179 Buckets: []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2}, 180 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 181 }), 182 timeoutProcessingDuration: promauto.NewHistogram(prometheus.HistogramOpts{ 183 Name: "timeout_object_processing_seconds", 184 Namespace: namespaceConsensus, 185 Subsystem: subsystemHotstuff, 186 Help: "duration [seconds; measured with float64 precision] of how long TimeoutAggregator processes one message", 187 Buckets: []float64{0.02, 0.05, 0.1, 0.2, 0.5, 1, 2}, 188 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 189 }), 190 timeoutCollectorsRange: promauto.NewGaugeVec(prometheus.GaugeOpts{ 191 Name: "timeout_collectors_range", 192 Namespace: namespaceConsensus, 193 Subsystem: subsystemHotstuff, 194 Help: "lowest and highest views that we are maintaining TimeoutCollectors for", 195 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 196 }, []string{"prefix"}), 197 numberOfActiveCollectors: promauto.NewGauge(prometheus.GaugeOpts{ 198 Name: "active_collectors", 199 Namespace: namespaceConsensus, 200 Subsystem: subsystemHotstuff, 201 Help: "number of active TimeoutCollectors that the TimeoutAggregator component currently maintains", 202 ConstLabels: prometheus.Labels{LabelChain: chain.String()}, 203 }), 204 } 205 206 return hc 207 } 208 209 // HotStuffBusyDuration reports Metrics C6 HotStuff Busy Duration 210 func (hc *HotstuffCollector) HotStuffBusyDuration(duration time.Duration, event string) { 211 
hc.busyDuration.WithLabelValues(event).Observe(duration.Seconds()) // unit: seconds; with float64 precision 212 } 213 214 // HotStuffIdleDuration reports Metrics C6 HotStuff Idle Duration 215 func (hc *HotstuffCollector) HotStuffIdleDuration(duration time.Duration) { 216 hc.idleDuration.Observe(duration.Seconds()) // unit: seconds; with float64 precision 217 } 218 219 // HotStuffWaitDuration reports Metrics C6 HotStuff Idle Duration - the time between receiving and 220 // enqueueing a message to beginning to process that message. 221 func (hc *HotstuffCollector) HotStuffWaitDuration(duration time.Duration, event string) { 222 hc.waitDuration.WithLabelValues(event).Observe(duration.Seconds()) // unit: seconds; with float64 precision 223 } 224 225 // CountSkipped counts the number of skips we did. 226 func (hc *HotstuffCollector) CountSkipped() { 227 hc.skips.Inc() 228 } 229 230 // CountTimeout tracks the number of views that this replica left due to observing a TC. 231 func (hc *HotstuffCollector) CountTimeout() { 232 hc.timeouts.Inc() 233 } 234 235 // SetCurView reports Metrics C8: Current View 236 func (hc *HotstuffCollector) SetCurView(view uint64) { 237 hc.curView.Set(float64(view)) 238 } 239 240 // SetQCView reports Metrics C9: View of Newest Known QC 241 func (hc *HotstuffCollector) SetQCView(view uint64) { 242 hc.qcView.Set(float64(view)) 243 } 244 245 // SetTCView reports the view of the newest known TC 246 func (hc *HotstuffCollector) SetTCView(view uint64) { 247 hc.tcView.Set(float64(view)) 248 } 249 250 // BlockProcessingDuration measures the time which the compliance engine 251 // spends to process one block proposal. 
252 func (hc *HotstuffCollector) BlockProcessingDuration(duration time.Duration) { 253 hc.blockProcessingDuration.Observe(duration.Seconds()) 254 } 255 256 // VoteProcessingDuration reports the processing time for a single vote 257 func (hc *HotstuffCollector) VoteProcessingDuration(duration time.Duration) { 258 hc.voteProcessingDuration.Observe(duration.Seconds()) 259 } 260 261 // TimeoutObjectProcessingDuration reports the processing time for a TimeoutObject 262 func (hc *HotstuffCollector) TimeoutObjectProcessingDuration(duration time.Duration) { 263 hc.timeoutProcessingDuration.Observe(duration.Seconds()) 264 } 265 266 // SetTimeout sets the current timeout duration. 267 func (hc *HotstuffCollector) SetTimeout(duration time.Duration) { 268 hc.timeoutDuration.Set(duration.Seconds()) // unit: seconds; with float64 precision 269 } 270 271 // CommitteeProcessingDuration measures the time which the HotStuff's core logic 272 // spends in the hotstuff.Committee component, i.e. the time determining consensus 273 // committee relations. 274 func (hc *HotstuffCollector) CommitteeProcessingDuration(duration time.Duration) { 275 hc.committeeComputationsDuration.Observe(duration.Seconds()) // unit: seconds; with float64 precision 276 } 277 278 // SignerProcessingDuration reports the time which the HotStuff's core logic 279 // spends in the hotstuff.Signer component, i.e. the with crypto-related operations. 280 func (hc *HotstuffCollector) SignerProcessingDuration(duration time.Duration) { 281 hc.signerComputationsDuration.Observe(duration.Seconds()) // unit: seconds; with float64 precision 282 } 283 284 // ValidatorProcessingDuration reports the time which the HotStuff's core logic 285 // spends in the hotstuff.Validator component, i.e. the with verifying higher-level 286 // consensus messages. 
287 func (hc *HotstuffCollector) ValidatorProcessingDuration(duration time.Duration) { 288 hc.validatorComputationsDuration.Observe(duration.Seconds()) // unit: seconds; with float64 precision 289 } 290 291 // PayloadProductionDuration reports the time which the HotStuff's core logic 292 // spends in the module.Builder component, i.e. the with generating block payloads 293 func (hc *HotstuffCollector) PayloadProductionDuration(duration time.Duration) { 294 hc.payloadProductionDuration.Observe(duration.Seconds()) // unit: seconds; with float64 precision 295 } 296 297 // TimeoutCollectorsRange collects information from the node's `TimeoutAggregator` component. 298 // Specifically, it measurers the number of views for which we are currently collecting timeouts 299 // (i.e. the number of `TimeoutCollector` instances we are maintaining) and their lowest/highest view. 300 func (hc *HotstuffCollector) TimeoutCollectorsRange(lowestRetainedView uint64, newestViewCreatedCollector uint64, activeCollectors int) { 301 hc.timeoutCollectorsRange.WithLabelValues("lowest_view_of_active_timeout_collectors").Set(float64(lowestRetainedView)) 302 hc.timeoutCollectorsRange.WithLabelValues("newest_view_of_active_timeout_collectors").Set(float64(newestViewCreatedCollector)) 303 hc.numberOfActiveCollectors.Set(float64(activeCollectors)) 304 }