github.com/MetalBlockchain/metalgo@v1.11.9/snow/networking/handler/message_queue.go

// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package handler

import (
	"context"
	"sync"

	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"

	"github.com/MetalBlockchain/metalgo/ids"
	"github.com/MetalBlockchain/metalgo/message"
	"github.com/MetalBlockchain/metalgo/proto/pb/p2p"
	"github.com/MetalBlockchain/metalgo/snow/networking/tracker"
	"github.com/MetalBlockchain/metalgo/snow/validators"
	"github.com/MetalBlockchain/metalgo/utils/buffer"
	"github.com/MetalBlockchain/metalgo/utils/logging"
	"github.com/MetalBlockchain/metalgo/utils/timer/mockable"
)

var _ MessageQueue = (*messageQueue)(nil)

// Message defines individual messages that have been parsed from the network
// and are now pending execution by the chain.
type Message struct {
	// The original message from the peer
	message.InboundMessage
	// The desired engine type to execute this message. If not specified,
	// the currently executing engine type is used.
	EngineType p2p.EngineType
}

type MessageQueue interface {
	// Add a message.
	//
	// If called after [Shutdown], the message will immediately be marked as
	// having been handled.
	Push(context.Context, Message)

	// Remove and return a message and its context.
	//
	// If there are no available messages, this function will block until a
	// message becomes available or the queue is [Shutdown].
	Pop() (context.Context, Message, bool)

	// Returns the number of messages currently on the queue.
	Len() int

	// Shutdown and empty the queue.
	Shutdown()
}

// TODO: Use a better data structure for this.
// We can do something better than pushing to the back of a queue. A
// multi-level queue?
type messageQueue struct {
	// Useful for faking time in tests
	clock   mockable.Clock
	metrics messageQueueMetrics

	log      logging.Logger
	subnetID ids.ID
	// Validator set for the chain associated with this message queue
	vdrs validators.Manager
	// Tracks CPU utilization of each node
	cpuTracker tracker.Tracker

	cond   *sync.Cond
	closed bool
	// Node ID --> Messages this node has in [msgAndCtxs]
	nodeToUnprocessedMsgs map[ids.NodeID]int
	// Unprocessed messages
	msgAndCtxs buffer.Deque[*msgAndContext]
}

func NewMessageQueue(
	log logging.Logger,
	subnetID ids.ID,
	vdrs validators.Manager,
	cpuTracker tracker.Tracker,
	metricsNamespace string,
	reg prometheus.Registerer,
) (MessageQueue, error) {
	m := &messageQueue{
		log:                   log,
		subnetID:              subnetID,
		vdrs:                  vdrs,
		cpuTracker:            cpuTracker,
		cond:                  sync.NewCond(&sync.Mutex{}),
		nodeToUnprocessedMsgs: make(map[ids.NodeID]int),
		msgAndCtxs:            buffer.NewUnboundedDeque[*msgAndContext](1 /*=initSize*/),
	}
	return m, m.metrics.initialize(metricsNamespace, reg)
}

func (m *messageQueue) Push(ctx context.Context, msg Message) {
	m.cond.L.Lock()
	defer m.cond.L.Unlock()

	if m.closed {
		msg.OnFinishedHandling()
		return
	}

	// Add the message to the queue
	m.msgAndCtxs.PushRight(&msgAndContext{
		msg: msg,
		ctx: ctx,
	})
	m.nodeToUnprocessedMsgs[msg.NodeID()]++

	// Update metrics
	m.metrics.count.With(prometheus.Labels{
		opLabel: msg.Op().String(),
	}).Inc()
	m.metrics.nodesWithMessages.Set(float64(len(m.nodeToUnprocessedMsgs)))

	// Signal a waiting thread
	m.cond.Signal()
}

// FIFO, but skip over messages from senders whose messages have caused us to
// use excessive CPU recently.
func (m *messageQueue) Pop() (context.Context, Message, bool) {
	m.cond.L.Lock()
	defer m.cond.L.Unlock()

	for {
		if m.closed {
			return nil, Message{}, false
		}
		if m.msgAndCtxs.Len() != 0 {
			break
		}
		m.cond.Wait()
	}

	n := m.msgAndCtxs.Len() // note that n > 0
	i := 0
	for {
		if i == n {
			m.log.Debug("canPop is false for all unprocessed messages",
				zap.Int("numMessages", n),
			)
		}

		var (
			msgAndCtx, _ = m.msgAndCtxs.PopLeft()
			msg          = msgAndCtx.msg
			ctx          = msgAndCtx.ctx
			nodeID       = msg.NodeID()
		)

		// See if it's OK to process [msg] next
		if m.canPop(msg) || i == n { // i should never == n but handle anyway as a fail-safe
			m.nodeToUnprocessedMsgs[nodeID]--
			if m.nodeToUnprocessedMsgs[nodeID] == 0 {
				delete(m.nodeToUnprocessedMsgs, nodeID)
			}
			m.metrics.count.With(prometheus.Labels{
				opLabel: msg.Op().String(),
			}).Dec()
			m.metrics.nodesWithMessages.Set(float64(len(m.nodeToUnprocessedMsgs)))
			return ctx, msg, true
		}
		// [msg.nodeID] is causing excessive CPU usage.
		// Push [msg] to the back of [m.msgAndCtxs] and handle it later.
		m.msgAndCtxs.PushRight(msgAndCtx)
		i++
		m.metrics.numExcessiveCPU.Inc()
	}
}

func (m *messageQueue) Len() int {
	m.cond.L.Lock()
	defer m.cond.L.Unlock()

	return m.msgAndCtxs.Len()
}

func (m *messageQueue) Shutdown() {
	m.cond.L.Lock()
	defer m.cond.L.Unlock()

	// Remove all the current messages from the queue
	for m.msgAndCtxs.Len() > 0 {
		msgAndCtx, _ := m.msgAndCtxs.PopLeft()
		msgAndCtx.msg.OnFinishedHandling()
	}
	m.nodeToUnprocessedMsgs = nil

	// Update metrics
	m.metrics.count.Reset()
	m.metrics.nodesWithMessages.Set(0)

	// Mark the queue as closed
	m.closed = true
	m.cond.Broadcast()
}

// canPop will return true for at least one message in [m.msgAndCtxs]
func (m *messageQueue) canPop(msg message.InboundMessage) bool {
	// Always pop connected and disconnected messages.
	if op := msg.Op(); op == message.ConnectedOp || op == message.DisconnectedOp || op == message.ConnectedSubnetOp {
		return true
	}

	// If the deadline to handle [msg] has passed, always pop it.
	// It will be dropped immediately.
	if expiration := msg.Expiration(); m.clock.Time().After(expiration) {
		return true
	}
	// Every node has some allowed CPU allocation depending on
	// the number of nodes with unprocessed messages.
	baseMaxCPU := 1 / float64(len(m.nodeToUnprocessedMsgs))
	nodeID := msg.NodeID()
	weight := m.vdrs.GetWeight(m.subnetID, nodeID)

	var portionWeight float64
	if totalVdrsWeight, err := m.vdrs.TotalWeight(m.subnetID); err != nil {
		// The sum of validator weights should never overflow, but if it does,
		// we treat portionWeight as 0.
		m.log.Error("failed to get total weight of validators",
			zap.Stringer("subnetID", m.subnetID),
			zap.Error(err),
		)
	} else if totalVdrsWeight == 0 {
		// The sum of validator weights should never be 0, but handle that case
		// here for completeness to avoid dividing by 0.
		m.log.Warn("validator set is empty",
			zap.Stringer("subnetID", m.subnetID),
		)
	} else {
		portionWeight = float64(weight) / float64(totalVdrsWeight)
	}

	// Validators are allowed to use more CPU. More weight --> more CPU use allowed.
	recentCPUUsage := m.cpuTracker.Usage(nodeID, m.clock.Time())
	maxCPU := baseMaxCPU + (1.0-baseMaxCPU)*portionWeight
	return recentCPUUsage <= maxCPU
}

type msgAndContext struct {
	msg Message
	ctx context.Context
}
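The fairness check in canPop combines a per-node base allocation with a stake-weighted bonus: if, say, four nodes currently have unprocessed messages, baseMaxCPU is 0.25, and a validator holding 20% of the subnet's total weight may use up to 0.25 + 0.75*0.2 = 0.4 of recent CPU before its messages are deferred to the back of the queue. The sketch below is illustrative only and not part of the original file; drainQueue and its process callback are hypothetical names. Assuming it lives alongside the queue in package handler, it shows the consumer loop a caller such as the chain handler would run against the MessageQueue interface: Pop blocks until a message is available or Shutdown is called, and Shutdown marks any messages still queued as handled.

// drainQueue is an illustrative sketch (not part of the original file) of the
// consumer side of MessageQueue. One goroutine Pushes parsed inbound messages;
// this loop Pops them in roughly FIFO order, with the queue deferring nodes
// that have recently used excessive CPU, until Shutdown closes the queue.
func drainQueue(q MessageQueue, process func(context.Context, Message) error) {
	for {
		ctx, msg, ok := q.Pop()
		if !ok {
			// Pop returns false only after Shutdown; any messages still in
			// the queue were already marked as handled.
			return
		}
		err := process(ctx, msg)
		// The consumer is expected to mark the message as handled once
		// processing is complete so the networking layer can release the
		// resources reserved for it.
		msg.OnFinishedHandling()
		if err != nil {
			// Error handling is caller-specific; this sketch simply stops.
			return
		}
	}
}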