github.com/MetalBlockchain/metalgo@v1.11.9/snow/networking/handler/message_queue.go

// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package handler

import (
	"context"
	"sync"

	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"

	"github.com/MetalBlockchain/metalgo/ids"
	"github.com/MetalBlockchain/metalgo/message"
	"github.com/MetalBlockchain/metalgo/proto/pb/p2p"
	"github.com/MetalBlockchain/metalgo/snow/networking/tracker"
	"github.com/MetalBlockchain/metalgo/snow/validators"
	"github.com/MetalBlockchain/metalgo/utils/buffer"
	"github.com/MetalBlockchain/metalgo/utils/logging"
	"github.com/MetalBlockchain/metalgo/utils/timer/mockable"
)

var _ MessageQueue = (*messageQueue)(nil)

// Message defines individual messages that have been parsed from the network
// and are now pending execution from the chain.
type Message struct {
	// The original message from the peer
	message.InboundMessage
	// The desired engine type to execute this message. If not specified,
	// the current executing engine type is used.
	EngineType p2p.EngineType
}

type MessageQueue interface {
	// Add a message.
	//
	// If called after [Shutdown], the message will immediately be marked as
	// having been handled.
	Push(context.Context, Message)

	// Remove and return a message and its context.
	//
	// If there are no available messages, this function will block until a
	// message becomes available or the queue is [Shutdown].
	Pop() (context.Context, Message, bool)

	// Returns the number of messages currently on the queue.
	Len() int

	// Shutdown and empty the queue.
	Shutdown()
}
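
// A minimal usage sketch (not part of the original file): a consumer
// typically drains the queue by looping on Pop until it reports that the
// queue has been shut down. The queue value [q] and the [handle] function
// below are hypothetical placeholders.
//
//	go func() {
//		for {
//			ctx, msg, ok := q.Pop()
//			if !ok {
//				return // the queue was shut down
//			}
//			handle(ctx, msg)
//			msg.OnFinishedHandling()
//		}
//	}()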

// TODO: Use a better data structure for this.
// We can do something better than pushing to the back of a queue. A multi-level
// queue?
type messageQueue struct {
	// Useful for faking time in tests
	clock   mockable.Clock
	metrics messageQueueMetrics

	log      logging.Logger
	subnetID ids.ID
	// Validator set for the subnet associated with this chain
	vdrs validators.Manager
	// Tracks CPU utilization of each node
	cpuTracker tracker.Tracker

	cond   *sync.Cond
	closed bool
	// Node ID --> Number of messages from this node in [msgAndCtxs]
	nodeToUnprocessedMsgs map[ids.NodeID]int
	// Unprocessed messages
	msgAndCtxs buffer.Deque[*msgAndContext]
}

func NewMessageQueue(
	log logging.Logger,
	subnetID ids.ID,
	vdrs validators.Manager,
	cpuTracker tracker.Tracker,
	metricsNamespace string,
	reg prometheus.Registerer,
) (MessageQueue, error) {
	m := &messageQueue{
		log:                   log,
		subnetID:              subnetID,
		vdrs:                  vdrs,
		cpuTracker:            cpuTracker,
		cond:                  sync.NewCond(&sync.Mutex{}),
		nodeToUnprocessedMsgs: make(map[ids.NodeID]int),
		msgAndCtxs:            buffer.NewUnboundedDeque[*msgAndContext](1 /*=initSize*/),
	}
	return m, m.metrics.initialize(metricsNamespace, reg)
}
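
// A hedged construction sketch (illustrative, not from the original source):
// wiring a queue up with a fresh Prometheus registry. The [log], [subnetID],
// [vdrs], and [cpuTracker] values are assumed to come from the caller, and
// the "handler" namespace is only an example.
//
//	mq, err := NewMessageQueue(
//		log,
//		subnetID,
//		vdrs,
//		cpuTracker,
//		"handler",
//		prometheus.NewRegistry(),
//	)
//	if err != nil {
//		return err
//	}
//	defer mq.Shutdown()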

func (m *messageQueue) Push(ctx context.Context, msg Message) {
	m.cond.L.Lock()
	defer m.cond.L.Unlock()

	if m.closed {
		msg.OnFinishedHandling()
		return
	}

	// Add the message to the queue
	m.msgAndCtxs.PushRight(&msgAndContext{
		msg: msg,
		ctx: ctx,
	})
	m.nodeToUnprocessedMsgs[msg.NodeID()]++

	// Update metrics
	m.metrics.count.With(prometheus.Labels{
		opLabel: msg.Op().String(),
	}).Inc()
	m.metrics.nodesWithMessages.Set(float64(len(m.nodeToUnprocessedMsgs)))

	// Signal a waiting thread
	m.cond.Signal()
}

// FIFO, but skip over messages from senders whose messages have caused us to
// use excessive CPU recently.
func (m *messageQueue) Pop() (context.Context, Message, bool) {
	m.cond.L.Lock()
	defer m.cond.L.Unlock()

	for {
		if m.closed {
			return nil, Message{}, false
		}
		if m.msgAndCtxs.Len() != 0 {
			break
		}
		m.cond.Wait()
	}

	n := m.msgAndCtxs.Len() // note that n > 0
	i := 0
	for {
		if i == n {
			m.log.Debug("canPop is false for all unprocessed messages",
				zap.Int("numMessages", n),
			)
		}

		var (
			msgAndCtx, _ = m.msgAndCtxs.PopLeft()
			msg          = msgAndCtx.msg
			ctx          = msgAndCtx.ctx
			nodeID       = msg.NodeID()
		)

		// See if it's OK to process [msg] next
		if m.canPop(msg) || i == n { // i should never == n but handle anyway as a fail-safe
			m.nodeToUnprocessedMsgs[nodeID]--
			if m.nodeToUnprocessedMsgs[nodeID] == 0 {
				delete(m.nodeToUnprocessedMsgs, nodeID)
			}
			m.metrics.count.With(prometheus.Labels{
				opLabel: msg.Op().String(),
			}).Dec()
			m.metrics.nodesWithMessages.Set(float64(len(m.nodeToUnprocessedMsgs)))
			return ctx, msg, true
		}
		// [nodeID] is causing excessive CPU usage.
		// Push [msg] to the back of [m.msgAndCtxs] and handle it later.
		m.msgAndCtxs.PushRight(msgAndCtx)
		i++
		m.metrics.numExcessiveCPU.Inc()
	}
}
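
// Illustrative trace of the skipping behavior above (nodes and ordering are
// assumed for the example): if the queue holds [A1, B1, A2] and node A is
// over its CPU allowance while node B is not, the first iteration pops A1,
// finds canPop false, and pushes A1 to the back, leaving [B1, A2, A1]. The
// next iteration pops B1, finds canPop true, and returns it; A's messages
// stay queued for later calls.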

func (m *messageQueue) Len() int {
	m.cond.L.Lock()
	defer m.cond.L.Unlock()

	return m.msgAndCtxs.Len()
}

func (m *messageQueue) Shutdown() {
	m.cond.L.Lock()
	defer m.cond.L.Unlock()

	// Remove all the current messages from the queue
	for m.msgAndCtxs.Len() > 0 {
		msgAndCtx, _ := m.msgAndCtxs.PopLeft()
		msgAndCtx.msg.OnFinishedHandling()
	}
	m.nodeToUnprocessedMsgs = nil

	// Update metrics
	m.metrics.count.Reset()
	m.metrics.nodesWithMessages.Set(0)

	// Mark the queue as closed
	m.closed = true
	m.cond.Broadcast()
}

// canPop will return true for at least one message in [m.msgAndCtxs]
func (m *messageQueue) canPop(msg message.InboundMessage) bool {
	// Always pop connected and disconnected messages.
	if op := msg.Op(); op == message.ConnectedOp || op == message.DisconnectedOp || op == message.ConnectedSubnetOp {
		return true
	}

	// If the deadline to handle [msg] has passed, always pop it.
	// It will be dropped immediately.
	if expiration := msg.Expiration(); m.clock.Time().After(expiration) {
		return true
	}
	// Every node has some allowed CPU allocation depending on
	// the number of nodes with unprocessed messages.
	baseMaxCPU := 1 / float64(len(m.nodeToUnprocessedMsgs))
	nodeID := msg.NodeID()
	weight := m.vdrs.GetWeight(m.subnetID, nodeID)

	var portionWeight float64
	if totalVdrsWeight, err := m.vdrs.TotalWeight(m.subnetID); err != nil {
		// The sum of validator weights should never overflow, but if it does,
		// we treat portionWeight as 0.
		m.log.Error("failed to get total weight of validators",
			zap.Stringer("subnetID", m.subnetID),
			zap.Error(err),
		)
	} else if totalVdrsWeight == 0 {
		// The sum of validator weights should never be 0, but handle that case
		// for completeness here to avoid divide by 0.
		m.log.Warn("validator set is empty",
			zap.Stringer("subnetID", m.subnetID),
		)
	} else {
		portionWeight = float64(weight) / float64(totalVdrsWeight)
	}

	// Validators are allowed to use more CPU. More weight --> more CPU use allowed.
	recentCPUUsage := m.cpuTracker.Usage(nodeID, m.clock.Time())
	maxCPU := baseMaxCPU + (1.0-baseMaxCPU)*portionWeight
	return recentCPUUsage <= maxCPU
}
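
// Worked example of the allowance computed above (numbers are illustrative):
// with 4 nodes holding unprocessed messages, baseMaxCPU = 1/4 = 0.25. If the
// sender holds 10% of the subnet's total validator weight, portionWeight = 0.1
// and maxCPU = 0.25 + (1-0.25)*0.1 = 0.325, so the message is poppable while
// the sender's recent CPU usage is at most 0.325.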

type msgAndContext struct {
	msg Message
	ctx context.Context
}