github.com/MetalBlockchain/metalgo@v1.11.9/network/p2p/gossip/gossip.go (about)

     1  // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
     2  // See the file LICENSE for licensing terms.
     3  
     4  package gossip
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/prometheus/client_golang/prometheus"
    14  	"go.uber.org/zap"
    15  
    16  	"github.com/MetalBlockchain/metalgo/cache"
    17  	"github.com/MetalBlockchain/metalgo/ids"
    18  	"github.com/MetalBlockchain/metalgo/network/p2p"
    19  	"github.com/MetalBlockchain/metalgo/snow/engine/common"
    20  	"github.com/MetalBlockchain/metalgo/utils/bloom"
    21  	"github.com/MetalBlockchain/metalgo/utils/buffer"
    22  	"github.com/MetalBlockchain/metalgo/utils/logging"
    23  	"github.com/MetalBlockchain/metalgo/utils/set"
    24  )
    25  
const (
	// ioLabel is the metric label name for the direction of a message;
	// sentIO and receivedIO are its values.
	ioLabel    = "io"
	sentIO     = "sent"
	receivedIO = "received"

	// typeLabel is the metric label name for the kind of gossip (pushType,
	// pullType) or the state of a tracked gossipable (unsentType, sentType).
	typeLabel  = "type"
	pushType   = "push"
	pullType   = "pull"
	unsentType = "unsent"
	sentType   = "sent"

	// defaultGossipableCount is the initial capacity of the slice that
	// collects marshalled gossipables for a single push message.
	defaultGossipableCount = 64
)
    39  
var (
	// Compile-time checks that these types implement Gossiper.
	_ Gossiper = (*ValidatorGossiper)(nil)
	_ Gossiper = (*PullGossiper[*testTx])(nil)
	_ Gossiper = (*NoOpGossiper)(nil)

	// Compile-time checks that the trivial sets implement Set.
	_ Set[*testTx] = (*EmptySet[*testTx])(nil)
	_ Set[*testTx] = (*FullSet[*testTx])(nil)

	// ioTypeLabels are the label names of the count and bytes counters. The
	// label sets below select one (direction, kind) series of those counters.
	ioTypeLabels   = []string{ioLabel, typeLabel}
	sentPushLabels = prometheus.Labels{
		ioLabel:   sentIO,
		typeLabel: pushType,
	}
	receivedPushLabels = prometheus.Labels{
		ioLabel:   receivedIO,
		typeLabel: pushType,
	}
	sentPullLabels = prometheus.Labels{
		ioLabel:   sentIO,
		typeLabel: pullType,
	}
	receivedPullLabels = prometheus.Labels{
		ioLabel:   receivedIO,
		typeLabel: pullType,
	}
	// typeLabels are the label names of the tracking and top validators
	// gauges. unsentLabels and sentLabels select the series for gossipables
	// that have not yet been pushed and for those that already have.
	typeLabels   = []string{typeLabel}
	unsentLabels = prometheus.Labels{
		typeLabel: unsentType,
	}
	sentLabels = prometheus.Labels{
		typeLabel: sentType,
	}

	// Configuration errors. The first four are returned by
	// BranchingFactor.Verify and the remaining three by NewPushGossiper.
	ErrInvalidNumValidators     = errors.New("num validators cannot be negative")
	ErrInvalidNumNonValidators  = errors.New("num non-validators cannot be negative")
	ErrInvalidNumPeers          = errors.New("num peers cannot be negative")
	ErrInvalidNumToGossip       = errors.New("must gossip to at least one peer")
	ErrInvalidDiscardedSize     = errors.New("discarded size cannot be negative")
	ErrInvalidTargetGossipSize  = errors.New("target gossip size cannot be negative")
	ErrInvalidRegossipFrequency = errors.New("re-gossip frequency cannot be negative")

	// errEmptySetCantAdd is returned by every call to EmptySet.Add.
	errEmptySetCantAdd = errors.New("empty set can not add")
)
    83  
// Gossiper gossips Gossipables to other nodes.
//
// A single call to Gossip represents one cycle; Every can be used to invoke it
// periodically.
type Gossiper interface {
	// Gossip runs a cycle of gossip. Returns an error if we failed to gossip.
	Gossip(ctx context.Context) error
}
    89  
// ValidatorGossiper wraps a Gossiper and only calls its [Gossip] when NodeID
// is reported as a member of Validators.
type ValidatorGossiper struct {
	// Gossiper is the wrapped gossiper that performs the actual gossip cycle.
	Gossiper

	// NodeID is the node whose membership in Validators gates each cycle.
	NodeID     ids.NodeID
	// Validators is consulted on every Gossip call.
	Validators p2p.ValidatorSet
}
    97  
// Metrics that are tracked across a gossip protocol. A given protocol should
// only use a single instance of Metrics.
//
// Instances are created and registered by NewMetrics.
type Metrics struct {
	// count is the number of gossipables, labelled by direction and kind.
	count                   *prometheus.CounterVec
	// bytes is the number of gossiped bytes, labelled by direction and kind.
	bytes                   *prometheus.CounterVec
	// tracking is the number of gossipables being tracked, labelled by
	// whether they are still unsent or have already been sent.
	tracking                *prometheus.GaugeVec
	// trackingLifetimeAverage is the average duration, in nanoseconds, that a
	// gossipable has been tracked.
	trackingLifetimeAverage prometheus.Gauge
	// topValidators is the number of validators that gossipables are sent to
	// because of their stake.
	topValidators           *prometheus.GaugeVec
}
   107  
   108  // NewMetrics returns a common set of metrics
   109  func NewMetrics(
   110  	metrics prometheus.Registerer,
   111  	namespace string,
   112  ) (Metrics, error) {
   113  	m := Metrics{
   114  		count: prometheus.NewCounterVec(
   115  			prometheus.CounterOpts{
   116  				Namespace: namespace,
   117  				Name:      "gossip_count",
   118  				Help:      "amount of gossip (n)",
   119  			},
   120  			ioTypeLabels,
   121  		),
   122  		bytes: prometheus.NewCounterVec(
   123  			prometheus.CounterOpts{
   124  				Namespace: namespace,
   125  				Name:      "gossip_bytes",
   126  				Help:      "amount of gossip (bytes)",
   127  			},
   128  			ioTypeLabels,
   129  		),
   130  		tracking: prometheus.NewGaugeVec(
   131  			prometheus.GaugeOpts{
   132  				Namespace: namespace,
   133  				Name:      "gossip_tracking",
   134  				Help:      "number of gossipables being tracked",
   135  			},
   136  			typeLabels,
   137  		),
   138  		trackingLifetimeAverage: prometheus.NewGauge(prometheus.GaugeOpts{
   139  			Namespace: namespace,
   140  			Name:      "gossip_tracking_lifetime_average",
   141  			Help:      "average duration a gossipable has been tracked (ns)",
   142  		}),
   143  		topValidators: prometheus.NewGaugeVec(
   144  			prometheus.GaugeOpts{
   145  				Namespace: namespace,
   146  				Name:      "top_validators",
   147  				Help:      "number of validators gossipables are sent to due to stake",
   148  			},
   149  			typeLabels,
   150  		),
   151  	}
   152  	err := errors.Join(
   153  		metrics.Register(m.count),
   154  		metrics.Register(m.bytes),
   155  		metrics.Register(m.tracking),
   156  		metrics.Register(m.trackingLifetimeAverage),
   157  		metrics.Register(m.topValidators),
   158  	)
   159  	return m, err
   160  }
   161  
   162  func (m *Metrics) observeMessage(labels prometheus.Labels, count int, bytes int) error {
   163  	countMetric, err := m.count.GetMetricWith(labels)
   164  	if err != nil {
   165  		return fmt.Errorf("failed to get count metric: %w", err)
   166  	}
   167  
   168  	bytesMetric, err := m.bytes.GetMetricWith(labels)
   169  	if err != nil {
   170  		return fmt.Errorf("failed to get bytes metric: %w", err)
   171  	}
   172  
   173  	countMetric.Add(float64(count))
   174  	bytesMetric.Add(float64(bytes))
   175  	return nil
   176  }
   177  
   178  func (v ValidatorGossiper) Gossip(ctx context.Context) error {
   179  	if !v.Validators.Has(ctx, v.NodeID) {
   180  		return nil
   181  	}
   182  
   183  	return v.Gossiper.Gossip(ctx)
   184  }
   185  
   186  func NewPullGossiper[T Gossipable](
   187  	log logging.Logger,
   188  	marshaller Marshaller[T],
   189  	set Set[T],
   190  	client *p2p.Client,
   191  	metrics Metrics,
   192  	pollSize int,
   193  ) *PullGossiper[T] {
   194  	return &PullGossiper[T]{
   195  		log:        log,
   196  		marshaller: marshaller,
   197  		set:        set,
   198  		client:     client,
   199  		metrics:    metrics,
   200  		pollSize:   pollSize,
   201  	}
   202  }
   203  
// PullGossiper requests gossipables from peers and adds the ones it receives
// to its set.
type PullGossiper[T Gossipable] struct {
	// log receives debug output about responses and metric update failures.
	log        logging.Logger
	// marshaller decodes the gossipables contained in a response.
	marshaller Marshaller[T]
	// set supplies the filter sent with each request and receives every
	// gossipable that was decoded from a response.
	set        Set[T]
	// client sends the requests.
	client     *p2p.Client
	// metrics records the count and bytes of received pull gossip.
	metrics    Metrics
	// pollSize is the number of requests issued per Gossip call.
	pollSize   int
}
   212  
   213  func (p *PullGossiper[_]) Gossip(ctx context.Context) error {
   214  	msgBytes, err := MarshalAppRequest(p.set.GetFilter())
   215  	if err != nil {
   216  		return err
   217  	}
   218  
   219  	for i := 0; i < p.pollSize; i++ {
   220  		err := p.client.AppRequestAny(ctx, msgBytes, p.handleResponse)
   221  		if err != nil && !errors.Is(err, p2p.ErrNoPeers) {
   222  			return err
   223  		}
   224  	}
   225  
   226  	return nil
   227  }
   228  
   229  func (p *PullGossiper[_]) handleResponse(
   230  	_ context.Context,
   231  	nodeID ids.NodeID,
   232  	responseBytes []byte,
   233  	err error,
   234  ) {
   235  	if err != nil {
   236  		p.log.Debug(
   237  			"failed gossip request",
   238  			zap.Stringer("nodeID", nodeID),
   239  			zap.Error(err),
   240  		)
   241  		return
   242  	}
   243  
   244  	gossip, err := ParseAppResponse(responseBytes)
   245  	if err != nil {
   246  		p.log.Debug("failed to unmarshal gossip response", zap.Error(err))
   247  		return
   248  	}
   249  
   250  	receivedBytes := 0
   251  	for _, bytes := range gossip {
   252  		receivedBytes += len(bytes)
   253  
   254  		gossipable, err := p.marshaller.UnmarshalGossip(bytes)
   255  		if err != nil {
   256  			p.log.Debug(
   257  				"failed to unmarshal gossip",
   258  				zap.Stringer("nodeID", nodeID),
   259  				zap.Error(err),
   260  			)
   261  			continue
   262  		}
   263  
   264  		gossipID := gossipable.GossipID()
   265  		p.log.Debug(
   266  			"received gossip",
   267  			zap.Stringer("nodeID", nodeID),
   268  			zap.Stringer("id", gossipID),
   269  		)
   270  		if err := p.set.Add(gossipable); err != nil {
   271  			p.log.Debug(
   272  				"failed to add gossip to the known set",
   273  				zap.Stringer("nodeID", nodeID),
   274  				zap.Stringer("id", gossipID),
   275  				zap.Error(err),
   276  			)
   277  			continue
   278  		}
   279  	}
   280  
   281  	if err := p.metrics.observeMessage(receivedPullLabels, len(gossip), receivedBytes); err != nil {
   282  		p.log.Error("failed to update metrics",
   283  			zap.Error(err),
   284  		)
   285  	}
   286  }
   287  
   288  // NewPushGossiper returns an instance of PushGossiper
   289  func NewPushGossiper[T Gossipable](
   290  	marshaller Marshaller[T],
   291  	mempool Set[T],
   292  	validators p2p.ValidatorSubset,
   293  	client *p2p.Client,
   294  	metrics Metrics,
   295  	gossipParams BranchingFactor,
   296  	regossipParams BranchingFactor,
   297  	discardedSize int,
   298  	targetGossipSize int,
   299  	maxRegossipFrequency time.Duration,
   300  ) (*PushGossiper[T], error) {
   301  	if err := gossipParams.Verify(); err != nil {
   302  		return nil, fmt.Errorf("invalid gossip params: %w", err)
   303  	}
   304  	if err := regossipParams.Verify(); err != nil {
   305  		return nil, fmt.Errorf("invalid regossip params: %w", err)
   306  	}
   307  	switch {
   308  	case discardedSize < 0:
   309  		return nil, ErrInvalidDiscardedSize
   310  	case targetGossipSize < 0:
   311  		return nil, ErrInvalidTargetGossipSize
   312  	case maxRegossipFrequency < 0:
   313  		return nil, ErrInvalidRegossipFrequency
   314  	}
   315  
   316  	return &PushGossiper[T]{
   317  		marshaller:           marshaller,
   318  		set:                  mempool,
   319  		validators:           validators,
   320  		client:               client,
   321  		metrics:              metrics,
   322  		gossipParams:         gossipParams,
   323  		regossipParams:       regossipParams,
   324  		targetGossipSize:     targetGossipSize,
   325  		maxRegossipFrequency: maxRegossipFrequency,
   326  
   327  		tracking:   make(map[ids.ID]*tracking),
   328  		toGossip:   buffer.NewUnboundedDeque[T](0),
   329  		toRegossip: buffer.NewUnboundedDeque[T](0),
   330  		discarded:  &cache.LRU[ids.ID, struct{}]{Size: discardedSize},
   331  	}, nil
   332  }
   333  
// PushGossiper broadcasts gossip to peers randomly in the network
//
// Gossipables are queued with Add and flushed with Gossip. Instances should be
// created with NewPushGossiper, which initializes the internal queues.
type PushGossiper[T Gossipable] struct {
	// marshaller encodes gossipables before they are sent.
	marshaller Marshaller[T]
	// set is checked before sending; queued gossipables that it no longer
	// has are dropped instead of being gossiped.
	set        Set[T]
	// validators supplies the top validators by stake for each push.
	validators p2p.ValidatorSubset
	// client sends the gossip messages.
	client     *p2p.Client
	// metrics records sent gossip and the state of the queues.
	metrics    Metrics

	// gossipParams selects recipients for gossipables sent for the first time
	// and regossipParams for gossipables that are sent again.
	gossipParams         BranchingFactor
	regossipParams       BranchingFactor
	// targetGossipSize is the number of marshalled bytes after which no
	// further gossipables are added to a message.
	targetGossipSize     int
	// maxRegossipFrequency is the time that must have passed since a
	// gossipable was last sent before it is sent again.
	maxRegossipFrequency time.Duration

	// lock is held by Gossip and Add while they use the fields below.
	lock         sync.Mutex
	// tracking holds the bookkeeping entry of every queued gossipable.
	tracking     map[ids.ID]*tracking
	addedTimeSum float64 // unix nanoseconds
	// toGossip queues gossipables that have not been sent yet; toRegossip
	// queues those that have, ordered by when they were last sent.
	toGossip     buffer.Deque[T]
	toRegossip   buffer.Deque[T]
	discarded    *cache.LRU[ids.ID, struct{}] // discarded attempts to avoid overgossiping transactions that are frequently dropped
}
   354  
// BranchingFactor configures which peers a gossip message is sent to. Use
// Verify to check that a configuration is usable.
type BranchingFactor struct {
	// StakePercentage determines the percentage of stake that should have
	// gossip sent to based on the inverse CDF of stake weights. This value does
	// not account for the connectivity of the nodes.
	StakePercentage float64
	// Validators specifies the number of connected validators, in addition to
	// any validators sent from the StakePercentage parameter, to send gossip
	// to. These validators are sampled uniformly rather than by stake.
	Validators int
	// NonValidators specifies the number of connected non-validators to send
	// gossip to.
	NonValidators int
	// Peers specifies the number of connected validators or non-validators, in
	// addition to the number sent due to other configs, to send gossip to.
	Peers int
}
   371  
   372  func (b *BranchingFactor) Verify() error {
   373  	switch {
   374  	case b.Validators < 0:
   375  		return ErrInvalidNumValidators
   376  	case b.NonValidators < 0:
   377  		return ErrInvalidNumNonValidators
   378  	case b.Peers < 0:
   379  		return ErrInvalidNumPeers
   380  	case max(b.Validators, b.NonValidators, b.Peers) == 0:
   381  		return ErrInvalidNumToGossip
   382  	default:
   383  		return nil
   384  	}
   385  }
   386  
// tracking is the bookkeeping PushGossiper keeps for each queued gossipable.
type tracking struct {
	addedTime    float64 // unix nanoseconds
	// lastGossiped is the time the gossipable was last sent. It is the zero
	// time until the first send, except for recently discarded gossipables,
	// which Add stamps with the time they were added.
	lastGossiped time.Time
}
   391  
   392  // Gossip flushes any queued gossipables.
   393  func (p *PushGossiper[T]) Gossip(ctx context.Context) error {
   394  	var (
   395  		now         = time.Now()
   396  		nowUnixNano = float64(now.UnixNano())
   397  	)
   398  
   399  	p.lock.Lock()
   400  	defer func() {
   401  		p.updateMetrics(nowUnixNano)
   402  		p.lock.Unlock()
   403  	}()
   404  
   405  	if len(p.tracking) == 0 {
   406  		return nil
   407  	}
   408  
   409  	if err := p.gossip(
   410  		ctx,
   411  		now,
   412  		p.gossipParams,
   413  		p.toGossip,
   414  		p.toRegossip,
   415  		&cache.Empty[ids.ID, struct{}]{}, // Don't mark dropped unsent transactions as discarded
   416  		unsentLabels,
   417  	); err != nil {
   418  		return fmt.Errorf("unexpected error during gossip: %w", err)
   419  	}
   420  
   421  	if err := p.gossip(
   422  		ctx,
   423  		now,
   424  		p.regossipParams,
   425  		p.toRegossip,
   426  		p.toRegossip,
   427  		p.discarded, // Mark dropped sent transactions as discarded
   428  		sentLabels,
   429  	); err != nil {
   430  		return fmt.Errorf("unexpected error during regossip: %w", err)
   431  	}
   432  	return nil
   433  }
   434  
   435  func (p *PushGossiper[T]) gossip(
   436  	ctx context.Context,
   437  	now time.Time,
   438  	gossipParams BranchingFactor,
   439  	toGossip buffer.Deque[T],
   440  	toRegossip buffer.Deque[T],
   441  	discarded cache.Cacher[ids.ID, struct{}],
   442  	metricsLabels prometheus.Labels,
   443  ) error {
   444  	var (
   445  		sentBytes                   = 0
   446  		gossip                      = make([][]byte, 0, defaultGossipableCount)
   447  		maxLastGossipTimeToRegossip = now.Add(-p.maxRegossipFrequency)
   448  	)
   449  
   450  	for sentBytes < p.targetGossipSize {
   451  		gossipable, ok := toGossip.PopLeft()
   452  		if !ok {
   453  			break
   454  		}
   455  
   456  		// Ensure item is still in the set before we gossip.
   457  		gossipID := gossipable.GossipID()
   458  		tracking := p.tracking[gossipID]
   459  		if !p.set.Has(gossipID) {
   460  			delete(p.tracking, gossipID)
   461  			p.addedTimeSum -= tracking.addedTime
   462  			discarded.Put(gossipID, struct{}{}) // Cache that the item was dropped
   463  			continue
   464  		}
   465  
   466  		// Ensure we don't attempt to send a gossipable too frequently.
   467  		if maxLastGossipTimeToRegossip.Before(tracking.lastGossiped) {
   468  			// Put the gossipable on the front of the queue to keep items sorted
   469  			// by last issuance time.
   470  			toGossip.PushLeft(gossipable)
   471  			break
   472  		}
   473  
   474  		bytes, err := p.marshaller.MarshalGossip(gossipable)
   475  		if err != nil {
   476  			delete(p.tracking, gossipID)
   477  			p.addedTimeSum -= tracking.addedTime
   478  			return err
   479  		}
   480  
   481  		gossip = append(gossip, bytes)
   482  		sentBytes += len(bytes)
   483  		toRegossip.PushRight(gossipable)
   484  		tracking.lastGossiped = now
   485  	}
   486  
   487  	// If there is nothing to gossip, we can exit early.
   488  	if len(gossip) == 0 {
   489  		return nil
   490  	}
   491  
   492  	// Send gossipables to peers
   493  	msgBytes, err := MarshalAppGossip(gossip)
   494  	if err != nil {
   495  		return err
   496  	}
   497  
   498  	if err := p.metrics.observeMessage(sentPushLabels, len(gossip), sentBytes); err != nil {
   499  		return err
   500  	}
   501  
   502  	topValidatorsMetric, err := p.metrics.topValidators.GetMetricWith(metricsLabels)
   503  	if err != nil {
   504  		return fmt.Errorf("failed to get top validators metric: %w", err)
   505  	}
   506  
   507  	validatorsByStake := p.validators.Top(ctx, gossipParams.StakePercentage)
   508  	topValidatorsMetric.Set(float64(len(validatorsByStake)))
   509  
   510  	return p.client.AppGossip(
   511  		ctx,
   512  		common.SendConfig{
   513  			NodeIDs:       set.Of(validatorsByStake...),
   514  			Validators:    gossipParams.Validators,
   515  			NonValidators: gossipParams.NonValidators,
   516  			Peers:         gossipParams.Peers,
   517  		},
   518  		msgBytes,
   519  	)
   520  }
   521  
   522  // Add enqueues new gossipables to be pushed. If a gossiable is already tracked,
   523  // it is not added again.
   524  func (p *PushGossiper[T]) Add(gossipables ...T) {
   525  	var (
   526  		now         = time.Now()
   527  		nowUnixNano = float64(now.UnixNano())
   528  	)
   529  
   530  	p.lock.Lock()
   531  	defer func() {
   532  		p.updateMetrics(nowUnixNano)
   533  		p.lock.Unlock()
   534  	}()
   535  
   536  	// Add new gossipables to be sent.
   537  	for _, gossipable := range gossipables {
   538  		gossipID := gossipable.GossipID()
   539  		if _, ok := p.tracking[gossipID]; ok {
   540  			continue
   541  		}
   542  
   543  		tracking := &tracking{
   544  			addedTime: nowUnixNano,
   545  		}
   546  		if _, ok := p.discarded.Get(gossipID); ok {
   547  			// Pretend that recently discarded transactions were just gossiped.
   548  			tracking.lastGossiped = now
   549  			p.toRegossip.PushRight(gossipable)
   550  		} else {
   551  			p.toGossip.PushRight(gossipable)
   552  		}
   553  		p.tracking[gossipID] = tracking
   554  		p.addedTimeSum += nowUnixNano
   555  	}
   556  }
   557  
   558  func (p *PushGossiper[_]) updateMetrics(nowUnixNano float64) {
   559  	var (
   560  		numUnsent       = float64(p.toGossip.Len())
   561  		numSent         = float64(p.toRegossip.Len())
   562  		numTracking     = numUnsent + numSent
   563  		averageLifetime float64
   564  	)
   565  	if numTracking != 0 {
   566  		averageLifetime = nowUnixNano - p.addedTimeSum/numTracking
   567  	}
   568  
   569  	p.metrics.tracking.With(unsentLabels).Set(numUnsent)
   570  	p.metrics.tracking.With(sentLabels).Set(numSent)
   571  	p.metrics.trackingLifetimeAverage.Set(averageLifetime)
   572  }
   573  
   574  // Every calls [Gossip] every [frequency] amount of time.
   575  func Every(ctx context.Context, log logging.Logger, gossiper Gossiper, frequency time.Duration) {
   576  	ticker := time.NewTicker(frequency)
   577  	defer ticker.Stop()
   578  
   579  	for {
   580  		select {
   581  		case <-ticker.C:
   582  			if err := gossiper.Gossip(ctx); err != nil {
   583  				log.Warn("failed to gossip", zap.Error(err))
   584  			}
   585  		case <-ctx.Done():
   586  			log.Debug("shutting down gossip")
   587  			return
   588  		}
   589  	}
   590  }
   591  
   592  type NoOpGossiper struct{}
   593  
   594  func (NoOpGossiper) Gossip(context.Context) error {
   595  	return nil
   596  }
   597  
   598  type TestGossiper struct {
   599  	GossipF func(ctx context.Context) error
   600  }
   601  
   602  func (t *TestGossiper) Gossip(ctx context.Context) error {
   603  	return t.GossipF(ctx)
   604  }
   605  
   606  type EmptySet[T Gossipable] struct{}
   607  
   608  func (EmptySet[_]) Gossip(context.Context) error {
   609  	return nil
   610  }
   611  
   612  func (EmptySet[T]) Add(T) error {
   613  	return errEmptySetCantAdd
   614  }
   615  
   616  func (EmptySet[T]) Has(ids.ID) bool {
   617  	return false
   618  }
   619  
   620  func (EmptySet[T]) Iterate(func(gossipable T) bool) {}
   621  
   622  func (EmptySet[_]) GetFilter() ([]byte, []byte) {
   623  	return bloom.EmptyFilter.Marshal(), ids.Empty[:]
   624  }
   625  
   626  type FullSet[T Gossipable] struct{}
   627  
   628  func (FullSet[_]) Gossip(context.Context) error {
   629  	return nil
   630  }
   631  
   632  func (FullSet[T]) Add(T) error {
   633  	return nil
   634  }
   635  
   636  func (FullSet[T]) Has(ids.ID) bool {
   637  	return true
   638  }
   639  
   640  func (FullSet[T]) Iterate(func(gossipable T) bool) {}
   641  
   642  func (FullSet[_]) GetFilter() ([]byte, []byte) {
   643  	return bloom.FullFilter.Marshal(), ids.Empty[:]
   644  }