github.com/ava-labs/avalanchego@v1.11.11/network/p2p/gossip/gossip.go (about)

     1  // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
     2  // See the file LICENSE for licensing terms.
     3  
     4  package gossip
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/prometheus/client_golang/prometheus"
    14  	"go.uber.org/zap"
    15  
    16  	"github.com/ava-labs/avalanchego/cache"
    17  	"github.com/ava-labs/avalanchego/ids"
    18  	"github.com/ava-labs/avalanchego/network/p2p"
    19  	"github.com/ava-labs/avalanchego/snow/engine/common"
    20  	"github.com/ava-labs/avalanchego/utils/bloom"
    21  	"github.com/ava-labs/avalanchego/utils/buffer"
    22  	"github.com/ava-labs/avalanchego/utils/logging"
    23  	"github.com/ava-labs/avalanchego/utils/set"
    24  )
    25  
const (
	// ioLabel is the metric label key describing message direction. Its
	// values are sentIO and receivedIO.
	ioLabel    = "io"
	sentIO     = "sent"
	receivedIO = "received"

	// typeLabel is the metric label key whose values are the *Type constants
	// below. pushType/pullType label the count and bytes counters, while
	// unsentType/sentType label the tracking and top validators gauges.
	typeLabel  = "type"
	pushType   = "push"
	pullType   = "pull"
	unsentType = "unsent"
	sentType   = "sent"

	// defaultGossipableCount is the initial capacity of the slice of
	// marshalled gossipables built during each push gossip pass.
	defaultGossipableCount = 64
)
    39  
var (
	// Compile-time checks that these types implement Gossiper.
	_ Gossiper = (*ValidatorGossiper)(nil)
	_ Gossiper = (*PullGossiper[*testTx])(nil)
	_ Gossiper = (*NoOpGossiper)(nil)

	// Compile-time check that FullSet implements Set.
	_ Set[*testTx] = (*FullSet[*testTx])(nil)

	// Label names and pre-built label sets used with the count and bytes
	// counters.
	ioTypeLabels   = []string{ioLabel, typeLabel}
	sentPushLabels = prometheus.Labels{
		ioLabel:   sentIO,
		typeLabel: pushType,
	}
	receivedPushLabels = prometheus.Labels{
		ioLabel:   receivedIO,
		typeLabel: pushType,
	}
	sentPullLabels = prometheus.Labels{
		ioLabel:   sentIO,
		typeLabel: pullType,
	}
	receivedPullLabels = prometheus.Labels{
		ioLabel:   receivedIO,
		typeLabel: pullType,
	}
	// Label names and pre-built label sets used with the tracking and top
	// validators gauges.
	typeLabels   = []string{typeLabel}
	unsentLabels = prometheus.Labels{
		typeLabel: unsentType,
	}
	sentLabels = prometheus.Labels{
		typeLabel: sentType,
	}

	// Sentinel errors returned when validating gossip configuration.
	ErrInvalidNumValidators     = errors.New("num validators cannot be negative")
	ErrInvalidNumNonValidators  = errors.New("num non-validators cannot be negative")
	ErrInvalidNumPeers          = errors.New("num peers cannot be negative")
	ErrInvalidNumToGossip       = errors.New("must gossip to at least one peer")
	ErrInvalidDiscardedSize     = errors.New("discarded size cannot be negative")
	ErrInvalidTargetGossipSize  = errors.New("target gossip size cannot be negative")
	ErrInvalidRegossipFrequency = errors.New("re-gossip frequency cannot be negative")
)
    80  
// Gossiper gossips Gossipables to other nodes.
type Gossiper interface {
	// Gossip runs a single cycle of gossip. It returns a non-nil error if the
	// cycle failed.
	Gossip(ctx context.Context) error
}
    86  
// ValidatorGossiper wraps a Gossiper and only forwards calls to [Gossip] when
// [NodeID] is contained in [Validators].
type ValidatorGossiper struct {
	// Gossiper is the wrapped gossiper that performs the actual gossip.
	Gossiper

	// NodeID is the node whose validator status gates gossiping.
	NodeID ids.NodeID
	// Validators is queried on every Gossip call to check [NodeID].
	Validators p2p.ValidatorSet
}
    94  
// Metrics that are tracked across a gossip protocol. A given protocol should
// only use a single instance of Metrics.
type Metrics struct {
	// count is the number of gossipables, labelled by io and type.
	count *prometheus.CounterVec
	// bytes is the number of gossiped bytes, labelled by io and type.
	bytes *prometheus.CounterVec
	// tracking is the number of gossipables being tracked, labelled by type.
	tracking *prometheus.GaugeVec
	// trackingLifetimeAverage is the average duration, in nanoseconds, that a
	// gossipable has been tracked.
	trackingLifetimeAverage prometheus.Gauge
	// topValidators is the number of validators that gossip is sent to due to
	// stake, labelled by type.
	topValidators *prometheus.GaugeVec
}
   104  
   105  // NewMetrics returns a common set of metrics
   106  func NewMetrics(
   107  	metrics prometheus.Registerer,
   108  	namespace string,
   109  ) (Metrics, error) {
   110  	m := Metrics{
   111  		count: prometheus.NewCounterVec(
   112  			prometheus.CounterOpts{
   113  				Namespace: namespace,
   114  				Name:      "gossip_count",
   115  				Help:      "amount of gossip (n)",
   116  			},
   117  			ioTypeLabels,
   118  		),
   119  		bytes: prometheus.NewCounterVec(
   120  			prometheus.CounterOpts{
   121  				Namespace: namespace,
   122  				Name:      "gossip_bytes",
   123  				Help:      "amount of gossip (bytes)",
   124  			},
   125  			ioTypeLabels,
   126  		),
   127  		tracking: prometheus.NewGaugeVec(
   128  			prometheus.GaugeOpts{
   129  				Namespace: namespace,
   130  				Name:      "gossip_tracking",
   131  				Help:      "number of gossipables being tracked",
   132  			},
   133  			typeLabels,
   134  		),
   135  		trackingLifetimeAverage: prometheus.NewGauge(prometheus.GaugeOpts{
   136  			Namespace: namespace,
   137  			Name:      "gossip_tracking_lifetime_average",
   138  			Help:      "average duration a gossipable has been tracked (ns)",
   139  		}),
   140  		topValidators: prometheus.NewGaugeVec(
   141  			prometheus.GaugeOpts{
   142  				Namespace: namespace,
   143  				Name:      "top_validators",
   144  				Help:      "number of validators gossipables are sent to due to stake",
   145  			},
   146  			typeLabels,
   147  		),
   148  	}
   149  	err := errors.Join(
   150  		metrics.Register(m.count),
   151  		metrics.Register(m.bytes),
   152  		metrics.Register(m.tracking),
   153  		metrics.Register(m.trackingLifetimeAverage),
   154  		metrics.Register(m.topValidators),
   155  	)
   156  	return m, err
   157  }
   158  
   159  func (m *Metrics) observeMessage(labels prometheus.Labels, count int, bytes int) error {
   160  	countMetric, err := m.count.GetMetricWith(labels)
   161  	if err != nil {
   162  		return fmt.Errorf("failed to get count metric: %w", err)
   163  	}
   164  
   165  	bytesMetric, err := m.bytes.GetMetricWith(labels)
   166  	if err != nil {
   167  		return fmt.Errorf("failed to get bytes metric: %w", err)
   168  	}
   169  
   170  	countMetric.Add(float64(count))
   171  	bytesMetric.Add(float64(bytes))
   172  	return nil
   173  }
   174  
   175  func (v ValidatorGossiper) Gossip(ctx context.Context) error {
   176  	if !v.Validators.Has(ctx, v.NodeID) {
   177  		return nil
   178  	}
   179  
   180  	return v.Gossiper.Gossip(ctx)
   181  }
   182  
   183  func NewPullGossiper[T Gossipable](
   184  	log logging.Logger,
   185  	marshaller Marshaller[T],
   186  	set Set[T],
   187  	client *p2p.Client,
   188  	metrics Metrics,
   189  	pollSize int,
   190  ) *PullGossiper[T] {
   191  	return &PullGossiper[T]{
   192  		log:        log,
   193  		marshaller: marshaller,
   194  		set:        set,
   195  		client:     client,
   196  		metrics:    metrics,
   197  		pollSize:   pollSize,
   198  	}
   199  }
   200  
// PullGossiper requests gossipables from peers by sending them the filter of
// its local set, and adds the gossipables in their responses to that set.
type PullGossiper[T Gossipable] struct {
	log logging.Logger
	// marshaller decodes gossipables received in responses.
	marshaller Marshaller[T]
	// set supplies the filter sent in requests and receives the gossipables
	// returned by peers.
	set    Set[T]
	client *p2p.Client
	// metrics records the count and bytes of received pull gossip.
	metrics Metrics
	// pollSize is the number of requests issued per call to Gossip.
	pollSize int
}
   209  
   210  func (p *PullGossiper[_]) Gossip(ctx context.Context) error {
   211  	msgBytes, err := MarshalAppRequest(p.set.GetFilter())
   212  	if err != nil {
   213  		return err
   214  	}
   215  
   216  	for i := 0; i < p.pollSize; i++ {
   217  		err := p.client.AppRequestAny(ctx, msgBytes, p.handleResponse)
   218  		if err != nil && !errors.Is(err, p2p.ErrNoPeers) {
   219  			return err
   220  		}
   221  	}
   222  
   223  	return nil
   224  }
   225  
   226  func (p *PullGossiper[_]) handleResponse(
   227  	_ context.Context,
   228  	nodeID ids.NodeID,
   229  	responseBytes []byte,
   230  	err error,
   231  ) {
   232  	if err != nil {
   233  		p.log.Debug(
   234  			"failed gossip request",
   235  			zap.Stringer("nodeID", nodeID),
   236  			zap.Error(err),
   237  		)
   238  		return
   239  	}
   240  
   241  	gossip, err := ParseAppResponse(responseBytes)
   242  	if err != nil {
   243  		p.log.Debug("failed to unmarshal gossip response", zap.Error(err))
   244  		return
   245  	}
   246  
   247  	receivedBytes := 0
   248  	for _, bytes := range gossip {
   249  		receivedBytes += len(bytes)
   250  
   251  		gossipable, err := p.marshaller.UnmarshalGossip(bytes)
   252  		if err != nil {
   253  			p.log.Debug(
   254  				"failed to unmarshal gossip",
   255  				zap.Stringer("nodeID", nodeID),
   256  				zap.Error(err),
   257  			)
   258  			continue
   259  		}
   260  
   261  		gossipID := gossipable.GossipID()
   262  		p.log.Debug(
   263  			"received gossip",
   264  			zap.Stringer("nodeID", nodeID),
   265  			zap.Stringer("id", gossipID),
   266  		)
   267  		if err := p.set.Add(gossipable); err != nil {
   268  			p.log.Debug(
   269  				"failed to add gossip to the known set",
   270  				zap.Stringer("nodeID", nodeID),
   271  				zap.Stringer("id", gossipID),
   272  				zap.Error(err),
   273  			)
   274  			continue
   275  		}
   276  	}
   277  
   278  	if err := p.metrics.observeMessage(receivedPullLabels, len(gossip), receivedBytes); err != nil {
   279  		p.log.Error("failed to update metrics",
   280  			zap.Error(err),
   281  		)
   282  	}
   283  }
   284  
   285  // NewPushGossiper returns an instance of PushGossiper
   286  func NewPushGossiper[T Gossipable](
   287  	marshaller Marshaller[T],
   288  	mempool Set[T],
   289  	validators p2p.ValidatorSubset,
   290  	client *p2p.Client,
   291  	metrics Metrics,
   292  	gossipParams BranchingFactor,
   293  	regossipParams BranchingFactor,
   294  	discardedSize int,
   295  	targetGossipSize int,
   296  	maxRegossipFrequency time.Duration,
   297  ) (*PushGossiper[T], error) {
   298  	if err := gossipParams.Verify(); err != nil {
   299  		return nil, fmt.Errorf("invalid gossip params: %w", err)
   300  	}
   301  	if err := regossipParams.Verify(); err != nil {
   302  		return nil, fmt.Errorf("invalid regossip params: %w", err)
   303  	}
   304  	switch {
   305  	case discardedSize < 0:
   306  		return nil, ErrInvalidDiscardedSize
   307  	case targetGossipSize < 0:
   308  		return nil, ErrInvalidTargetGossipSize
   309  	case maxRegossipFrequency < 0:
   310  		return nil, ErrInvalidRegossipFrequency
   311  	}
   312  
   313  	return &PushGossiper[T]{
   314  		marshaller:           marshaller,
   315  		set:                  mempool,
   316  		validators:           validators,
   317  		client:               client,
   318  		metrics:              metrics,
   319  		gossipParams:         gossipParams,
   320  		regossipParams:       regossipParams,
   321  		targetGossipSize:     targetGossipSize,
   322  		maxRegossipFrequency: maxRegossipFrequency,
   323  
   324  		tracking:   make(map[ids.ID]*tracking),
   325  		toGossip:   buffer.NewUnboundedDeque[T](0),
   326  		toRegossip: buffer.NewUnboundedDeque[T](0),
   327  		discarded:  &cache.LRU[ids.ID, struct{}]{Size: discardedSize},
   328  	}, nil
   329  }
   330  
// PushGossiper broadcasts gossip to peers randomly in the network
type PushGossiper[T Gossipable] struct {
	marshaller Marshaller[T]
	// set is consulted before sending to drop gossipables that are no longer
	// present.
	set        Set[T]
	validators p2p.ValidatorSubset
	client     *p2p.Client
	metrics    Metrics

	// gossipParams is used when sending gossipables from toGossip, and
	// regossipParams when sending gossipables from toRegossip.
	gossipParams   BranchingFactor
	regossipParams BranchingFactor
	// targetGossipSize is the number of marshalled bytes after which a gossip
	// pass stops collecting more gossipables.
	targetGossipSize int
	// maxRegossipFrequency is the minimum time that must have elapsed since a
	// gossipable was last gossiped before it is sent again.
	maxRegossipFrequency time.Duration

	// lock guards the fields below; Gossip and Add acquire it.
	lock sync.Mutex
	// tracking holds per-gossipable bookkeeping keyed by gossip ID.
	tracking     map[ids.ID]*tracking
	addedTimeSum float64 // unix nanoseconds
	// toGossip queues gossipables that have not been sent yet, and toRegossip
	// queues gossipables that have been sent (or are treated as sent).
	toGossip   buffer.Deque[T]
	toRegossip buffer.Deque[T]
	discarded  *cache.LRU[ids.ID, struct{}] // discarded attempts to avoid overgossiping transactions that are frequently dropped
}
   351  
// BranchingFactor configures which peers, and how many of them, a single
// round of push gossip is sent to.
type BranchingFactor struct {
	// StakePercentage determines the percentage of stake that gossip should
	// be sent to, based on the inverse CDF of stake weights. This value does
	// not account for the connectivity of the nodes.
	StakePercentage float64
	// Validators specifies the number of connected validators, in addition to
	// any validators sent from the StakePercentage parameter, to send gossip
	// to. These validators are sampled uniformly rather than by stake.
	Validators int
	// NonValidators specifies the number of connected non-validators to send
	// gossip to.
	NonValidators int
	// Peers specifies the number of connected validators or non-validators, in
	// addition to the number sent due to other configs, to send gossip to.
	Peers int
}
   368  
   369  func (b *BranchingFactor) Verify() error {
   370  	switch {
   371  	case b.Validators < 0:
   372  		return ErrInvalidNumValidators
   373  	case b.NonValidators < 0:
   374  		return ErrInvalidNumNonValidators
   375  	case b.Peers < 0:
   376  		return ErrInvalidNumPeers
   377  	case max(b.Validators, b.NonValidators, b.Peers) == 0:
   378  		return ErrInvalidNumToGossip
   379  	default:
   380  		return nil
   381  	}
   382  }
   383  
// tracking is the bookkeeping a PushGossiper keeps for each gossipable it is
// currently tracking.
type tracking struct {
	// addedTime is when the gossipable was added to the PushGossiper.
	addedTime float64 // unix nanoseconds
	// lastGossiped is when the gossipable was last sent. It is the zero time
	// until the gossipable is first gossiped, unless the gossipable was
	// recently discarded when added.
	lastGossiped time.Time
}
   388  
   389  // Gossip flushes any queued gossipables.
   390  func (p *PushGossiper[T]) Gossip(ctx context.Context) error {
   391  	var (
   392  		now         = time.Now()
   393  		nowUnixNano = float64(now.UnixNano())
   394  	)
   395  
   396  	p.lock.Lock()
   397  	defer func() {
   398  		p.updateMetrics(nowUnixNano)
   399  		p.lock.Unlock()
   400  	}()
   401  
   402  	if len(p.tracking) == 0 {
   403  		return nil
   404  	}
   405  
   406  	if err := p.gossip(
   407  		ctx,
   408  		now,
   409  		p.gossipParams,
   410  		p.toGossip,
   411  		p.toRegossip,
   412  		&cache.Empty[ids.ID, struct{}]{}, // Don't mark dropped unsent transactions as discarded
   413  		unsentLabels,
   414  	); err != nil {
   415  		return fmt.Errorf("unexpected error during gossip: %w", err)
   416  	}
   417  
   418  	if err := p.gossip(
   419  		ctx,
   420  		now,
   421  		p.regossipParams,
   422  		p.toRegossip,
   423  		p.toRegossip,
   424  		p.discarded, // Mark dropped sent transactions as discarded
   425  		sentLabels,
   426  	); err != nil {
   427  		return fmt.Errorf("unexpected error during regossip: %w", err)
   428  	}
   429  	return nil
   430  }
   431  
// gossip pops gossipables from the left of [toGossip] until at least
// [p.targetGossipSize] marshalled bytes have been collected or the queue is
// exhausted, then sends them in a single AppGossip message configured by
// [gossipParams].
//
// Each collected gossipable is appended to [toRegossip] and has its
// lastGossiped time set to [now]. Gossipables that are no longer in [p.set]
// are untracked and recorded in [discarded]. [metricsLabels] selects which
// top validators gauge is updated.
//
// Gossip acquires [p.lock] before calling this method.
func (p *PushGossiper[T]) gossip(
	ctx context.Context,
	now time.Time,
	gossipParams BranchingFactor,
	toGossip buffer.Deque[T],
	toRegossip buffer.Deque[T],
	discarded cache.Cacher[ids.ID, struct{}],
	metricsLabels prometheus.Labels,
) error {
	var (
		sentBytes = 0
		gossip    = make([][]byte, 0, defaultGossipableCount)
		// Gossipables last sent after this instant are too recent to resend.
		maxLastGossipTimeToRegossip = now.Add(-p.maxRegossipFrequency)
	)

	for sentBytes < p.targetGossipSize {
		gossipable, ok := toGossip.PopLeft()
		if !ok {
			break
		}

		// Ensure item is still in the set before we gossip.
		gossipID := gossipable.GossipID()
		// NOTE(review): assumes every queued gossipable has a tracking entry;
		// a missing entry would be dereferenced as nil below.
		tracking := p.tracking[gossipID]
		if !p.set.Has(gossipID) {
			delete(p.tracking, gossipID)
			p.addedTimeSum -= tracking.addedTime
			discarded.Put(gossipID, struct{}{}) // Cache that the item was dropped
			continue
		}

		// Ensure we don't attempt to send a gossipable too frequently.
		if maxLastGossipTimeToRegossip.Before(tracking.lastGossiped) {
			// Put the gossipable on the front of the queue to keep items sorted
			// by last issuance time.
			toGossip.PushLeft(gossipable)
			break
		}

		bytes, err := p.marshaller.MarshalGossip(gossipable)
		if err != nil {
			// The gossipable that failed to marshal is dropped from tracking.
			// NOTE(review): gossipables collected earlier in this loop have
			// already been moved to [toRegossip] and marked as gossiped, but
			// are not sent when returning here.
			delete(p.tracking, gossipID)
			p.addedTimeSum -= tracking.addedTime
			return err
		}

		gossip = append(gossip, bytes)
		sentBytes += len(bytes)
		toRegossip.PushRight(gossipable)
		tracking.lastGossiped = now
	}

	// If there is nothing to gossip, we can exit early.
	if len(gossip) == 0 {
		return nil
	}

	// Send gossipables to peers
	msgBytes, err := MarshalAppGossip(gossip)
	if err != nil {
		return err
	}

	// The sent-push metrics are recorded before the message is handed to the
	// client.
	if err := p.metrics.observeMessage(sentPushLabels, len(gossip), sentBytes); err != nil {
		return err
	}

	topValidatorsMetric, err := p.metrics.topValidators.GetMetricWith(metricsLabels)
	if err != nil {
		return fmt.Errorf("failed to get top validators metric: %w", err)
	}

	validatorsByStake := p.validators.Top(ctx, gossipParams.StakePercentage)
	topValidatorsMetric.Set(float64(len(validatorsByStake)))

	return p.client.AppGossip(
		ctx,
		common.SendConfig{
			NodeIDs:       set.Of(validatorsByStake...),
			Validators:    gossipParams.Validators,
			NonValidators: gossipParams.NonValidators,
			Peers:         gossipParams.Peers,
		},
		msgBytes,
	)
}
   518  
   519  // Add enqueues new gossipables to be pushed. If a gossiable is already tracked,
   520  // it is not added again.
   521  func (p *PushGossiper[T]) Add(gossipables ...T) {
   522  	var (
   523  		now         = time.Now()
   524  		nowUnixNano = float64(now.UnixNano())
   525  	)
   526  
   527  	p.lock.Lock()
   528  	defer func() {
   529  		p.updateMetrics(nowUnixNano)
   530  		p.lock.Unlock()
   531  	}()
   532  
   533  	// Add new gossipables to be sent.
   534  	for _, gossipable := range gossipables {
   535  		gossipID := gossipable.GossipID()
   536  		if _, ok := p.tracking[gossipID]; ok {
   537  			continue
   538  		}
   539  
   540  		tracking := &tracking{
   541  			addedTime: nowUnixNano,
   542  		}
   543  		if _, ok := p.discarded.Get(gossipID); ok {
   544  			// Pretend that recently discarded transactions were just gossiped.
   545  			tracking.lastGossiped = now
   546  			p.toRegossip.PushRight(gossipable)
   547  		} else {
   548  			p.toGossip.PushRight(gossipable)
   549  		}
   550  		p.tracking[gossipID] = tracking
   551  		p.addedTimeSum += nowUnixNano
   552  	}
   553  }
   554  
   555  func (p *PushGossiper[_]) updateMetrics(nowUnixNano float64) {
   556  	var (
   557  		numUnsent       = float64(p.toGossip.Len())
   558  		numSent         = float64(p.toRegossip.Len())
   559  		numTracking     = numUnsent + numSent
   560  		averageLifetime float64
   561  	)
   562  	if numTracking != 0 {
   563  		averageLifetime = nowUnixNano - p.addedTimeSum/numTracking
   564  	}
   565  
   566  	p.metrics.tracking.With(unsentLabels).Set(numUnsent)
   567  	p.metrics.tracking.With(sentLabels).Set(numSent)
   568  	p.metrics.trackingLifetimeAverage.Set(averageLifetime)
   569  }
   570  
   571  // Every calls [Gossip] every [frequency] amount of time.
   572  func Every(ctx context.Context, log logging.Logger, gossiper Gossiper, frequency time.Duration) {
   573  	ticker := time.NewTicker(frequency)
   574  	defer ticker.Stop()
   575  
   576  	for {
   577  		select {
   578  		case <-ticker.C:
   579  			if err := gossiper.Gossip(ctx); err != nil {
   580  				log.Warn("failed to gossip", zap.Error(err))
   581  			}
   582  		case <-ctx.Done():
   583  			log.Debug("shutting down gossip")
   584  			return
   585  		}
   586  	}
   587  }
   588  
   589  type NoOpGossiper struct{}
   590  
   591  func (NoOpGossiper) Gossip(context.Context) error {
   592  	return nil
   593  }
   594  
   595  type TestGossiper struct {
   596  	GossipF func(ctx context.Context) error
   597  }
   598  
   599  func (t *TestGossiper) Gossip(ctx context.Context) error {
   600  	return t.GossipF(ctx)
   601  }
   602  
   603  type FullSet[T Gossipable] struct{}
   604  
   605  func (FullSet[_]) Gossip(context.Context) error {
   606  	return nil
   607  }
   608  
   609  func (FullSet[T]) Add(T) error {
   610  	return nil
   611  }
   612  
   613  func (FullSet[T]) Has(ids.ID) bool {
   614  	return true
   615  }
   616  
   617  func (FullSet[T]) Iterate(func(gossipable T) bool) {}
   618  
   619  func (FullSet[_]) GetFilter() ([]byte, []byte) {
   620  	return bloom.FullFilter.Marshal(), ids.Empty[:]
   621  }