github.com/ava-labs/avalanchego@v1.11.11/network/p2p/gossip/gossip.go (about) 1 // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. 2 // See the file LICENSE for licensing terms. 3 4 package gossip 5 6 import ( 7 "context" 8 "errors" 9 "fmt" 10 "sync" 11 "time" 12 13 "github.com/prometheus/client_golang/prometheus" 14 "go.uber.org/zap" 15 16 "github.com/ava-labs/avalanchego/cache" 17 "github.com/ava-labs/avalanchego/ids" 18 "github.com/ava-labs/avalanchego/network/p2p" 19 "github.com/ava-labs/avalanchego/snow/engine/common" 20 "github.com/ava-labs/avalanchego/utils/bloom" 21 "github.com/ava-labs/avalanchego/utils/buffer" 22 "github.com/ava-labs/avalanchego/utils/logging" 23 "github.com/ava-labs/avalanchego/utils/set" 24 ) 25 26 const ( 27 ioLabel = "io" 28 sentIO = "sent" 29 receivedIO = "received" 30 31 typeLabel = "type" 32 pushType = "push" 33 pullType = "pull" 34 unsentType = "unsent" 35 sentType = "sent" 36 37 defaultGossipableCount = 64 38 ) 39 40 var ( 41 _ Gossiper = (*ValidatorGossiper)(nil) 42 _ Gossiper = (*PullGossiper[*testTx])(nil) 43 _ Gossiper = (*NoOpGossiper)(nil) 44 45 _ Set[*testTx] = (*FullSet[*testTx])(nil) 46 47 ioTypeLabels = []string{ioLabel, typeLabel} 48 sentPushLabels = prometheus.Labels{ 49 ioLabel: sentIO, 50 typeLabel: pushType, 51 } 52 receivedPushLabels = prometheus.Labels{ 53 ioLabel: receivedIO, 54 typeLabel: pushType, 55 } 56 sentPullLabels = prometheus.Labels{ 57 ioLabel: sentIO, 58 typeLabel: pullType, 59 } 60 receivedPullLabels = prometheus.Labels{ 61 ioLabel: receivedIO, 62 typeLabel: pullType, 63 } 64 typeLabels = []string{typeLabel} 65 unsentLabels = prometheus.Labels{ 66 typeLabel: unsentType, 67 } 68 sentLabels = prometheus.Labels{ 69 typeLabel: sentType, 70 } 71 72 ErrInvalidNumValidators = errors.New("num validators cannot be negative") 73 ErrInvalidNumNonValidators = errors.New("num non-validators cannot be negative") 74 ErrInvalidNumPeers = errors.New("num peers cannot be negative") 75 ErrInvalidNumToGossip = errors.New("must gossip to at least one peer") 76 ErrInvalidDiscardedSize = errors.New("discarded size cannot be negative") 77 ErrInvalidTargetGossipSize = errors.New("target gossip size cannot be negative") 78 ErrInvalidRegossipFrequency = errors.New("re-gossip frequency cannot be negative") 79 ) 80 81 // Gossiper gossips Gossipables to other nodes 82 type Gossiper interface { 83 // Gossip runs a cycle of gossip. Returns an error if we failed to gossip. 84 Gossip(ctx context.Context) error 85 } 86 87 // ValidatorGossiper only calls [Gossip] if the given node is a validator 88 type ValidatorGossiper struct { 89 Gossiper 90 91 NodeID ids.NodeID 92 Validators p2p.ValidatorSet 93 } 94 95 // Metrics that are tracked across a gossip protocol. A given protocol should 96 // only use a single instance of Metrics. 97 type Metrics struct { 98 count *prometheus.CounterVec 99 bytes *prometheus.CounterVec 100 tracking *prometheus.GaugeVec 101 trackingLifetimeAverage prometheus.Gauge 102 topValidators *prometheus.GaugeVec 103 } 104 105 // NewMetrics returns a common set of metrics 106 func NewMetrics( 107 metrics prometheus.Registerer, 108 namespace string, 109 ) (Metrics, error) { 110 m := Metrics{ 111 count: prometheus.NewCounterVec( 112 prometheus.CounterOpts{ 113 Namespace: namespace, 114 Name: "gossip_count", 115 Help: "amount of gossip (n)", 116 }, 117 ioTypeLabels, 118 ), 119 bytes: prometheus.NewCounterVec( 120 prometheus.CounterOpts{ 121 Namespace: namespace, 122 Name: "gossip_bytes", 123 Help: "amount of gossip (bytes)", 124 }, 125 ioTypeLabels, 126 ), 127 tracking: prometheus.NewGaugeVec( 128 prometheus.GaugeOpts{ 129 Namespace: namespace, 130 Name: "gossip_tracking", 131 Help: "number of gossipables being tracked", 132 }, 133 typeLabels, 134 ), 135 trackingLifetimeAverage: prometheus.NewGauge(prometheus.GaugeOpts{ 136 Namespace: namespace, 137 Name: "gossip_tracking_lifetime_average", 138 Help: "average duration a gossipable has been tracked (ns)", 139 }), 140 topValidators: prometheus.NewGaugeVec( 141 prometheus.GaugeOpts{ 142 Namespace: namespace, 143 Name: "top_validators", 144 Help: "number of validators gossipables are sent to due to stake", 145 }, 146 typeLabels, 147 ), 148 } 149 err := errors.Join( 150 metrics.Register(m.count), 151 metrics.Register(m.bytes), 152 metrics.Register(m.tracking), 153 metrics.Register(m.trackingLifetimeAverage), 154 metrics.Register(m.topValidators), 155 ) 156 return m, err 157 } 158 159 func (m *Metrics) observeMessage(labels prometheus.Labels, count int, bytes int) error { 160 countMetric, err := m.count.GetMetricWith(labels) 161 if err != nil { 162 return fmt.Errorf("failed to get count metric: %w", err) 163 } 164 165 bytesMetric, err := m.bytes.GetMetricWith(labels) 166 if err != nil { 167 return fmt.Errorf("failed to get bytes metric: %w", err) 168 } 169 170 countMetric.Add(float64(count)) 171 bytesMetric.Add(float64(bytes)) 172 return nil 173 } 174 175 func (v ValidatorGossiper) Gossip(ctx context.Context) error { 176 if !v.Validators.Has(ctx, v.NodeID) { 177 return nil 178 } 179 180 return v.Gossiper.Gossip(ctx) 181 } 182 183 func NewPullGossiper[T Gossipable]( 184 log logging.Logger, 185 marshaller Marshaller[T], 186 set Set[T], 187 client *p2p.Client, 188 metrics Metrics, 189 pollSize int, 190 ) *PullGossiper[T] { 191 return &PullGossiper[T]{ 192 log: log, 193 marshaller: marshaller, 194 set: set, 195 client: client, 196 metrics: metrics, 197 pollSize: pollSize, 198 } 199 } 200 201 type PullGossiper[T Gossipable] struct { 202 log logging.Logger 203 marshaller Marshaller[T] 204 set Set[T] 205 client *p2p.Client 206 metrics Metrics 207 pollSize int 208 } 209 210 func (p *PullGossiper[_]) Gossip(ctx context.Context) error { 211 msgBytes, err := MarshalAppRequest(p.set.GetFilter()) 212 if err != nil { 213 return err 214 } 215 216 for i := 0; i < p.pollSize; i++ { 217 err := p.client.AppRequestAny(ctx, msgBytes, p.handleResponse) 218 if err != nil && !errors.Is(err, p2p.ErrNoPeers) { 219 return err 220 } 221 } 222 223 return nil 224 } 225 226 func (p *PullGossiper[_]) handleResponse( 227 _ context.Context, 228 nodeID ids.NodeID, 229 responseBytes []byte, 230 err error, 231 ) { 232 if err != nil { 233 p.log.Debug( 234 "failed gossip request", 235 zap.Stringer("nodeID", nodeID), 236 zap.Error(err), 237 ) 238 return 239 } 240 241 gossip, err := ParseAppResponse(responseBytes) 242 if err != nil { 243 p.log.Debug("failed to unmarshal gossip response", zap.Error(err)) 244 return 245 } 246 247 receivedBytes := 0 248 for _, bytes := range gossip { 249 receivedBytes += len(bytes) 250 251 gossipable, err := p.marshaller.UnmarshalGossip(bytes) 252 if err != nil { 253 p.log.Debug( 254 "failed to unmarshal gossip", 255 zap.Stringer("nodeID", nodeID), 256 zap.Error(err), 257 ) 258 continue 259 } 260 261 gossipID := gossipable.GossipID() 262 p.log.Debug( 263 "received gossip", 264 zap.Stringer("nodeID", nodeID), 265 zap.Stringer("id", gossipID), 266 ) 267 if err := p.set.Add(gossipable); err != nil { 268 p.log.Debug( 269 "failed to add gossip to the known set", 270 zap.Stringer("nodeID", nodeID), 271 zap.Stringer("id", gossipID), 272 zap.Error(err), 273 ) 274 continue 275 } 276 } 277 278 if err := p.metrics.observeMessage(receivedPullLabels, len(gossip), receivedBytes); err != nil { 279 p.log.Error("failed to update metrics", 280 zap.Error(err), 281 ) 282 } 283 } 284 285 // NewPushGossiper returns an instance of PushGossiper 286 func NewPushGossiper[T Gossipable]( 287 marshaller Marshaller[T], 288 mempool Set[T], 289 validators p2p.ValidatorSubset, 290 client *p2p.Client, 291 metrics Metrics, 292 gossipParams BranchingFactor, 293 regossipParams BranchingFactor, 294 discardedSize int, 295 targetGossipSize int, 296 maxRegossipFrequency time.Duration, 297 ) (*PushGossiper[T], error) { 298 if err := gossipParams.Verify(); err != nil { 299 return nil, fmt.Errorf("invalid gossip params: %w", err) 300 } 301 if err := regossipParams.Verify(); err != nil { 302 return nil, fmt.Errorf("invalid regossip params: %w", err) 303 } 304 switch { 305 case discardedSize < 0: 306 return nil, ErrInvalidDiscardedSize 307 case targetGossipSize < 0: 308 return nil, ErrInvalidTargetGossipSize 309 case maxRegossipFrequency < 0: 310 return nil, ErrInvalidRegossipFrequency 311 } 312 313 return &PushGossiper[T]{ 314 marshaller: marshaller, 315 set: mempool, 316 validators: validators, 317 client: client, 318 metrics: metrics, 319 gossipParams: gossipParams, 320 regossipParams: regossipParams, 321 targetGossipSize: targetGossipSize, 322 maxRegossipFrequency: maxRegossipFrequency, 323 324 tracking: make(map[ids.ID]*tracking), 325 toGossip: buffer.NewUnboundedDeque[T](0), 326 toRegossip: buffer.NewUnboundedDeque[T](0), 327 discarded: &cache.LRU[ids.ID, struct{}]{Size: discardedSize}, 328 }, nil 329 } 330 331 // PushGossiper broadcasts gossip to peers randomly in the network 332 type PushGossiper[T Gossipable] struct { 333 marshaller Marshaller[T] 334 set Set[T] 335 validators p2p.ValidatorSubset 336 client *p2p.Client 337 metrics Metrics 338 339 gossipParams BranchingFactor 340 regossipParams BranchingFactor 341 targetGossipSize int 342 maxRegossipFrequency time.Duration 343 344 lock sync.Mutex 345 tracking map[ids.ID]*tracking 346 addedTimeSum float64 // unix nanoseconds 347 toGossip buffer.Deque[T] 348 toRegossip buffer.Deque[T] 349 discarded *cache.LRU[ids.ID, struct{}] // discarded attempts to avoid overgossiping transactions that are frequently dropped 350 } 351 352 type BranchingFactor struct { 353 // StakePercentage determines the percentage of stake that should have 354 // gossip sent to based on the inverse CDF of stake weights. This value does 355 // not account for the connectivity of the nodes. 356 StakePercentage float64 357 // Validators specifies the number of connected validators, in addition to 358 // any validators sent from the StakePercentage parameter, to send gossip 359 // to. These validators are sampled uniformly rather than by stake. 360 Validators int 361 // NonValidators specifies the number of connected non-validators to send 362 // gossip to. 363 NonValidators int 364 // Peers specifies the number of connected validators or non-validators, in 365 // addition to the number sent due to other configs, to send gossip to. 366 Peers int 367 } 368 369 func (b *BranchingFactor) Verify() error { 370 switch { 371 case b.Validators < 0: 372 return ErrInvalidNumValidators 373 case b.NonValidators < 0: 374 return ErrInvalidNumNonValidators 375 case b.Peers < 0: 376 return ErrInvalidNumPeers 377 case max(b.Validators, b.NonValidators, b.Peers) == 0: 378 return ErrInvalidNumToGossip 379 default: 380 return nil 381 } 382 } 383 384 type tracking struct { 385 addedTime float64 // unix nanoseconds 386 lastGossiped time.Time 387 } 388 389 // Gossip flushes any queued gossipables. 390 func (p *PushGossiper[T]) Gossip(ctx context.Context) error { 391 var ( 392 now = time.Now() 393 nowUnixNano = float64(now.UnixNano()) 394 ) 395 396 p.lock.Lock() 397 defer func() { 398 p.updateMetrics(nowUnixNano) 399 p.lock.Unlock() 400 }() 401 402 if len(p.tracking) == 0 { 403 return nil 404 } 405 406 if err := p.gossip( 407 ctx, 408 now, 409 p.gossipParams, 410 p.toGossip, 411 p.toRegossip, 412 &cache.Empty[ids.ID, struct{}]{}, // Don't mark dropped unsent transactions as discarded 413 unsentLabels, 414 ); err != nil { 415 return fmt.Errorf("unexpected error during gossip: %w", err) 416 } 417 418 if err := p.gossip( 419 ctx, 420 now, 421 p.regossipParams, 422 p.toRegossip, 423 p.toRegossip, 424 p.discarded, // Mark dropped sent transactions as discarded 425 sentLabels, 426 ); err != nil { 427 return fmt.Errorf("unexpected error during regossip: %w", err) 428 } 429 return nil 430 } 431 432 func (p *PushGossiper[T]) gossip( 433 ctx context.Context, 434 now time.Time, 435 gossipParams BranchingFactor, 436 toGossip buffer.Deque[T], 437 toRegossip buffer.Deque[T], 438 discarded cache.Cacher[ids.ID, struct{}], 439 metricsLabels prometheus.Labels, 440 ) error { 441 var ( 442 sentBytes = 0 443 gossip = make([][]byte, 0, defaultGossipableCount) 444 maxLastGossipTimeToRegossip = now.Add(-p.maxRegossipFrequency) 445 ) 446 447 for sentBytes < p.targetGossipSize { 448 gossipable, ok := toGossip.PopLeft() 449 if !ok { 450 break 451 } 452 453 // Ensure item is still in the set before we gossip. 454 gossipID := gossipable.GossipID() 455 tracking := p.tracking[gossipID] 456 if !p.set.Has(gossipID) { 457 delete(p.tracking, gossipID) 458 p.addedTimeSum -= tracking.addedTime 459 discarded.Put(gossipID, struct{}{}) // Cache that the item was dropped 460 continue 461 } 462 463 // Ensure we don't attempt to send a gossipable too frequently. 464 if maxLastGossipTimeToRegossip.Before(tracking.lastGossiped) { 465 // Put the gossipable on the front of the queue to keep items sorted 466 // by last issuance time. 467 toGossip.PushLeft(gossipable) 468 break 469 } 470 471 bytes, err := p.marshaller.MarshalGossip(gossipable) 472 if err != nil { 473 delete(p.tracking, gossipID) 474 p.addedTimeSum -= tracking.addedTime 475 return err 476 } 477 478 gossip = append(gossip, bytes) 479 sentBytes += len(bytes) 480 toRegossip.PushRight(gossipable) 481 tracking.lastGossiped = now 482 } 483 484 // If there is nothing to gossip, we can exit early. 485 if len(gossip) == 0 { 486 return nil 487 } 488 489 // Send gossipables to peers 490 msgBytes, err := MarshalAppGossip(gossip) 491 if err != nil { 492 return err 493 } 494 495 if err := p.metrics.observeMessage(sentPushLabels, len(gossip), sentBytes); err != nil { 496 return err 497 } 498 499 topValidatorsMetric, err := p.metrics.topValidators.GetMetricWith(metricsLabels) 500 if err != nil { 501 return fmt.Errorf("failed to get top validators metric: %w", err) 502 } 503 504 validatorsByStake := p.validators.Top(ctx, gossipParams.StakePercentage) 505 topValidatorsMetric.Set(float64(len(validatorsByStake))) 506 507 return p.client.AppGossip( 508 ctx, 509 common.SendConfig{ 510 NodeIDs: set.Of(validatorsByStake...), 511 Validators: gossipParams.Validators, 512 NonValidators: gossipParams.NonValidators, 513 Peers: gossipParams.Peers, 514 }, 515 msgBytes, 516 ) 517 } 518 519 // Add enqueues new gossipables to be pushed. If a gossiable is already tracked, 520 // it is not added again. 521 func (p *PushGossiper[T]) Add(gossipables ...T) { 522 var ( 523 now = time.Now() 524 nowUnixNano = float64(now.UnixNano()) 525 ) 526 527 p.lock.Lock() 528 defer func() { 529 p.updateMetrics(nowUnixNano) 530 p.lock.Unlock() 531 }() 532 533 // Add new gossipables to be sent. 534 for _, gossipable := range gossipables { 535 gossipID := gossipable.GossipID() 536 if _, ok := p.tracking[gossipID]; ok { 537 continue 538 } 539 540 tracking := &tracking{ 541 addedTime: nowUnixNano, 542 } 543 if _, ok := p.discarded.Get(gossipID); ok { 544 // Pretend that recently discarded transactions were just gossiped. 545 tracking.lastGossiped = now 546 p.toRegossip.PushRight(gossipable) 547 } else { 548 p.toGossip.PushRight(gossipable) 549 } 550 p.tracking[gossipID] = tracking 551 p.addedTimeSum += nowUnixNano 552 } 553 } 554 555 func (p *PushGossiper[_]) updateMetrics(nowUnixNano float64) { 556 var ( 557 numUnsent = float64(p.toGossip.Len()) 558 numSent = float64(p.toRegossip.Len()) 559 numTracking = numUnsent + numSent 560 averageLifetime float64 561 ) 562 if numTracking != 0 { 563 averageLifetime = nowUnixNano - p.addedTimeSum/numTracking 564 } 565 566 p.metrics.tracking.With(unsentLabels).Set(numUnsent) 567 p.metrics.tracking.With(sentLabels).Set(numSent) 568 p.metrics.trackingLifetimeAverage.Set(averageLifetime) 569 } 570 571 // Every calls [Gossip] every [frequency] amount of time. 572 func Every(ctx context.Context, log logging.Logger, gossiper Gossiper, frequency time.Duration) { 573 ticker := time.NewTicker(frequency) 574 defer ticker.Stop() 575 576 for { 577 select { 578 case <-ticker.C: 579 if err := gossiper.Gossip(ctx); err != nil { 580 log.Warn("failed to gossip", zap.Error(err)) 581 } 582 case <-ctx.Done(): 583 log.Debug("shutting down gossip") 584 return 585 } 586 } 587 } 588 589 type NoOpGossiper struct{} 590 591 func (NoOpGossiper) Gossip(context.Context) error { 592 return nil 593 } 594 595 type TestGossiper struct { 596 GossipF func(ctx context.Context) error 597 } 598 599 func (t *TestGossiper) Gossip(ctx context.Context) error { 600 return t.GossipF(ctx) 601 } 602 603 type FullSet[T Gossipable] struct{} 604 605 func (FullSet[_]) Gossip(context.Context) error { 606 return nil 607 } 608 609 func (FullSet[T]) Add(T) error { 610 return nil 611 } 612 613 func (FullSet[T]) Has(ids.ID) bool { 614 return true 615 } 616 617 func (FullSet[T]) Iterate(func(gossipable T) bool) {} 618 619 func (FullSet[_]) GetFilter() ([]byte, []byte) { 620 return bloom.FullFilter.Marshal(), ids.Empty[:] 621 }