github.com/MetalBlockchain/metalgo@v1.11.9/network/p2p/gossip/gossip.go (about) 1 // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. 2 // See the file LICENSE for licensing terms. 3 4 package gossip 5 6 import ( 7 "context" 8 "errors" 9 "fmt" 10 "sync" 11 "time" 12 13 "github.com/prometheus/client_golang/prometheus" 14 "go.uber.org/zap" 15 16 "github.com/MetalBlockchain/metalgo/cache" 17 "github.com/MetalBlockchain/metalgo/ids" 18 "github.com/MetalBlockchain/metalgo/network/p2p" 19 "github.com/MetalBlockchain/metalgo/snow/engine/common" 20 "github.com/MetalBlockchain/metalgo/utils/bloom" 21 "github.com/MetalBlockchain/metalgo/utils/buffer" 22 "github.com/MetalBlockchain/metalgo/utils/logging" 23 "github.com/MetalBlockchain/metalgo/utils/set" 24 ) 25 26 const ( 27 ioLabel = "io" 28 sentIO = "sent" 29 receivedIO = "received" 30 31 typeLabel = "type" 32 pushType = "push" 33 pullType = "pull" 34 unsentType = "unsent" 35 sentType = "sent" 36 37 defaultGossipableCount = 64 38 ) 39 40 var ( 41 _ Gossiper = (*ValidatorGossiper)(nil) 42 _ Gossiper = (*PullGossiper[*testTx])(nil) 43 _ Gossiper = (*NoOpGossiper)(nil) 44 45 _ Set[*testTx] = (*EmptySet[*testTx])(nil) 46 _ Set[*testTx] = (*FullSet[*testTx])(nil) 47 48 ioTypeLabels = []string{ioLabel, typeLabel} 49 sentPushLabels = prometheus.Labels{ 50 ioLabel: sentIO, 51 typeLabel: pushType, 52 } 53 receivedPushLabels = prometheus.Labels{ 54 ioLabel: receivedIO, 55 typeLabel: pushType, 56 } 57 sentPullLabels = prometheus.Labels{ 58 ioLabel: sentIO, 59 typeLabel: pullType, 60 } 61 receivedPullLabels = prometheus.Labels{ 62 ioLabel: receivedIO, 63 typeLabel: pullType, 64 } 65 typeLabels = []string{typeLabel} 66 unsentLabels = prometheus.Labels{ 67 typeLabel: unsentType, 68 } 69 sentLabels = prometheus.Labels{ 70 typeLabel: sentType, 71 } 72 73 ErrInvalidNumValidators = errors.New("num validators cannot be negative") 74 ErrInvalidNumNonValidators = errors.New("num non-validators cannot be negative") 75 ErrInvalidNumPeers = errors.New("num peers cannot be negative") 76 ErrInvalidNumToGossip = errors.New("must gossip to at least one peer") 77 ErrInvalidDiscardedSize = errors.New("discarded size cannot be negative") 78 ErrInvalidTargetGossipSize = errors.New("target gossip size cannot be negative") 79 ErrInvalidRegossipFrequency = errors.New("re-gossip frequency cannot be negative") 80 81 errEmptySetCantAdd = errors.New("empty set can not add") 82 ) 83 84 // Gossiper gossips Gossipables to other nodes 85 type Gossiper interface { 86 // Gossip runs a cycle of gossip. Returns an error if we failed to gossip. 87 Gossip(ctx context.Context) error 88 } 89 90 // ValidatorGossiper only calls [Gossip] if the given node is a validator 91 type ValidatorGossiper struct { 92 Gossiper 93 94 NodeID ids.NodeID 95 Validators p2p.ValidatorSet 96 } 97 98 // Metrics that are tracked across a gossip protocol. A given protocol should 99 // only use a single instance of Metrics. 100 type Metrics struct { 101 count *prometheus.CounterVec 102 bytes *prometheus.CounterVec 103 tracking *prometheus.GaugeVec 104 trackingLifetimeAverage prometheus.Gauge 105 topValidators *prometheus.GaugeVec 106 } 107 108 // NewMetrics returns a common set of metrics 109 func NewMetrics( 110 metrics prometheus.Registerer, 111 namespace string, 112 ) (Metrics, error) { 113 m := Metrics{ 114 count: prometheus.NewCounterVec( 115 prometheus.CounterOpts{ 116 Namespace: namespace, 117 Name: "gossip_count", 118 Help: "amount of gossip (n)", 119 }, 120 ioTypeLabels, 121 ), 122 bytes: prometheus.NewCounterVec( 123 prometheus.CounterOpts{ 124 Namespace: namespace, 125 Name: "gossip_bytes", 126 Help: "amount of gossip (bytes)", 127 }, 128 ioTypeLabels, 129 ), 130 tracking: prometheus.NewGaugeVec( 131 prometheus.GaugeOpts{ 132 Namespace: namespace, 133 Name: "gossip_tracking", 134 Help: "number of gossipables being tracked", 135 }, 136 typeLabels, 137 ), 138 trackingLifetimeAverage: prometheus.NewGauge(prometheus.GaugeOpts{ 139 Namespace: namespace, 140 Name: "gossip_tracking_lifetime_average", 141 Help: "average duration a gossipable has been tracked (ns)", 142 }), 143 topValidators: prometheus.NewGaugeVec( 144 prometheus.GaugeOpts{ 145 Namespace: namespace, 146 Name: "top_validators", 147 Help: "number of validators gossipables are sent to due to stake", 148 }, 149 typeLabels, 150 ), 151 } 152 err := errors.Join( 153 metrics.Register(m.count), 154 metrics.Register(m.bytes), 155 metrics.Register(m.tracking), 156 metrics.Register(m.trackingLifetimeAverage), 157 metrics.Register(m.topValidators), 158 ) 159 return m, err 160 } 161 162 func (m *Metrics) observeMessage(labels prometheus.Labels, count int, bytes int) error { 163 countMetric, err := m.count.GetMetricWith(labels) 164 if err != nil { 165 return fmt.Errorf("failed to get count metric: %w", err) 166 } 167 168 bytesMetric, err := m.bytes.GetMetricWith(labels) 169 if err != nil { 170 return fmt.Errorf("failed to get bytes metric: %w", err) 171 } 172 173 countMetric.Add(float64(count)) 174 bytesMetric.Add(float64(bytes)) 175 return nil 176 } 177 178 func (v ValidatorGossiper) Gossip(ctx context.Context) error { 179 if !v.Validators.Has(ctx, v.NodeID) { 180 return nil 181 } 182 183 return v.Gossiper.Gossip(ctx) 184 } 185 186 func NewPullGossiper[T Gossipable]( 187 log logging.Logger, 188 marshaller Marshaller[T], 189 set Set[T], 190 client *p2p.Client, 191 metrics Metrics, 192 pollSize int, 193 ) *PullGossiper[T] { 194 return &PullGossiper[T]{ 195 log: log, 196 marshaller: marshaller, 197 set: set, 198 client: client, 199 metrics: metrics, 200 pollSize: pollSize, 201 } 202 } 203 204 type PullGossiper[T Gossipable] struct { 205 log logging.Logger 206 marshaller Marshaller[T] 207 set Set[T] 208 client *p2p.Client 209 metrics Metrics 210 pollSize int 211 } 212 213 func (p *PullGossiper[_]) Gossip(ctx context.Context) error { 214 msgBytes, err := MarshalAppRequest(p.set.GetFilter()) 215 if err != nil { 216 return err 217 } 218 219 for i := 0; i < p.pollSize; i++ { 220 err := p.client.AppRequestAny(ctx, msgBytes, p.handleResponse) 221 if err != nil && !errors.Is(err, p2p.ErrNoPeers) { 222 return err 223 } 224 } 225 226 return nil 227 } 228 229 func (p *PullGossiper[_]) handleResponse( 230 _ context.Context, 231 nodeID ids.NodeID, 232 responseBytes []byte, 233 err error, 234 ) { 235 if err != nil { 236 p.log.Debug( 237 "failed gossip request", 238 zap.Stringer("nodeID", nodeID), 239 zap.Error(err), 240 ) 241 return 242 } 243 244 gossip, err := ParseAppResponse(responseBytes) 245 if err != nil { 246 p.log.Debug("failed to unmarshal gossip response", zap.Error(err)) 247 return 248 } 249 250 receivedBytes := 0 251 for _, bytes := range gossip { 252 receivedBytes += len(bytes) 253 254 gossipable, err := p.marshaller.UnmarshalGossip(bytes) 255 if err != nil { 256 p.log.Debug( 257 "failed to unmarshal gossip", 258 zap.Stringer("nodeID", nodeID), 259 zap.Error(err), 260 ) 261 continue 262 } 263 264 gossipID := gossipable.GossipID() 265 p.log.Debug( 266 "received gossip", 267 zap.Stringer("nodeID", nodeID), 268 zap.Stringer("id", gossipID), 269 ) 270 if err := p.set.Add(gossipable); err != nil { 271 p.log.Debug( 272 "failed to add gossip to the known set", 273 zap.Stringer("nodeID", nodeID), 274 zap.Stringer("id", gossipID), 275 zap.Error(err), 276 ) 277 continue 278 } 279 } 280 281 if err := p.metrics.observeMessage(receivedPullLabels, len(gossip), receivedBytes); err != nil { 282 p.log.Error("failed to update metrics", 283 zap.Error(err), 284 ) 285 } 286 } 287 288 // NewPushGossiper returns an instance of PushGossiper 289 func NewPushGossiper[T Gossipable]( 290 marshaller Marshaller[T], 291 mempool Set[T], 292 validators p2p.ValidatorSubset, 293 client *p2p.Client, 294 metrics Metrics, 295 gossipParams BranchingFactor, 296 regossipParams BranchingFactor, 297 discardedSize int, 298 targetGossipSize int, 299 maxRegossipFrequency time.Duration, 300 ) (*PushGossiper[T], error) { 301 if err := gossipParams.Verify(); err != nil { 302 return nil, fmt.Errorf("invalid gossip params: %w", err) 303 } 304 if err := regossipParams.Verify(); err != nil { 305 return nil, fmt.Errorf("invalid regossip params: %w", err) 306 } 307 switch { 308 case discardedSize < 0: 309 return nil, ErrInvalidDiscardedSize 310 case targetGossipSize < 0: 311 return nil, ErrInvalidTargetGossipSize 312 case maxRegossipFrequency < 0: 313 return nil, ErrInvalidRegossipFrequency 314 } 315 316 return &PushGossiper[T]{ 317 marshaller: marshaller, 318 set: mempool, 319 validators: validators, 320 client: client, 321 metrics: metrics, 322 gossipParams: gossipParams, 323 regossipParams: regossipParams, 324 targetGossipSize: targetGossipSize, 325 maxRegossipFrequency: maxRegossipFrequency, 326 327 tracking: make(map[ids.ID]*tracking), 328 toGossip: buffer.NewUnboundedDeque[T](0), 329 toRegossip: buffer.NewUnboundedDeque[T](0), 330 discarded: &cache.LRU[ids.ID, struct{}]{Size: discardedSize}, 331 }, nil 332 } 333 334 // PushGossiper broadcasts gossip to peers randomly in the network 335 type PushGossiper[T Gossipable] struct { 336 marshaller Marshaller[T] 337 set Set[T] 338 validators p2p.ValidatorSubset 339 client *p2p.Client 340 metrics Metrics 341 342 gossipParams BranchingFactor 343 regossipParams BranchingFactor 344 targetGossipSize int 345 maxRegossipFrequency time.Duration 346 347 lock sync.Mutex 348 tracking map[ids.ID]*tracking 349 addedTimeSum float64 // unix nanoseconds 350 toGossip buffer.Deque[T] 351 toRegossip buffer.Deque[T] 352 discarded *cache.LRU[ids.ID, struct{}] // discarded attempts to avoid overgossiping transactions that are frequently dropped 353 } 354 355 type BranchingFactor struct { 356 // StakePercentage determines the percentage of stake that should have 357 // gossip sent to based on the inverse CDF of stake weights. This value does 358 // not account for the connectivity of the nodes. 359 StakePercentage float64 360 // Validators specifies the number of connected validators, in addition to 361 // any validators sent from the StakePercentage parameter, to send gossip 362 // to. These validators are sampled uniformly rather than by stake. 363 Validators int 364 // NonValidators specifies the number of connected non-validators to send 365 // gossip to. 366 NonValidators int 367 // Peers specifies the number of connected validators or non-validators, in 368 // addition to the number sent due to other configs, to send gossip to. 369 Peers int 370 } 371 372 func (b *BranchingFactor) Verify() error { 373 switch { 374 case b.Validators < 0: 375 return ErrInvalidNumValidators 376 case b.NonValidators < 0: 377 return ErrInvalidNumNonValidators 378 case b.Peers < 0: 379 return ErrInvalidNumPeers 380 case max(b.Validators, b.NonValidators, b.Peers) == 0: 381 return ErrInvalidNumToGossip 382 default: 383 return nil 384 } 385 } 386 387 type tracking struct { 388 addedTime float64 // unix nanoseconds 389 lastGossiped time.Time 390 } 391 392 // Gossip flushes any queued gossipables. 393 func (p *PushGossiper[T]) Gossip(ctx context.Context) error { 394 var ( 395 now = time.Now() 396 nowUnixNano = float64(now.UnixNano()) 397 ) 398 399 p.lock.Lock() 400 defer func() { 401 p.updateMetrics(nowUnixNano) 402 p.lock.Unlock() 403 }() 404 405 if len(p.tracking) == 0 { 406 return nil 407 } 408 409 if err := p.gossip( 410 ctx, 411 now, 412 p.gossipParams, 413 p.toGossip, 414 p.toRegossip, 415 &cache.Empty[ids.ID, struct{}]{}, // Don't mark dropped unsent transactions as discarded 416 unsentLabels, 417 ); err != nil { 418 return fmt.Errorf("unexpected error during gossip: %w", err) 419 } 420 421 if err := p.gossip( 422 ctx, 423 now, 424 p.regossipParams, 425 p.toRegossip, 426 p.toRegossip, 427 p.discarded, // Mark dropped sent transactions as discarded 428 sentLabels, 429 ); err != nil { 430 return fmt.Errorf("unexpected error during regossip: %w", err) 431 } 432 return nil 433 } 434 435 func (p *PushGossiper[T]) gossip( 436 ctx context.Context, 437 now time.Time, 438 gossipParams BranchingFactor, 439 toGossip buffer.Deque[T], 440 toRegossip buffer.Deque[T], 441 discarded cache.Cacher[ids.ID, struct{}], 442 metricsLabels prometheus.Labels, 443 ) error { 444 var ( 445 sentBytes = 0 446 gossip = make([][]byte, 0, defaultGossipableCount) 447 maxLastGossipTimeToRegossip = now.Add(-p.maxRegossipFrequency) 448 ) 449 450 for sentBytes < p.targetGossipSize { 451 gossipable, ok := toGossip.PopLeft() 452 if !ok { 453 break 454 } 455 456 // Ensure item is still in the set before we gossip. 457 gossipID := gossipable.GossipID() 458 tracking := p.tracking[gossipID] 459 if !p.set.Has(gossipID) { 460 delete(p.tracking, gossipID) 461 p.addedTimeSum -= tracking.addedTime 462 discarded.Put(gossipID, struct{}{}) // Cache that the item was dropped 463 continue 464 } 465 466 // Ensure we don't attempt to send a gossipable too frequently. 467 if maxLastGossipTimeToRegossip.Before(tracking.lastGossiped) { 468 // Put the gossipable on the front of the queue to keep items sorted 469 // by last issuance time. 470 toGossip.PushLeft(gossipable) 471 break 472 } 473 474 bytes, err := p.marshaller.MarshalGossip(gossipable) 475 if err != nil { 476 delete(p.tracking, gossipID) 477 p.addedTimeSum -= tracking.addedTime 478 return err 479 } 480 481 gossip = append(gossip, bytes) 482 sentBytes += len(bytes) 483 toRegossip.PushRight(gossipable) 484 tracking.lastGossiped = now 485 } 486 487 // If there is nothing to gossip, we can exit early. 488 if len(gossip) == 0 { 489 return nil 490 } 491 492 // Send gossipables to peers 493 msgBytes, err := MarshalAppGossip(gossip) 494 if err != nil { 495 return err 496 } 497 498 if err := p.metrics.observeMessage(sentPushLabels, len(gossip), sentBytes); err != nil { 499 return err 500 } 501 502 topValidatorsMetric, err := p.metrics.topValidators.GetMetricWith(metricsLabels) 503 if err != nil { 504 return fmt.Errorf("failed to get top validators metric: %w", err) 505 } 506 507 validatorsByStake := p.validators.Top(ctx, gossipParams.StakePercentage) 508 topValidatorsMetric.Set(float64(len(validatorsByStake))) 509 510 return p.client.AppGossip( 511 ctx, 512 common.SendConfig{ 513 NodeIDs: set.Of(validatorsByStake...), 514 Validators: gossipParams.Validators, 515 NonValidators: gossipParams.NonValidators, 516 Peers: gossipParams.Peers, 517 }, 518 msgBytes, 519 ) 520 } 521 522 // Add enqueues new gossipables to be pushed. If a gossiable is already tracked, 523 // it is not added again. 524 func (p *PushGossiper[T]) Add(gossipables ...T) { 525 var ( 526 now = time.Now() 527 nowUnixNano = float64(now.UnixNano()) 528 ) 529 530 p.lock.Lock() 531 defer func() { 532 p.updateMetrics(nowUnixNano) 533 p.lock.Unlock() 534 }() 535 536 // Add new gossipables to be sent. 537 for _, gossipable := range gossipables { 538 gossipID := gossipable.GossipID() 539 if _, ok := p.tracking[gossipID]; ok { 540 continue 541 } 542 543 tracking := &tracking{ 544 addedTime: nowUnixNano, 545 } 546 if _, ok := p.discarded.Get(gossipID); ok { 547 // Pretend that recently discarded transactions were just gossiped. 548 tracking.lastGossiped = now 549 p.toRegossip.PushRight(gossipable) 550 } else { 551 p.toGossip.PushRight(gossipable) 552 } 553 p.tracking[gossipID] = tracking 554 p.addedTimeSum += nowUnixNano 555 } 556 } 557 558 func (p *PushGossiper[_]) updateMetrics(nowUnixNano float64) { 559 var ( 560 numUnsent = float64(p.toGossip.Len()) 561 numSent = float64(p.toRegossip.Len()) 562 numTracking = numUnsent + numSent 563 averageLifetime float64 564 ) 565 if numTracking != 0 { 566 averageLifetime = nowUnixNano - p.addedTimeSum/numTracking 567 } 568 569 p.metrics.tracking.With(unsentLabels).Set(numUnsent) 570 p.metrics.tracking.With(sentLabels).Set(numSent) 571 p.metrics.trackingLifetimeAverage.Set(averageLifetime) 572 } 573 574 // Every calls [Gossip] every [frequency] amount of time. 575 func Every(ctx context.Context, log logging.Logger, gossiper Gossiper, frequency time.Duration) { 576 ticker := time.NewTicker(frequency) 577 defer ticker.Stop() 578 579 for { 580 select { 581 case <-ticker.C: 582 if err := gossiper.Gossip(ctx); err != nil { 583 log.Warn("failed to gossip", zap.Error(err)) 584 } 585 case <-ctx.Done(): 586 log.Debug("shutting down gossip") 587 return 588 } 589 } 590 } 591 592 type NoOpGossiper struct{} 593 594 func (NoOpGossiper) Gossip(context.Context) error { 595 return nil 596 } 597 598 type TestGossiper struct { 599 GossipF func(ctx context.Context) error 600 } 601 602 func (t *TestGossiper) Gossip(ctx context.Context) error { 603 return t.GossipF(ctx) 604 } 605 606 type EmptySet[T Gossipable] struct{} 607 608 func (EmptySet[_]) Gossip(context.Context) error { 609 return nil 610 } 611 612 func (EmptySet[T]) Add(T) error { 613 return errEmptySetCantAdd 614 } 615 616 func (EmptySet[T]) Has(ids.ID) bool { 617 return false 618 } 619 620 func (EmptySet[T]) Iterate(func(gossipable T) bool) {} 621 622 func (EmptySet[_]) GetFilter() ([]byte, []byte) { 623 return bloom.EmptyFilter.Marshal(), ids.Empty[:] 624 } 625 626 type FullSet[T Gossipable] struct{} 627 628 func (FullSet[_]) Gossip(context.Context) error { 629 return nil 630 } 631 632 func (FullSet[T]) Add(T) error { 633 return nil 634 } 635 636 func (FullSet[T]) Has(ids.ID) bool { 637 return true 638 } 639 640 func (FullSet[T]) Iterate(func(gossipable T) bool) {} 641 642 func (FullSet[_]) GetFilter() ([]byte, []byte) { 643 return bloom.FullFilter.Marshal(), ids.Empty[:] 644 }