github.com/MetalBlockchain/metalgo@v1.11.9/snow/engine/snowman/transitive.go (about) 1 // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. 2 // See the file LICENSE for licensing terms. 3 4 package snowman 5 6 import ( 7 "context" 8 "fmt" 9 10 "github.com/prometheus/client_golang/prometheus" 11 "go.uber.org/zap" 12 13 "github.com/MetalBlockchain/metalgo/cache" 14 "github.com/MetalBlockchain/metalgo/cache/metercacher" 15 "github.com/MetalBlockchain/metalgo/ids" 16 "github.com/MetalBlockchain/metalgo/proto/pb/p2p" 17 "github.com/MetalBlockchain/metalgo/snow" 18 "github.com/MetalBlockchain/metalgo/snow/consensus/snowman" 19 "github.com/MetalBlockchain/metalgo/snow/consensus/snowman/poll" 20 "github.com/MetalBlockchain/metalgo/snow/engine/common" 21 "github.com/MetalBlockchain/metalgo/snow/engine/common/tracker" 22 "github.com/MetalBlockchain/metalgo/snow/engine/snowman/ancestor" 23 "github.com/MetalBlockchain/metalgo/snow/engine/snowman/job" 24 "github.com/MetalBlockchain/metalgo/snow/validators" 25 "github.com/MetalBlockchain/metalgo/utils/bag" 26 "github.com/MetalBlockchain/metalgo/utils/bimap" 27 "github.com/MetalBlockchain/metalgo/utils/constants" 28 "github.com/MetalBlockchain/metalgo/utils/logging" 29 "github.com/MetalBlockchain/metalgo/utils/math" 30 "github.com/MetalBlockchain/metalgo/utils/set" 31 "github.com/MetalBlockchain/metalgo/utils/units" 32 ) 33 34 const nonVerifiedCacheSize = 64 * units.MiB 35 36 var _ common.Engine = (*Transitive)(nil) 37 38 func cachedBlockSize(_ ids.ID, blk snowman.Block) int { 39 return ids.IDLen + len(blk.Bytes()) + constants.PointerOverhead 40 } 41 42 // Transitive implements the Engine interface by attempting to fetch all 43 // Transitive dependencies. 44 type Transitive struct { 45 Config 46 *metrics 47 48 // list of NoOpsHandler for messages dropped by engine 49 common.StateSummaryFrontierHandler 50 common.AcceptedStateSummaryHandler 51 common.AcceptedFrontierHandler 52 common.AcceptedHandler 53 common.AncestorsHandler 54 common.AppHandler 55 validators.Connector 56 57 requestID uint32 58 59 // track outstanding preference requests 60 polls poll.Set 61 62 // blocks that have we have sent get requests for but haven't yet received 63 blkReqs *bimap.BiMap[common.Request, ids.ID] 64 blkReqSourceMetric map[common.Request]prometheus.Counter 65 66 // blocks that are queued to be issued to consensus once missing dependencies are fetched 67 // Block ID --> Block 68 pending map[ids.ID]snowman.Block 69 70 // Block ID --> Parent ID 71 nonVerifieds ancestor.Tree 72 73 // Block ID --> Block. 74 // A block is put into this cache if it was not able to be issued. A block 75 // fails to be issued if verification on the block or one of its ancestors 76 // occurs. 77 nonVerifiedCache cache.Cacher[ids.ID, snowman.Block] 78 79 // acceptedFrontiers of the other validators of this chain 80 acceptedFrontiers tracker.Accepted 81 82 // operations that are blocked on a block being issued. This could be 83 // issuing another block, responding to a query, or applying votes to consensus 84 blocked *job.Scheduler[ids.ID] 85 86 // number of times build block needs to be called once the number of 87 // processing blocks has gone below the optimal number. 88 pendingBuildBlocks int 89 } 90 91 func New(config Config) (*Transitive, error) { 92 config.Ctx.Log.Info("initializing consensus engine") 93 94 nonVerifiedCache, err := metercacher.New[ids.ID, snowman.Block]( 95 "non_verified_cache", 96 config.Ctx.Registerer, 97 cache.NewSizedLRU[ids.ID, snowman.Block]( 98 nonVerifiedCacheSize, 99 cachedBlockSize, 100 ), 101 ) 102 if err != nil { 103 return nil, err 104 } 105 106 acceptedFrontiers := tracker.NewAccepted() 107 config.Validators.RegisterSetCallbackListener(config.Ctx.SubnetID, acceptedFrontiers) 108 109 factory, err := poll.NewEarlyTermNoTraversalFactory( 110 config.Params.AlphaPreference, 111 config.Params.AlphaConfidence, 112 config.Ctx.Registerer, 113 ) 114 if err != nil { 115 return nil, err 116 } 117 polls, err := poll.NewSet( 118 factory, 119 config.Ctx.Log, 120 config.Ctx.Registerer, 121 ) 122 if err != nil { 123 return nil, err 124 } 125 126 metrics, err := newMetrics(config.Ctx.Registerer) 127 if err != nil { 128 return nil, err 129 } 130 131 return &Transitive{ 132 Config: config, 133 metrics: metrics, 134 StateSummaryFrontierHandler: common.NewNoOpStateSummaryFrontierHandler(config.Ctx.Log), 135 AcceptedStateSummaryHandler: common.NewNoOpAcceptedStateSummaryHandler(config.Ctx.Log), 136 AcceptedFrontierHandler: common.NewNoOpAcceptedFrontierHandler(config.Ctx.Log), 137 AcceptedHandler: common.NewNoOpAcceptedHandler(config.Ctx.Log), 138 AncestorsHandler: common.NewNoOpAncestorsHandler(config.Ctx.Log), 139 AppHandler: config.VM, 140 Connector: config.VM, 141 pending: make(map[ids.ID]snowman.Block), 142 nonVerifieds: ancestor.NewTree(), 143 nonVerifiedCache: nonVerifiedCache, 144 acceptedFrontiers: acceptedFrontiers, 145 blocked: job.NewScheduler[ids.ID](), 146 polls: polls, 147 blkReqs: bimap.New[common.Request, ids.ID](), 148 blkReqSourceMetric: make(map[common.Request]prometheus.Counter), 149 }, nil 150 } 151 152 func (t *Transitive) Gossip(ctx context.Context) error { 153 lastAcceptedID, lastAcceptedHeight := t.Consensus.LastAccepted() 154 if numProcessing := t.Consensus.NumProcessing(); numProcessing != 0 { 155 t.Ctx.Log.Debug("skipping block gossip", 156 zap.String("reason", "blocks currently processing"), 157 zap.Int("numProcessing", numProcessing), 158 ) 159 160 // repoll is called here to unblock the engine if it previously errored 161 // when attempting to issue a query. This can happen if a subnet was 162 // temporarily misconfigured and there were no validators. 163 t.repoll(ctx) 164 return nil 165 } 166 167 t.Ctx.Log.Verbo("sampling from validators", 168 zap.Stringer("validators", t.Validators), 169 ) 170 171 // Uniform sampling is used here to reduce bandwidth requirements of 172 // nodes with a large amount of stake weight. 173 vdrID, ok := t.ConnectedValidators.SampleValidator() 174 if !ok { 175 t.Ctx.Log.Warn("skipping block gossip", 176 zap.String("reason", "no connected validators"), 177 ) 178 return nil 179 } 180 181 nextHeightToAccept, err := math.Add64(lastAcceptedHeight, 1) 182 if err != nil { 183 t.Ctx.Log.Error("skipping block gossip", 184 zap.String("reason", "block height overflow"), 185 zap.Stringer("blkID", lastAcceptedID), 186 zap.Uint64("lastAcceptedHeight", lastAcceptedHeight), 187 zap.Error(err), 188 ) 189 return nil 190 } 191 192 t.requestID++ 193 t.Sender.SendPullQuery( 194 ctx, 195 set.Of(vdrID), 196 t.requestID, 197 t.Consensus.Preference(), 198 nextHeightToAccept, 199 ) 200 return nil 201 } 202 203 func (t *Transitive) Put(ctx context.Context, nodeID ids.NodeID, requestID uint32, blkBytes []byte) error { 204 blk, err := t.VM.ParseBlock(ctx, blkBytes) 205 if err != nil { 206 if t.Ctx.Log.Enabled(logging.Verbo) { 207 t.Ctx.Log.Verbo("failed to parse block", 208 zap.Stringer("nodeID", nodeID), 209 zap.Uint32("requestID", requestID), 210 zap.Binary("block", blkBytes), 211 zap.Error(err), 212 ) 213 } else { 214 t.Ctx.Log.Debug("failed to parse block", 215 zap.Stringer("nodeID", nodeID), 216 zap.Uint32("requestID", requestID), 217 zap.Error(err), 218 ) 219 } 220 // because GetFailed doesn't utilize the assumption that we actually 221 // sent a Get message, we can safely call GetFailed here to potentially 222 // abandon the request. 223 return t.GetFailed(ctx, nodeID, requestID) 224 } 225 226 var ( 227 req = common.Request{ 228 NodeID: nodeID, 229 RequestID: requestID, 230 } 231 issuedMetric prometheus.Counter 232 ) 233 switch expectedBlkID, ok := t.blkReqs.GetValue(req); { 234 case ok: 235 actualBlkID := blk.ID() 236 if actualBlkID != expectedBlkID { 237 t.Ctx.Log.Debug("incorrect block returned in Put", 238 zap.Stringer("nodeID", nodeID), 239 zap.Uint32("requestID", requestID), 240 zap.Stringer("blkID", actualBlkID), 241 zap.Stringer("expectedBlkID", expectedBlkID), 242 ) 243 // We assume that [blk] is useless because it doesn't match what we 244 // expected. 245 return t.GetFailed(ctx, nodeID, requestID) 246 } 247 248 issuedMetric = t.blkReqSourceMetric[req] 249 default: 250 // This can happen if this block was provided to this engine while a Get 251 // request was outstanding. For example, the block may have been locally 252 // built or the node may have received a PushQuery with this block. 253 // 254 // Note: It is still possible this block will be issued here, because 255 // the block may have previously failed verification. 256 issuedMetric = t.metrics.issued.WithLabelValues(unknownSource) 257 } 258 259 if !t.shouldIssueBlock(blk) { 260 t.metrics.numUselessPutBytes.Add(float64(len(blkBytes))) 261 } 262 263 // issue the block into consensus. If the block has already been issued, 264 // this will be a noop. If this block has missing dependencies, vdr will 265 // receive requests to fill the ancestry. dependencies that have already 266 // been fetched, but with missing dependencies themselves won't be requested 267 // from the vdr. 268 if err := t.issueFrom(ctx, nodeID, blk, issuedMetric); err != nil { 269 return err 270 } 271 return t.executeDeferredWork(ctx) 272 } 273 274 func (t *Transitive) GetFailed(ctx context.Context, nodeID ids.NodeID, requestID uint32) error { 275 // We don't assume that this function is called after a failed Get message. 276 // Check to see if we have an outstanding request and also get what the 277 // request was for if it exists. 278 req := common.Request{ 279 NodeID: nodeID, 280 RequestID: requestID, 281 } 282 blkID, ok := t.blkReqs.DeleteKey(req) 283 if !ok { 284 t.Ctx.Log.Debug("unexpected GetFailed", 285 zap.Stringer("nodeID", nodeID), 286 zap.Uint32("requestID", requestID), 287 ) 288 return nil 289 } 290 delete(t.blkReqSourceMetric, req) 291 292 // Because the get request was dropped, we no longer expect blkID to be 293 // issued. 294 if err := t.blocked.Abandon(ctx, blkID); err != nil { 295 return err 296 } 297 return t.executeDeferredWork(ctx) 298 } 299 300 func (t *Transitive) PullQuery(ctx context.Context, nodeID ids.NodeID, requestID uint32, blkID ids.ID, requestedHeight uint64) error { 301 t.sendChits(ctx, nodeID, requestID, requestedHeight) 302 303 issuedMetric := t.metrics.issued.WithLabelValues(pushGossipSource) 304 305 // Try to issue [blkID] to consensus. 306 // If we're missing an ancestor, request it from [vdr] 307 if err := t.issueFromByID(ctx, nodeID, blkID, issuedMetric); err != nil { 308 return err 309 } 310 311 return t.executeDeferredWork(ctx) 312 } 313 314 func (t *Transitive) PushQuery(ctx context.Context, nodeID ids.NodeID, requestID uint32, blkBytes []byte, requestedHeight uint64) error { 315 t.sendChits(ctx, nodeID, requestID, requestedHeight) 316 317 blk, err := t.VM.ParseBlock(ctx, blkBytes) 318 // If parsing fails, we just drop the request, as we didn't ask for it 319 if err != nil { 320 if t.Ctx.Log.Enabled(logging.Verbo) { 321 t.Ctx.Log.Verbo("failed to parse block", 322 zap.Stringer("nodeID", nodeID), 323 zap.Uint32("requestID", requestID), 324 zap.Binary("block", blkBytes), 325 zap.Error(err), 326 ) 327 } else { 328 t.Ctx.Log.Debug("failed to parse block", 329 zap.Stringer("nodeID", nodeID), 330 zap.Uint32("requestID", requestID), 331 zap.Error(err), 332 ) 333 } 334 return nil 335 } 336 337 if !t.shouldIssueBlock(blk) { 338 t.metrics.numUselessPushQueryBytes.Add(float64(len(blkBytes))) 339 } 340 341 issuedMetric := t.metrics.issued.WithLabelValues(pushGossipSource) 342 343 // issue the block into consensus. If the block has already been issued, 344 // this will be a noop. If this block has missing dependencies, nodeID will 345 // receive requests to fill the ancestry. dependencies that have already 346 // been fetched, but with missing dependencies themselves won't be requested 347 // from the vdr. 348 if err := t.issueFrom(ctx, nodeID, blk, issuedMetric); err != nil { 349 return err 350 } 351 352 return t.executeDeferredWork(ctx) 353 } 354 355 func (t *Transitive) Chits(ctx context.Context, nodeID ids.NodeID, requestID uint32, preferredID ids.ID, preferredIDAtHeight ids.ID, acceptedID ids.ID) error { 356 t.acceptedFrontiers.SetLastAccepted(nodeID, acceptedID) 357 358 t.Ctx.Log.Verbo("called Chits for the block", 359 zap.Stringer("nodeID", nodeID), 360 zap.Uint32("requestID", requestID), 361 zap.Stringer("preferredID", preferredID), 362 zap.Stringer("preferredIDAtHeight", preferredIDAtHeight), 363 zap.Stringer("acceptedID", acceptedID), 364 ) 365 366 issuedMetric := t.metrics.issued.WithLabelValues(pullGossipSource) 367 if err := t.issueFromByID(ctx, nodeID, preferredID, issuedMetric); err != nil { 368 return err 369 } 370 371 var ( 372 preferredIDAtHeightShouldBlock bool 373 // Invariant: The order of [responseOptions] must be [preferredID] then 374 // (optionally) [preferredIDAtHeight]. During vote application, the 375 // first vote that can be applied will be used. So, the votes should be 376 // populated in order of decreasing height. 377 responseOptions = []ids.ID{preferredID} 378 ) 379 if preferredID != preferredIDAtHeight { 380 if err := t.issueFromByID(ctx, nodeID, preferredIDAtHeight, issuedMetric); err != nil { 381 return err 382 } 383 preferredIDAtHeightShouldBlock = t.canDependOn(preferredIDAtHeight) 384 responseOptions = append(responseOptions, preferredIDAtHeight) 385 } 386 387 // Will record chits once [preferredID] and [preferredIDAtHeight] have been 388 // issued into consensus 389 v := &voter{ 390 t: t, 391 nodeID: nodeID, 392 requestID: requestID, 393 responseOptions: responseOptions, 394 } 395 396 // Wait until [preferredID] and [preferredIDAtHeight] have been issued to 397 // consensus before applying this chit. 398 var deps []ids.ID 399 if t.canDependOn(preferredID) { 400 deps = append(deps, preferredID) 401 } 402 if preferredIDAtHeightShouldBlock { 403 deps = append(deps, preferredIDAtHeight) 404 } 405 406 if err := t.blocked.Schedule(ctx, v, deps...); err != nil { 407 return err 408 } 409 return t.executeDeferredWork(ctx) 410 } 411 412 func (t *Transitive) QueryFailed(ctx context.Context, nodeID ids.NodeID, requestID uint32) error { 413 lastAccepted, ok := t.acceptedFrontiers.LastAccepted(nodeID) 414 if ok { 415 return t.Chits(ctx, nodeID, requestID, lastAccepted, lastAccepted, lastAccepted) 416 } 417 418 v := &voter{ 419 t: t, 420 nodeID: nodeID, 421 requestID: requestID, 422 } 423 if err := t.blocked.Schedule(ctx, v); err != nil { 424 return err 425 } 426 return t.executeDeferredWork(ctx) 427 } 428 429 func (*Transitive) Timeout(context.Context) error { 430 return nil 431 } 432 433 func (*Transitive) Halt(context.Context) {} 434 435 func (t *Transitive) Shutdown(ctx context.Context) error { 436 t.Ctx.Log.Info("shutting down consensus engine") 437 438 t.Ctx.Lock.Lock() 439 defer t.Ctx.Lock.Unlock() 440 441 return t.VM.Shutdown(ctx) 442 } 443 444 func (t *Transitive) Notify(ctx context.Context, msg common.Message) error { 445 switch msg { 446 case common.PendingTxs: 447 // the pending txs message means we should attempt to build a block. 448 t.pendingBuildBlocks++ 449 return t.executeDeferredWork(ctx) 450 case common.StateSyncDone: 451 t.Ctx.StateSyncing.Set(false) 452 return nil 453 default: 454 t.Ctx.Log.Warn("received an unexpected message from the VM", 455 zap.Stringer("messageString", msg), 456 ) 457 return nil 458 } 459 } 460 461 func (t *Transitive) Context() *snow.ConsensusContext { 462 return t.Ctx 463 } 464 465 func (t *Transitive) Start(ctx context.Context, startReqID uint32) error { 466 t.requestID = startReqID 467 lastAcceptedID, err := t.VM.LastAccepted(ctx) 468 if err != nil { 469 return err 470 } 471 472 lastAccepted, err := t.getBlock(ctx, lastAcceptedID) 473 if err != nil { 474 t.Ctx.Log.Error("failed to get last accepted block", 475 zap.Error(err), 476 ) 477 return err 478 } 479 480 // initialize consensus to the last accepted blockID 481 lastAcceptedHeight := lastAccepted.Height() 482 if err := t.Consensus.Initialize(t.Ctx, t.Params, lastAcceptedID, lastAcceptedHeight, lastAccepted.Timestamp()); err != nil { 483 return err 484 } 485 486 // to maintain the invariant that oracle blocks are issued in the correct 487 // preferences, we need to handle the case that we are bootstrapping into an oracle block 488 if oracleBlk, ok := lastAccepted.(snowman.OracleBlock); ok { 489 options, err := oracleBlk.Options(ctx) 490 switch { 491 case err == snowman.ErrNotOracle: 492 // if there aren't blocks we need to deliver on startup, we need to set 493 // the preference to the last accepted block 494 if err := t.VM.SetPreference(ctx, lastAcceptedID); err != nil { 495 return err 496 } 497 case err != nil: 498 return err 499 default: 500 issuedMetric := t.metrics.issued.WithLabelValues(builtSource) 501 for _, blk := range options { 502 // note that deliver will set the VM's preference 503 if err := t.deliver(ctx, t.Ctx.NodeID, blk, false, issuedMetric); err != nil { 504 return err 505 } 506 } 507 } 508 } else if err := t.VM.SetPreference(ctx, lastAcceptedID); err != nil { 509 return err 510 } 511 512 t.Ctx.Log.Info("starting consensus", 513 zap.Stringer("lastAcceptedID", lastAcceptedID), 514 zap.Uint64("lastAcceptedHeight", lastAcceptedHeight), 515 ) 516 t.metrics.bootstrapFinished.Set(1) 517 518 t.Ctx.State.Set(snow.EngineState{ 519 Type: p2p.EngineType_ENGINE_TYPE_SNOWMAN, 520 State: snow.NormalOp, 521 }) 522 if err := t.VM.SetState(ctx, snow.NormalOp); err != nil { 523 return fmt.Errorf("failed to notify VM that consensus is starting: %w", 524 err) 525 } 526 return t.executeDeferredWork(ctx) 527 } 528 529 func (t *Transitive) HealthCheck(ctx context.Context) (interface{}, error) { 530 t.Ctx.Lock.Lock() 531 defer t.Ctx.Lock.Unlock() 532 533 t.Ctx.Log.Verbo("running health check", 534 zap.Uint32("requestID", t.requestID), 535 zap.Stringer("polls", t.polls), 536 zap.Reflect("outstandingBlockRequests", t.blkReqs), 537 zap.Int("numMissingDependencies", t.blocked.NumDependencies()), 538 zap.Int("pendingBuildBlocks", t.pendingBuildBlocks), 539 ) 540 541 consensusIntf, consensusErr := t.Consensus.HealthCheck(ctx) 542 vmIntf, vmErr := t.VM.HealthCheck(ctx) 543 intf := map[string]interface{}{ 544 "consensus": consensusIntf, 545 "vm": vmIntf, 546 } 547 if consensusErr == nil { 548 return intf, vmErr 549 } 550 if vmErr == nil { 551 return intf, consensusErr 552 } 553 return intf, fmt.Errorf("vm: %w ; consensus: %w", vmErr, consensusErr) 554 } 555 556 func (t *Transitive) executeDeferredWork(ctx context.Context) error { 557 if err := t.buildBlocks(ctx); err != nil { 558 return err 559 } 560 561 t.metrics.numRequests.Set(float64(t.blkReqs.Len())) 562 t.metrics.numBlocked.Set(float64(len(t.pending))) 563 t.metrics.numBlockers.Set(float64(t.blocked.NumDependencies())) 564 t.metrics.numNonVerifieds.Set(float64(t.nonVerifieds.Len())) 565 return nil 566 } 567 568 func (t *Transitive) getBlock(ctx context.Context, blkID ids.ID) (snowman.Block, error) { 569 if blk, ok := t.pending[blkID]; ok { 570 return blk, nil 571 } 572 if blk, ok := t.nonVerifiedCache.Get(blkID); ok { 573 return blk, nil 574 } 575 576 return t.VM.GetBlock(ctx, blkID) 577 } 578 579 func (t *Transitive) sendChits(ctx context.Context, nodeID ids.NodeID, requestID uint32, requestedHeight uint64) { 580 lastAcceptedID, lastAcceptedHeight := t.Consensus.LastAccepted() 581 // If we aren't fully verifying blocks, only vote for blocks that are widely 582 // preferred by the validator set. 583 if t.Ctx.StateSyncing.Get() || t.Config.PartialSync { 584 acceptedAtHeight, err := t.VM.GetBlockIDAtHeight(ctx, requestedHeight) 585 if err != nil { 586 // Because we only return accepted state here, it's fairly likely 587 // that the requested height is higher than the last accepted block. 588 // That means that this code path is actually quite common. 589 t.Ctx.Log.Debug("failed fetching accepted block", 590 zap.Stringer("nodeID", nodeID), 591 zap.Uint64("requestedHeight", requestedHeight), 592 zap.Uint64("lastAcceptedHeight", lastAcceptedHeight), 593 zap.Stringer("lastAcceptedID", lastAcceptedID), 594 zap.Error(err), 595 ) 596 acceptedAtHeight = lastAcceptedID 597 } 598 t.Sender.SendChits(ctx, nodeID, requestID, lastAcceptedID, acceptedAtHeight, lastAcceptedID) 599 return 600 } 601 602 var ( 603 preference = t.Consensus.Preference() 604 preferenceAtHeight ids.ID 605 ) 606 if requestedHeight < lastAcceptedHeight { 607 var err error 608 preferenceAtHeight, err = t.VM.GetBlockIDAtHeight(ctx, requestedHeight) 609 if err != nil { 610 // If this chain is pruning historical blocks, it's expected for a 611 // node to be unable to fetch some block IDs. In this case, we fall 612 // back to returning the last accepted ID. 613 // 614 // Because it is possible for a byzantine node to spam requests at 615 // old heights on a pruning network, we log this as debug. However, 616 // this case is unexpected to be hit by correct peers. 617 t.Ctx.Log.Debug("failed fetching accepted block", 618 zap.Stringer("nodeID", nodeID), 619 zap.Uint64("requestedHeight", requestedHeight), 620 zap.Uint64("lastAcceptedHeight", lastAcceptedHeight), 621 zap.Stringer("lastAcceptedID", lastAcceptedID), 622 zap.Error(err), 623 ) 624 t.numMissingAcceptedBlocks.Inc() 625 626 preferenceAtHeight = lastAcceptedID 627 } 628 } else { 629 var ok bool 630 preferenceAtHeight, ok = t.Consensus.PreferenceAtHeight(requestedHeight) 631 if !ok { 632 t.Ctx.Log.Debug("failed fetching processing block", 633 zap.Stringer("nodeID", nodeID), 634 zap.Uint64("requestedHeight", requestedHeight), 635 zap.Uint64("lastAcceptedHeight", lastAcceptedHeight), 636 zap.Stringer("preferredID", preference), 637 ) 638 // If the requested height is higher than our preferred tip, we 639 // don't prefer anything at the requested height yet. 640 preferenceAtHeight = preference 641 } 642 } 643 t.Sender.SendChits(ctx, nodeID, requestID, preference, preferenceAtHeight, lastAcceptedID) 644 } 645 646 // Build blocks if they have been requested and the number of processing blocks 647 // is less than optimal. 648 func (t *Transitive) buildBlocks(ctx context.Context) error { 649 for t.pendingBuildBlocks > 0 && t.Consensus.NumProcessing() < t.Params.OptimalProcessing { 650 t.pendingBuildBlocks-- 651 652 blk, err := t.VM.BuildBlock(ctx) 653 if err != nil { 654 t.Ctx.Log.Debug("failed building block", 655 zap.Error(err), 656 ) 657 t.numBuildsFailed.Inc() 658 return nil 659 } 660 t.numBuilt.Inc() 661 662 // The newly created block should be built on top of the preferred block. 663 // Otherwise, the new block doesn't have the best chance of being confirmed. 664 parentID := blk.Parent() 665 if pref := t.Consensus.Preference(); parentID != pref { 666 t.Ctx.Log.Warn("built block with unexpected parent", 667 zap.Stringer("expectedParentID", pref), 668 zap.Stringer("parentID", parentID), 669 ) 670 } 671 672 issuedMetric := t.metrics.issued.WithLabelValues(builtSource) 673 if err := t.issueWithAncestors(ctx, blk, issuedMetric); err != nil { 674 return err 675 } 676 677 // TODO: Technically this may incorrectly log a warning if the block 678 // that was just built caused votes to be applied such that the block 679 // was rejected or was accepted along with one of its children. This 680 // should be cleaned up to never produce an invalid warning. 681 if t.canIssueChildOn(blk.ID()) { 682 t.Ctx.Log.Verbo("successfully issued new block from the VM") 683 } else { 684 t.Ctx.Log.Warn("block that was just built is not extendable") 685 } 686 } 687 return nil 688 } 689 690 // Issue another poll to the network, asking what it prefers given the block we prefer. 691 // Helps move consensus along. 692 func (t *Transitive) repoll(ctx context.Context) { 693 // if we are issuing a repoll, we should gossip our current preferences to 694 // propagate the most likely branch as quickly as possible 695 prefID := t.Consensus.Preference() 696 697 for i := t.polls.Len(); i < t.Params.ConcurrentRepolls; i++ { 698 t.sendQuery(ctx, prefID, nil, false) 699 } 700 } 701 702 // issueFromByID attempts to issue the branch ending with a block [blkID] into 703 // consensus. 704 // If we do not have [blkID], request it. 705 func (t *Transitive) issueFromByID( 706 ctx context.Context, 707 nodeID ids.NodeID, 708 blkID ids.ID, 709 issuedMetric prometheus.Counter, 710 ) error { 711 blk, err := t.getBlock(ctx, blkID) 712 if err != nil { 713 t.sendRequest(ctx, nodeID, blkID, issuedMetric) 714 return nil 715 } 716 return t.issueFrom(ctx, nodeID, blk, issuedMetric) 717 } 718 719 // issueFrom attempts to issue the branch ending with block [blkID] to 720 // consensus. 721 // If a dependency is missing, it will be requested it from [nodeID]. 722 func (t *Transitive) issueFrom( 723 ctx context.Context, 724 nodeID ids.NodeID, 725 blk snowman.Block, 726 issuedMetric prometheus.Counter, 727 ) error { 728 // issue [blk] and its ancestors to consensus. 729 blkID := blk.ID() 730 for t.shouldIssueBlock(blk) { 731 err := t.issue(ctx, nodeID, blk, false, issuedMetric) 732 if err != nil { 733 return err 734 } 735 736 // If we don't have this ancestor, request it from [nodeID] 737 blkID = blk.Parent() 738 blk, err = t.getBlock(ctx, blkID) 739 if err != nil { 740 t.sendRequest(ctx, nodeID, blkID, issuedMetric) 741 return nil 742 } 743 } 744 745 // Remove any outstanding requests for this block 746 if req, ok := t.blkReqs.DeleteValue(blkID); ok { 747 delete(t.blkReqSourceMetric, req) 748 } 749 750 // If this block isn't pending, make sure nothing is blocked on it. 751 if _, isPending := t.pending[blkID]; !isPending { 752 return t.blocked.Abandon(ctx, blkID) 753 } 754 return nil 755 } 756 757 // issueWithAncestors attempts to issue the branch ending with [blk] to 758 // consensus. 759 // If a dependency is missing and the dependency hasn't been requested, the 760 // issuance will be abandoned. 761 func (t *Transitive) issueWithAncestors( 762 ctx context.Context, 763 blk snowman.Block, 764 issuedMetric prometheus.Counter, 765 ) error { 766 blkID := blk.ID() 767 // issue [blk] and its ancestors into consensus 768 for t.shouldIssueBlock(blk) { 769 err := t.issue(ctx, t.Ctx.NodeID, blk, true, issuedMetric) 770 if err != nil { 771 return err 772 } 773 blkID = blk.Parent() 774 blk, err = t.getBlock(ctx, blkID) 775 if err != nil { 776 break 777 } 778 } 779 780 // There's an outstanding request for this block. We can wait for that 781 // request to succeed or fail. 782 if t.blkReqs.HasValue(blkID) { 783 return nil 784 } 785 786 // If the block wasn't already issued, we have no reason to expect that it 787 // will be able to be issued. 788 return t.blocked.Abandon(ctx, blkID) 789 } 790 791 // Issue [blk] to consensus once its ancestors have been issued. 792 // If [push] is true, a push query will be used. Otherwise, a pull query will be 793 // used. 794 func (t *Transitive) issue( 795 ctx context.Context, 796 nodeID ids.NodeID, 797 blk snowman.Block, 798 push bool, 799 issuedMetric prometheus.Counter, 800 ) error { 801 blkID := blk.ID() 802 803 // mark that the block is queued to be added to consensus once its ancestors have been 804 t.pending[blkID] = blk 805 806 // Remove any outstanding requests for this block 807 if req, ok := t.blkReqs.DeleteValue(blkID); ok { 808 delete(t.blkReqSourceMetric, req) 809 } 810 811 // Will add [blk] to consensus once its ancestors have been 812 i := &issuer{ 813 t: t, 814 nodeID: nodeID, 815 blk: blk, 816 push: push, 817 issuedMetric: issuedMetric, 818 } 819 820 // We know that shouldIssueBlock(blk) is true. This means that parent is 821 // either the last accepted block or is not decided. 822 var deps []ids.ID 823 if parentID := blk.Parent(); !t.canIssueChildOn(parentID) { 824 t.Ctx.Log.Verbo("block waiting for parent to be issued", 825 zap.Stringer("blkID", blkID), 826 zap.Stringer("parentID", parentID), 827 ) 828 deps = append(deps, parentID) 829 } 830 831 return t.blocked.Schedule(ctx, i, deps...) 832 } 833 834 // Request that [vdr] send us block [blkID] 835 func (t *Transitive) sendRequest( 836 ctx context.Context, 837 nodeID ids.NodeID, 838 blkID ids.ID, 839 issuedMetric prometheus.Counter, 840 ) { 841 // There is already an outstanding request for this block 842 if t.blkReqs.HasValue(blkID) { 843 return 844 } 845 846 t.requestID++ 847 req := common.Request{ 848 NodeID: nodeID, 849 RequestID: t.requestID, 850 } 851 t.blkReqs.Put(req, blkID) 852 t.blkReqSourceMetric[req] = issuedMetric 853 854 t.Ctx.Log.Verbo("sending Get request", 855 zap.Stringer("nodeID", nodeID), 856 zap.Uint32("requestID", t.requestID), 857 zap.Stringer("blkID", blkID), 858 ) 859 t.Sender.SendGet(ctx, nodeID, t.requestID, blkID) 860 } 861 862 // Send a query for this block. If push is set to true, blkBytes will be used to 863 // send a PushQuery. Otherwise, blkBytes will be ignored and a PullQuery will be 864 // sent. 865 func (t *Transitive) sendQuery( 866 ctx context.Context, 867 blkID ids.ID, 868 blkBytes []byte, 869 push bool, 870 ) { 871 t.Ctx.Log.Verbo("sampling from validators", 872 zap.Stringer("validators", t.Validators), 873 ) 874 875 vdrIDs, err := t.Validators.Sample(t.Ctx.SubnetID, t.Params.K) 876 if err != nil { 877 t.Ctx.Log.Warn("dropped query for block", 878 zap.String("reason", "insufficient number of validators"), 879 zap.Stringer("blkID", blkID), 880 zap.Int("size", t.Params.K), 881 ) 882 return 883 } 884 885 _, lastAcceptedHeight := t.Consensus.LastAccepted() 886 nextHeightToAccept, err := math.Add64(lastAcceptedHeight, 1) 887 if err != nil { 888 t.Ctx.Log.Error("dropped query for block", 889 zap.String("reason", "block height overflow"), 890 zap.Stringer("blkID", blkID), 891 zap.Uint64("lastAcceptedHeight", lastAcceptedHeight), 892 zap.Error(err), 893 ) 894 return 895 } 896 897 vdrBag := bag.Of(vdrIDs...) 898 t.requestID++ 899 if !t.polls.Add(t.requestID, vdrBag) { 900 t.Ctx.Log.Error("dropped query for block", 901 zap.String("reason", "failed to add poll"), 902 zap.Stringer("blkID", blkID), 903 zap.Uint32("requestID", t.requestID), 904 ) 905 return 906 } 907 908 vdrSet := set.Of(vdrIDs...) 909 if push { 910 t.Sender.SendPushQuery(ctx, vdrSet, t.requestID, blkBytes, nextHeightToAccept) 911 } else { 912 t.Sender.SendPullQuery(ctx, vdrSet, t.requestID, blkID, nextHeightToAccept) 913 } 914 } 915 916 // issue [blk] to consensus 917 // If [push] is true, a push query will be used. Otherwise, a pull query will be 918 // used. 919 func (t *Transitive) deliver( 920 ctx context.Context, 921 nodeID ids.NodeID, 922 blk snowman.Block, 923 push bool, 924 issuedMetric prometheus.Counter, 925 ) error { 926 // we are no longer waiting on adding the block to consensus, so it is no 927 // longer pending 928 blkID := blk.ID() 929 delete(t.pending, blkID) 930 931 parentID := blk.Parent() 932 if !t.canIssueChildOn(parentID) || t.Consensus.Processing(blkID) { 933 // If the parent isn't processing or the last accepted block, then this 934 // block is effectively rejected. 935 // Additionally, if [blkID] is already in the processing set, it 936 // shouldn't be added to consensus again. 937 return t.blocked.Abandon(ctx, blkID) 938 } 939 940 // By ensuring that the parent is either processing or accepted, it is 941 // guaranteed that the parent was successfully verified. This means that 942 // calling Verify on this block is allowed. 943 blkAdded, err := t.addUnverifiedBlockToConsensus(ctx, nodeID, blk, issuedMetric) 944 if err != nil { 945 return err 946 } 947 if !blkAdded { 948 return t.blocked.Abandon(ctx, blkID) 949 } 950 951 // Add all the oracle blocks if they exist. We call verify on all the blocks 952 // and add them to consensus before marking anything as fulfilled to avoid 953 // any potential reentrant bugs. 954 added := []snowman.Block{} 955 dropped := []snowman.Block{} 956 if blk, ok := blk.(snowman.OracleBlock); ok { 957 options, err := blk.Options(ctx) 958 if err != snowman.ErrNotOracle { 959 if err != nil { 960 return err 961 } 962 963 for _, blk := range options { 964 blkAdded, err := t.addUnverifiedBlockToConsensus(ctx, nodeID, blk, issuedMetric) 965 if err != nil { 966 return err 967 } 968 if blkAdded { 969 added = append(added, blk) 970 } else { 971 dropped = append(dropped, blk) 972 } 973 } 974 } 975 } 976 977 if err := t.VM.SetPreference(ctx, t.Consensus.Preference()); err != nil { 978 return err 979 } 980 981 // If the block is now preferred, query the network for its preferences 982 // with this new block. 983 if t.Consensus.IsPreferred(blkID) { 984 t.sendQuery(ctx, blkID, blk.Bytes(), push) 985 } 986 987 if err := t.blocked.Fulfill(ctx, blkID); err != nil { 988 return err 989 } 990 for _, blk := range added { 991 blkID := blk.ID() 992 if t.Consensus.IsPreferred(blkID) { 993 t.sendQuery(ctx, blkID, blk.Bytes(), push) 994 } 995 996 delete(t.pending, blkID) 997 if err := t.blocked.Fulfill(ctx, blkID); err != nil { 998 return err 999 } 1000 if req, ok := t.blkReqs.DeleteValue(blkID); ok { 1001 delete(t.blkReqSourceMetric, req) 1002 } 1003 } 1004 for _, blk := range dropped { 1005 blkID := blk.ID() 1006 delete(t.pending, blkID) 1007 if err := t.blocked.Abandon(ctx, blkID); err != nil { 1008 return err 1009 } 1010 if req, ok := t.blkReqs.DeleteValue(blkID); ok { 1011 delete(t.blkReqSourceMetric, req) 1012 } 1013 } 1014 1015 // It's possible that the blocks we just added to consensus were decided 1016 // immediately by votes that were pending their issuance. If this is the 1017 // case, we should not be requesting any chits. 1018 if t.Consensus.NumProcessing() == 0 { 1019 return nil 1020 } 1021 1022 // If we should issue multiple queries at the same time, we need to repoll 1023 t.repoll(ctx) 1024 return nil 1025 } 1026 1027 func (t *Transitive) addToNonVerifieds(blk snowman.Block) { 1028 // If this block is processing, we don't need to add it to non-verifieds. 1029 blkID := blk.ID() 1030 if t.Consensus.Processing(blkID) { 1031 return 1032 } 1033 parentID := blk.Parent() 1034 // We might still need this block so we can bubble votes to the parent. 1035 // 1036 // If the non-verified set contains the parentID, then we know that the 1037 // parent is not decided and therefore blk is not decided. 1038 // Similarly, if the parent is processing, then the parent is not decided 1039 // and therefore blk is not decided. 1040 if t.nonVerifieds.Has(parentID) || t.Consensus.Processing(parentID) { 1041 t.nonVerifieds.Add(blkID, parentID) 1042 t.nonVerifiedCache.Put(blkID, blk) 1043 } 1044 } 1045 1046 // addUnverifiedBlockToConsensus returns whether the block was added and an 1047 // error if one occurred while adding it to consensus. 1048 func (t *Transitive) addUnverifiedBlockToConsensus( 1049 ctx context.Context, 1050 nodeID ids.NodeID, 1051 blk snowman.Block, 1052 issuedMetric prometheus.Counter, 1053 ) (bool, error) { 1054 blkID := blk.ID() 1055 blkHeight := blk.Height() 1056 1057 // make sure this block is valid 1058 if err := blk.Verify(ctx); err != nil { 1059 t.Ctx.Log.Debug("block verification failed", 1060 zap.Stringer("nodeID", nodeID), 1061 zap.Stringer("blkID", blkID), 1062 zap.Uint64("height", blkHeight), 1063 zap.Error(err), 1064 ) 1065 1066 // if verify fails, then all descendants are also invalid 1067 t.addToNonVerifieds(blk) 1068 return false, nil 1069 } 1070 1071 issuedMetric.Inc() 1072 t.nonVerifieds.Remove(blkID) 1073 t.nonVerifiedCache.Evict(blkID) 1074 t.metrics.issuerStake.Observe(float64(t.Validators.GetWeight(t.Ctx.SubnetID, nodeID))) 1075 t.Ctx.Log.Verbo("adding block to consensus", 1076 zap.Stringer("nodeID", nodeID), 1077 zap.Stringer("blkID", blkID), 1078 zap.Uint64("height", blkHeight), 1079 ) 1080 return true, t.Consensus.Add(&memoryBlock{ 1081 Block: blk, 1082 metrics: t.metrics, 1083 tree: t.nonVerifieds, 1084 }) 1085 } 1086 1087 // getProcessingAncestor finds [initialVote]'s most recent ancestor that is 1088 // processing in consensus. If no ancestor could be found, false is returned. 1089 // 1090 // Note: If [initialVote] is processing, then [initialVote] will be returned. 1091 func (t *Transitive) getProcessingAncestor(ctx context.Context, initialVote ids.ID) (ids.ID, bool) { 1092 // If [bubbledVote] != [initialVote], it is guaranteed that [bubbledVote] is 1093 // in processing. Otherwise, we attempt to iterate through any blocks we 1094 // have at our disposal as a best-effort mechanism to find a valid ancestor. 1095 bubbledVote := t.nonVerifieds.GetAncestor(initialVote) 1096 for { 1097 if t.Consensus.Processing(bubbledVote) { 1098 t.Ctx.Log.Verbo("applying vote", 1099 zap.Stringer("initialVoteID", initialVote), 1100 zap.Stringer("bubbledVoteID", bubbledVote), 1101 ) 1102 if bubbledVote != initialVote { 1103 t.numProcessingAncestorFetchesSucceeded.Inc() 1104 } else { 1105 t.numProcessingAncestorFetchesUnneeded.Inc() 1106 } 1107 return bubbledVote, true 1108 } 1109 1110 blk, err := t.getBlock(ctx, bubbledVote) 1111 // If we cannot retrieve the block, drop [vote] 1112 if err != nil { 1113 t.Ctx.Log.Debug("dropping vote", 1114 zap.String("reason", "ancestor couldn't be fetched"), 1115 zap.Stringer("initialVoteID", initialVote), 1116 zap.Stringer("bubbledVoteID", bubbledVote), 1117 zap.Error(err), 1118 ) 1119 t.numProcessingAncestorFetchesFailed.Inc() 1120 return ids.Empty, false 1121 } 1122 1123 if t.isDecided(blk) { 1124 t.Ctx.Log.Debug("dropping vote", 1125 zap.String("reason", "bubbled vote already decided"), 1126 zap.Stringer("initialVoteID", initialVote), 1127 zap.Stringer("bubbledVoteID", bubbledVote), 1128 zap.Uint64("height", blk.Height()), 1129 ) 1130 t.numProcessingAncestorFetchesDropped.Inc() 1131 return ids.Empty, false 1132 } 1133 1134 bubbledVote = blk.Parent() 1135 } 1136 } 1137 1138 // shouldIssueBlock returns true if the provided block should be enqueued for 1139 // issuance. If the block is already decided, already enqueued, or has already 1140 // been issued, this function will return false. 1141 func (t *Transitive) shouldIssueBlock(blk snowman.Block) bool { 1142 if t.isDecided(blk) { 1143 return false 1144 } 1145 1146 blkID := blk.ID() 1147 _, isPending := t.pending[blkID] 1148 return !isPending && // If the block is already pending, don't issue it again. 1149 !t.Consensus.Processing(blkID) // If the block was previously issued, don't issue it again. 1150 } 1151 1152 // canDependOn reports true if it is guaranteed for the provided block ID to 1153 // eventually either be fulfilled or abandoned. 1154 func (t *Transitive) canDependOn(blkID ids.ID) bool { 1155 _, isPending := t.pending[blkID] 1156 return isPending || t.blkReqs.HasValue(blkID) 1157 } 1158 1159 // canIssueChildOn reports true if it is valid for a child of parentID to be 1160 // verified and added to consensus. 1161 func (t *Transitive) canIssueChildOn(parentID ids.ID) bool { 1162 lastAcceptedID, _ := t.Consensus.LastAccepted() 1163 return parentID == lastAcceptedID || t.Consensus.Processing(parentID) 1164 } 1165 1166 // isDecided reports true if the provided block's height implies that the block 1167 // is either Accepted or Rejected. 1168 func (t *Transitive) isDecided(blk snowman.Block) bool { 1169 height := blk.Height() 1170 lastAcceptedID, lastAcceptedHeight := t.Consensus.LastAccepted() 1171 if height <= lastAcceptedHeight { 1172 return true // block is either accepted or rejected 1173 } 1174 1175 // This is guaranteed not to underflow because the above check ensures 1176 // [height] > 0. 1177 parentHeight := height - 1 1178 parentID := blk.Parent() 1179 return parentHeight == lastAcceptedHeight && parentID != lastAcceptedID // the parent was rejected 1180 }