github.com/MetalBlockchain/metalgo@v1.11.9/snow/networking/handler/handler.go (about) 1 // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. 2 // See the file LICENSE for licensing terms. 3 4 package handler 5 6 import ( 7 "context" 8 "errors" 9 "fmt" 10 "sync" 11 "sync/atomic" 12 "time" 13 14 "github.com/prometheus/client_golang/prometheus" 15 "go.opentelemetry.io/otel/attribute" 16 "go.opentelemetry.io/otel/trace" 17 "go.uber.org/zap" 18 "golang.org/x/sync/errgroup" 19 20 "github.com/MetalBlockchain/metalgo/api/health" 21 "github.com/MetalBlockchain/metalgo/ids" 22 "github.com/MetalBlockchain/metalgo/message" 23 "github.com/MetalBlockchain/metalgo/network/p2p" 24 "github.com/MetalBlockchain/metalgo/snow" 25 "github.com/MetalBlockchain/metalgo/snow/engine/common" 26 "github.com/MetalBlockchain/metalgo/snow/networking/tracker" 27 "github.com/MetalBlockchain/metalgo/snow/validators" 28 "github.com/MetalBlockchain/metalgo/subnets" 29 "github.com/MetalBlockchain/metalgo/utils/logging" 30 "github.com/MetalBlockchain/metalgo/utils/set" 31 "github.com/MetalBlockchain/metalgo/utils/timer/mockable" 32 33 p2ppb "github.com/MetalBlockchain/metalgo/proto/pb/p2p" 34 commontracker "github.com/MetalBlockchain/metalgo/snow/engine/common/tracker" 35 ) 36 37 const ( 38 numDispatchersToClose = 3 39 // If a consensus message takes longer than this to process, the handler 40 // will log a warning. 41 syncProcessingTimeWarnLimit = 30 * time.Second 42 ) 43 44 var ( 45 _ Handler = (*handler)(nil) 46 47 errMissingEngine = errors.New("missing engine") 48 errNoStartingGear = errors.New("failed to select starting gear") 49 ) 50 51 type Handler interface { 52 common.Timer 53 health.Checker 54 55 Context() *snow.ConsensusContext 56 // ShouldHandle returns true if the node with the given ID is allowed to send 57 // messages to this chain. If the node is not allowed to send messages to 58 // this chain, the message should be dropped. 59 ShouldHandle(nodeID ids.NodeID) bool 60 61 SetEngineManager(engineManager *EngineManager) 62 GetEngineManager() *EngineManager 63 64 SetOnStopped(onStopped func()) 65 Start(ctx context.Context, recoverPanic bool) 66 Push(ctx context.Context, msg Message) 67 Len() int 68 69 Stop(ctx context.Context) 70 StopWithError(ctx context.Context, err error) 71 // AwaitStopped returns an error if the call would block and [ctx] is done. 72 // Even if [ctx] is done when passed into this function, this function will 73 // return a nil error if it will not block. 74 AwaitStopped(ctx context.Context) (time.Duration, error) 75 } 76 77 // handler passes incoming messages from the network to the consensus engine. 78 // (Actually, it receives the incoming messages from a ChainRouter, but same difference.) 79 type handler struct { 80 metrics *metrics 81 82 // Useful for faking time in tests 83 clock mockable.Clock 84 85 ctx *snow.ConsensusContext 86 // TODO: consider using peerTracker instead of validators 87 // since peerTracker is already tracking validators 88 validators validators.Manager 89 // Receives messages from the VM 90 msgFromVMChan <-chan common.Message 91 preemptTimeouts chan struct{} 92 gossipFrequency time.Duration 93 94 engineManager *EngineManager 95 96 // onStopped is called in a goroutine when this handler finishes shutting 97 // down. If it is nil then it is skipped. 98 onStopped func() 99 100 // Tracks cpu/disk usage caused by each peer. 101 resourceTracker tracker.ResourceTracker 102 103 // Holds messages that [engine] hasn't processed yet. 104 // [unprocessedMsgsCond.L] must be held while accessing [syncMessageQueue]. 105 syncMessageQueue MessageQueue 106 // Holds messages that [engine] hasn't processed yet. 107 // [unprocessedAsyncMsgsCond.L] must be held while accessing [asyncMessageQueue]. 108 asyncMessageQueue MessageQueue 109 // Worker pool for handling asynchronous consensus messages 110 asyncMessagePool errgroup.Group 111 timeouts chan struct{} 112 113 closeOnce sync.Once 114 startClosingTime time.Time 115 totalClosingTime time.Duration 116 closingChan chan struct{} 117 numDispatchersClosed atomic.Uint32 118 // Closed when this handler and [engine] are done shutting down 119 closed chan struct{} 120 121 subnetConnector validators.SubnetConnector 122 123 subnet subnets.Subnet 124 125 // Tracks the peers that are currently connected to this subnet 126 peerTracker commontracker.Peers 127 p2pTracker *p2p.PeerTracker 128 } 129 130 // Initialize this consensus handler 131 // [engine] must be initialized before initializing this handler 132 func New( 133 ctx *snow.ConsensusContext, 134 validators validators.Manager, 135 msgFromVMChan <-chan common.Message, 136 gossipFrequency time.Duration, 137 threadPoolSize int, 138 resourceTracker tracker.ResourceTracker, 139 subnetConnector validators.SubnetConnector, 140 subnet subnets.Subnet, 141 peerTracker commontracker.Peers, 142 p2pTracker *p2p.PeerTracker, 143 reg prometheus.Registerer, 144 ) (Handler, error) { 145 h := &handler{ 146 ctx: ctx, 147 validators: validators, 148 msgFromVMChan: msgFromVMChan, 149 preemptTimeouts: subnet.OnBootstrapCompleted(), 150 gossipFrequency: gossipFrequency, 151 timeouts: make(chan struct{}, 1), 152 closingChan: make(chan struct{}), 153 closed: make(chan struct{}), 154 resourceTracker: resourceTracker, 155 subnetConnector: subnetConnector, 156 subnet: subnet, 157 peerTracker: peerTracker, 158 p2pTracker: p2pTracker, 159 } 160 h.asyncMessagePool.SetLimit(threadPoolSize) 161 162 var err error 163 164 h.metrics, err = newMetrics(reg) 165 if err != nil { 166 return nil, fmt.Errorf("initializing handler metrics errored with: %w", err) 167 } 168 cpuTracker := resourceTracker.CPUTracker() 169 h.syncMessageQueue, err = NewMessageQueue( 170 h.ctx.Log, 171 h.ctx.SubnetID, 172 h.validators, 173 cpuTracker, 174 "sync", 175 reg, 176 ) 177 if err != nil { 178 return nil, fmt.Errorf("initializing sync message queue errored with: %w", err) 179 } 180 h.asyncMessageQueue, err = NewMessageQueue( 181 h.ctx.Log, 182 h.ctx.SubnetID, 183 h.validators, 184 cpuTracker, 185 "async", 186 reg, 187 ) 188 if err != nil { 189 return nil, fmt.Errorf("initializing async message queue errored with: %w", err) 190 } 191 return h, nil 192 } 193 194 func (h *handler) Context() *snow.ConsensusContext { 195 return h.ctx 196 } 197 198 func (h *handler) ShouldHandle(nodeID ids.NodeID) bool { 199 _, ok := h.validators.GetValidator(h.ctx.SubnetID, nodeID) 200 return h.subnet.IsAllowed(nodeID, ok) 201 } 202 203 func (h *handler) SetEngineManager(engineManager *EngineManager) { 204 h.engineManager = engineManager 205 } 206 207 func (h *handler) GetEngineManager() *EngineManager { 208 return h.engineManager 209 } 210 211 func (h *handler) SetOnStopped(onStopped func()) { 212 h.onStopped = onStopped 213 } 214 215 func (h *handler) selectStartingGear(ctx context.Context) (common.Engine, error) { 216 state := h.ctx.State.Get() 217 engines := h.engineManager.Get(state.Type) 218 if engines == nil { 219 return nil, errNoStartingGear 220 } 221 if engines.StateSyncer == nil { 222 return engines.Bootstrapper, nil 223 } 224 225 stateSyncEnabled, err := engines.StateSyncer.IsEnabled(ctx) 226 if err != nil { 227 return nil, err 228 } 229 230 if !stateSyncEnabled { 231 return engines.Bootstrapper, nil 232 } 233 234 // drop bootstrap state from previous runs before starting state sync 235 return engines.StateSyncer, engines.Bootstrapper.Clear(ctx) 236 } 237 238 func (h *handler) Start(ctx context.Context, recoverPanic bool) { 239 gear, err := h.selectStartingGear(ctx) 240 if err != nil { 241 h.ctx.Log.Error("chain failed to select starting gear", 242 zap.Error(err), 243 ) 244 h.shutdown(ctx, h.clock.Time()) 245 return 246 } 247 248 h.ctx.Lock.Lock() 249 err = gear.Start(ctx, 0) 250 h.ctx.Lock.Unlock() 251 if err != nil { 252 h.ctx.Log.Error("chain failed to start", 253 zap.Error(err), 254 ) 255 h.shutdown(ctx, h.clock.Time()) 256 return 257 } 258 259 detachedCtx := context.WithoutCancel(ctx) 260 dispatchSync := func() { 261 h.dispatchSync(detachedCtx) 262 } 263 dispatchAsync := func() { 264 h.dispatchAsync(detachedCtx) 265 } 266 dispatchChans := func() { 267 h.dispatchChans(detachedCtx) 268 } 269 if recoverPanic { 270 go h.ctx.Log.RecoverAndExit(dispatchSync, func() { 271 h.ctx.Log.Error("chain was shutdown due to a panic in the sync dispatcher") 272 }) 273 go h.ctx.Log.RecoverAndExit(dispatchAsync, func() { 274 h.ctx.Log.Error("chain was shutdown due to a panic in the async dispatcher") 275 }) 276 go h.ctx.Log.RecoverAndExit(dispatchChans, func() { 277 h.ctx.Log.Error("chain was shutdown due to a panic in the chan dispatcher") 278 }) 279 } else { 280 go h.ctx.Log.RecoverAndPanic(dispatchSync) 281 go h.ctx.Log.RecoverAndPanic(dispatchAsync) 282 go h.ctx.Log.RecoverAndPanic(dispatchChans) 283 } 284 } 285 286 // Push the message onto the handler's queue 287 func (h *handler) Push(ctx context.Context, msg Message) { 288 switch msg.Op() { 289 case message.AppRequestOp, message.AppErrorOp, message.AppResponseOp, message.AppGossipOp, 290 message.CrossChainAppRequestOp, message.CrossChainAppErrorOp, message.CrossChainAppResponseOp: 291 h.asyncMessageQueue.Push(ctx, msg) 292 default: 293 h.syncMessageQueue.Push(ctx, msg) 294 } 295 } 296 297 func (h *handler) Len() int { 298 return h.syncMessageQueue.Len() + h.asyncMessageQueue.Len() 299 } 300 301 func (h *handler) RegisterTimeout(d time.Duration) { 302 go func() { 303 timer := time.NewTimer(d) 304 defer timer.Stop() 305 306 select { 307 case <-timer.C: 308 case <-h.preemptTimeouts: 309 } 310 311 // If there is already a timeout ready to fire - just drop the 312 // additional timeout. This ensures that all goroutines that are spawned 313 // here are able to close if the chain is shutdown. 314 select { 315 case h.timeouts <- struct{}{}: 316 default: 317 } 318 }() 319 } 320 321 // Note: It is possible for Stop to be called before/concurrently with Start. 322 // 323 // Invariant: Stop must never block. 324 func (h *handler) Stop(ctx context.Context) { 325 h.closeOnce.Do(func() { 326 h.startClosingTime = h.clock.Time() 327 328 // Must hold the locks here to ensure there's no race condition in where 329 // we check the value of [h.closing] after the call to [Signal]. 330 h.syncMessageQueue.Shutdown() 331 h.asyncMessageQueue.Shutdown() 332 close(h.closingChan) 333 334 // TODO: switch this to use a [context.Context] with a cancel function. 335 // 336 // Don't process any more bootstrap messages. If a dispatcher is 337 // processing a bootstrap message, stop. We do this because if we 338 // didn't, and the engine was in the middle of executing state 339 // transitions during bootstrapping, we wouldn't be able to grab 340 // [h.ctx.Lock] until the engine finished executing state transitions, 341 // which may take a long time. As a result, the router would time out on 342 // shutting down this chain. 343 state := h.ctx.State.Get() 344 bootstrapper, ok := h.engineManager.Get(state.Type).Get(snow.Bootstrapping) 345 if !ok { 346 h.ctx.Log.Error("bootstrapping engine doesn't exist", 347 zap.Stringer("type", state.Type), 348 ) 349 return 350 } 351 bootstrapper.Halt(ctx) 352 }) 353 } 354 355 func (h *handler) StopWithError(ctx context.Context, err error) { 356 h.ctx.Log.Fatal("shutting down chain", 357 zap.String("reason", "received an unexpected error"), 358 zap.Error(err), 359 ) 360 h.Stop(ctx) 361 } 362 363 func (h *handler) AwaitStopped(ctx context.Context) (time.Duration, error) { 364 select { 365 case <-h.closed: 366 return h.totalClosingTime, nil 367 default: 368 } 369 370 select { 371 case <-ctx.Done(): 372 return 0, ctx.Err() 373 case <-h.closed: 374 return h.totalClosingTime, nil 375 } 376 } 377 378 func (h *handler) dispatchSync(ctx context.Context) { 379 defer h.closeDispatcher(ctx) 380 381 // Handle sync messages from the router 382 for { 383 // Get the next message we should process. If the handler is shutting 384 // down, we may fail to pop a message. 385 ctx, msg, ok := h.popUnexpiredMsg(h.syncMessageQueue) 386 if !ok { 387 return 388 } 389 390 // If there is an error handling the message, shut down the chain 391 if err := h.handleSyncMsg(ctx, msg); err != nil { 392 h.StopWithError(ctx, fmt.Errorf( 393 "%w while processing sync message: %s from %s", 394 err, 395 msg.Op(), 396 msg.NodeID(), 397 )) 398 return 399 } 400 } 401 } 402 403 func (h *handler) dispatchAsync(ctx context.Context) { 404 defer func() { 405 // We never return an error in any of our functions, so it is safe to 406 // drop any error here. 407 _ = h.asyncMessagePool.Wait() 408 h.closeDispatcher(ctx) 409 }() 410 411 // Handle async messages from the router 412 for { 413 // Get the next message we should process. If the handler is shutting 414 // down, we may fail to pop a message. 415 ctx, msg, ok := h.popUnexpiredMsg(h.asyncMessageQueue) 416 if !ok { 417 return 418 } 419 420 h.handleAsyncMsg(ctx, msg) 421 } 422 } 423 424 func (h *handler) dispatchChans(ctx context.Context) { 425 gossiper := time.NewTicker(h.gossipFrequency) 426 defer func() { 427 gossiper.Stop() 428 h.closeDispatcher(ctx) 429 }() 430 431 // Handle messages generated by the handler and the VM 432 for { 433 var msg message.InboundMessage 434 select { 435 case <-h.closingChan: 436 return 437 438 case vmMSG := <-h.msgFromVMChan: 439 msg = message.InternalVMMessage(h.ctx.NodeID, uint32(vmMSG)) 440 441 case <-gossiper.C: 442 msg = message.InternalGossipRequest(h.ctx.NodeID) 443 444 case <-h.timeouts: 445 msg = message.InternalTimeout(h.ctx.NodeID) 446 } 447 448 if err := h.handleChanMsg(msg); err != nil { 449 h.StopWithError(ctx, fmt.Errorf( 450 "%w while processing chan message: %s", 451 err, 452 msg.Op(), 453 )) 454 return 455 } 456 } 457 } 458 459 // Any returned error is treated as fatal 460 func (h *handler) handleSyncMsg(ctx context.Context, msg Message) error { 461 var ( 462 nodeID = msg.NodeID() 463 op = msg.Op().String() 464 body = msg.Message() 465 startTime = h.clock.Time() 466 // Check if the chain is in normal operation at the start of message 467 // execution (may change during execution) 468 isNormalOp = h.ctx.State.Get().State == snow.NormalOp 469 ) 470 if h.ctx.Log.Enabled(logging.Verbo) { 471 h.ctx.Log.Verbo("forwarding sync message to consensus", 472 zap.Stringer("nodeID", nodeID), 473 zap.String("messageOp", op), 474 zap.Stringer("message", body), 475 ) 476 } else { 477 h.ctx.Log.Debug("forwarding sync message to consensus", 478 zap.Stringer("nodeID", nodeID), 479 zap.String("messageOp", op), 480 ) 481 } 482 h.resourceTracker.StartProcessing(nodeID, startTime) 483 h.ctx.Lock.Lock() 484 lockAcquiredTime := h.clock.Time() 485 defer func() { 486 h.ctx.Lock.Unlock() 487 488 var ( 489 endTime = h.clock.Time() 490 lockingTime = lockAcquiredTime.Sub(startTime) 491 handlingTime = endTime.Sub(lockAcquiredTime) 492 ) 493 h.resourceTracker.StopProcessing(nodeID, endTime) 494 h.metrics.lockingTime.Add(float64(lockingTime)) 495 labels := prometheus.Labels{ 496 opLabel: op, 497 } 498 h.metrics.messages.With(labels).Inc() 499 h.metrics.messageHandlingTime.With(labels).Add(float64(handlingTime)) 500 501 msg.OnFinishedHandling() 502 h.ctx.Log.Debug("finished handling sync message", 503 zap.String("messageOp", op), 504 ) 505 if lockingTime+handlingTime > syncProcessingTimeWarnLimit && isNormalOp { 506 h.ctx.Log.Warn("handling sync message took longer than expected", 507 zap.Duration("lockingTime", lockingTime), 508 zap.Duration("handlingTime", handlingTime), 509 zap.Stringer("nodeID", nodeID), 510 zap.String("messageOp", op), 511 zap.Stringer("message", body), 512 ) 513 } 514 }() 515 516 // We will attempt to pass the message to the requested type for the state 517 // we are currently in. 518 currentState := h.ctx.State.Get() 519 if msg.EngineType == p2ppb.EngineType_ENGINE_TYPE_SNOWMAN && 520 currentState.Type == p2ppb.EngineType_ENGINE_TYPE_AVALANCHE { 521 // The peer is requesting an engine type that hasn't been initialized 522 // yet. This means we know that this isn't a response, so we can safely 523 // drop the message. 524 h.ctx.Log.Debug("dropping sync message", 525 zap.String("reason", "uninitialized engine type"), 526 zap.String("messageOp", op), 527 zap.Stringer("currentEngineType", currentState.Type), 528 zap.Stringer("requestedEngineType", msg.EngineType), 529 ) 530 return nil 531 } 532 533 var engineType p2ppb.EngineType 534 switch msg.EngineType { 535 case p2ppb.EngineType_ENGINE_TYPE_AVALANCHE, p2ppb.EngineType_ENGINE_TYPE_SNOWMAN: 536 // The peer is requesting an engine type that has been initialized, so 537 // we should attempt to honor the request. 538 engineType = msg.EngineType 539 default: 540 // Note: [msg.EngineType] may have been provided by the peer as an 541 // invalid option. I.E. not one of AVALANCHE, SNOWMAN, or UNSPECIFIED. 542 // In this case, we treat the value the same way as UNSPECIFIED. 543 // 544 // If the peer didn't request a specific engine type, we default to the 545 // current engine. 546 engineType = currentState.Type 547 } 548 549 engine, ok := h.engineManager.Get(engineType).Get(currentState.State) 550 if !ok { 551 // This should only happen if the peer is not following the protocol. 552 // This can happen if the chain only has a Snowman engine and the peer 553 // requested an Avalanche engine handle the message. 554 h.ctx.Log.Debug("dropping sync message", 555 zap.String("reason", "uninitialized engine state"), 556 zap.String("messageOp", op), 557 zap.Stringer("currentEngineType", currentState.Type), 558 zap.Stringer("requestedEngineType", msg.EngineType), 559 zap.Stringer("engineState", currentState.State), 560 ) 561 return nil 562 } 563 564 // Invariant: Response messages can never be dropped here. This is because 565 // the timeout has already been cleared. This means the engine 566 // should be invoked with a failure message if parsing of the 567 // response fails. 568 switch msg := body.(type) { 569 // State messages should always be sent to the snowman engine 570 case *p2ppb.GetStateSummaryFrontier: 571 return engine.GetStateSummaryFrontier(ctx, nodeID, msg.RequestId) 572 573 case *p2ppb.StateSummaryFrontier: 574 return engine.StateSummaryFrontier(ctx, nodeID, msg.RequestId, msg.Summary) 575 576 case *message.GetStateSummaryFrontierFailed: 577 return engine.GetStateSummaryFrontierFailed(ctx, nodeID, msg.RequestID) 578 579 case *p2ppb.GetAcceptedStateSummary: 580 return engine.GetAcceptedStateSummary( 581 ctx, 582 nodeID, 583 msg.RequestId, 584 set.Of(msg.Heights...), 585 ) 586 587 case *p2ppb.AcceptedStateSummary: 588 summaryIDs, err := getIDs(msg.SummaryIds) 589 if err != nil { 590 h.ctx.Log.Debug("message with invalid field", 591 zap.Stringer("nodeID", nodeID), 592 zap.Stringer("messageOp", message.AcceptedStateSummaryOp), 593 zap.Uint32("requestID", msg.RequestId), 594 zap.String("field", "SummaryIDs"), 595 zap.Error(err), 596 ) 597 return engine.GetAcceptedStateSummaryFailed(ctx, nodeID, msg.RequestId) 598 } 599 600 return engine.AcceptedStateSummary(ctx, nodeID, msg.RequestId, summaryIDs) 601 602 case *message.GetAcceptedStateSummaryFailed: 603 return engine.GetAcceptedStateSummaryFailed(ctx, nodeID, msg.RequestID) 604 605 // Bootstrapping messages may be forwarded to either avalanche or snowman 606 // engines, depending on the EngineType field 607 case *p2ppb.GetAcceptedFrontier: 608 return engine.GetAcceptedFrontier(ctx, nodeID, msg.RequestId) 609 610 case *p2ppb.AcceptedFrontier: 611 containerID, err := ids.ToID(msg.ContainerId) 612 if err != nil { 613 h.ctx.Log.Debug("message with invalid field", 614 zap.Stringer("nodeID", nodeID), 615 zap.Stringer("messageOp", message.AcceptedFrontierOp), 616 zap.Uint32("requestID", msg.RequestId), 617 zap.String("field", "ContainerID"), 618 zap.Error(err), 619 ) 620 return engine.GetAcceptedFrontierFailed(ctx, nodeID, msg.RequestId) 621 } 622 623 return engine.AcceptedFrontier(ctx, nodeID, msg.RequestId, containerID) 624 625 case *message.GetAcceptedFrontierFailed: 626 return engine.GetAcceptedFrontierFailed(ctx, nodeID, msg.RequestID) 627 628 case *p2ppb.GetAccepted: 629 containerIDs, err := getIDs(msg.ContainerIds) 630 if err != nil { 631 h.ctx.Log.Debug("message with invalid field", 632 zap.Stringer("nodeID", nodeID), 633 zap.Stringer("messageOp", message.GetAcceptedOp), 634 zap.Uint32("requestID", msg.RequestId), 635 zap.String("field", "ContainerIDs"), 636 zap.Error(err), 637 ) 638 return nil 639 } 640 641 return engine.GetAccepted(ctx, nodeID, msg.RequestId, containerIDs) 642 643 case *p2ppb.Accepted: 644 containerIDs, err := getIDs(msg.ContainerIds) 645 if err != nil { 646 h.ctx.Log.Debug("message with invalid field", 647 zap.Stringer("nodeID", nodeID), 648 zap.Stringer("messageOp", message.AcceptedOp), 649 zap.Uint32("requestID", msg.RequestId), 650 zap.String("field", "ContainerIDs"), 651 zap.Error(err), 652 ) 653 return engine.GetAcceptedFailed(ctx, nodeID, msg.RequestId) 654 } 655 656 return engine.Accepted(ctx, nodeID, msg.RequestId, containerIDs) 657 658 case *message.GetAcceptedFailed: 659 return engine.GetAcceptedFailed(ctx, nodeID, msg.RequestID) 660 661 case *p2ppb.GetAncestors: 662 containerID, err := ids.ToID(msg.ContainerId) 663 if err != nil { 664 h.ctx.Log.Debug("dropping message with invalid field", 665 zap.Stringer("nodeID", nodeID), 666 zap.Stringer("messageOp", message.GetAncestorsOp), 667 zap.Uint32("requestID", msg.RequestId), 668 zap.String("field", "ContainerID"), 669 zap.Error(err), 670 ) 671 return nil 672 } 673 674 return engine.GetAncestors(ctx, nodeID, msg.RequestId, containerID) 675 676 case *message.GetAncestorsFailed: 677 return engine.GetAncestorsFailed(ctx, nodeID, msg.RequestID) 678 679 case *p2ppb.Ancestors: 680 return engine.Ancestors(ctx, nodeID, msg.RequestId, msg.Containers) 681 682 case *p2ppb.Get: 683 containerID, err := ids.ToID(msg.ContainerId) 684 if err != nil { 685 h.ctx.Log.Debug("dropping message with invalid field", 686 zap.Stringer("nodeID", nodeID), 687 zap.Stringer("messageOp", message.GetOp), 688 zap.Uint32("requestID", msg.RequestId), 689 zap.String("field", "ContainerID"), 690 zap.Error(err), 691 ) 692 return nil 693 } 694 695 return engine.Get(ctx, nodeID, msg.RequestId, containerID) 696 697 case *message.GetFailed: 698 return engine.GetFailed(ctx, nodeID, msg.RequestID) 699 700 case *p2ppb.Put: 701 return engine.Put(ctx, nodeID, msg.RequestId, msg.Container) 702 703 case *p2ppb.PushQuery: 704 return engine.PushQuery(ctx, nodeID, msg.RequestId, msg.Container, msg.RequestedHeight) 705 706 case *p2ppb.PullQuery: 707 containerID, err := ids.ToID(msg.ContainerId) 708 if err != nil { 709 h.ctx.Log.Debug("dropping message with invalid field", 710 zap.Stringer("nodeID", nodeID), 711 zap.Stringer("messageOp", message.PullQueryOp), 712 zap.Uint32("requestID", msg.RequestId), 713 zap.String("field", "ContainerID"), 714 zap.Error(err), 715 ) 716 return nil 717 } 718 719 return engine.PullQuery(ctx, nodeID, msg.RequestId, containerID, msg.RequestedHeight) 720 721 case *p2ppb.Chits: 722 preferredID, err := ids.ToID(msg.PreferredId) 723 if err != nil { 724 h.ctx.Log.Debug("message with invalid field", 725 zap.Stringer("nodeID", nodeID), 726 zap.Stringer("messageOp", message.ChitsOp), 727 zap.Uint32("requestID", msg.RequestId), 728 zap.String("field", "PreferredID"), 729 zap.Error(err), 730 ) 731 return engine.QueryFailed(ctx, nodeID, msg.RequestId) 732 } 733 734 preferredIDAtHeight, err := ids.ToID(msg.PreferredIdAtHeight) 735 if err != nil { 736 h.ctx.Log.Debug("message with invalid field", 737 zap.Stringer("nodeID", nodeID), 738 zap.Stringer("messageOp", message.ChitsOp), 739 zap.Uint32("requestID", msg.RequestId), 740 zap.String("field", "PreferredIDAtHeight"), 741 zap.Error(err), 742 ) 743 return engine.QueryFailed(ctx, nodeID, msg.RequestId) 744 } 745 746 acceptedID, err := ids.ToID(msg.AcceptedId) 747 if err != nil { 748 h.ctx.Log.Debug("message with invalid field", 749 zap.Stringer("nodeID", nodeID), 750 zap.Stringer("messageOp", message.ChitsOp), 751 zap.Uint32("requestID", msg.RequestId), 752 zap.String("field", "AcceptedID"), 753 zap.Error(err), 754 ) 755 return engine.QueryFailed(ctx, nodeID, msg.RequestId) 756 } 757 758 return engine.Chits(ctx, nodeID, msg.RequestId, preferredID, preferredIDAtHeight, acceptedID) 759 760 case *message.QueryFailed: 761 return engine.QueryFailed(ctx, nodeID, msg.RequestID) 762 763 // Connection messages can be sent to the currently executing engine 764 case *message.Connected: 765 err := h.peerTracker.Connected(ctx, nodeID, msg.NodeVersion) 766 if err != nil { 767 return err 768 } 769 h.p2pTracker.Connected(nodeID, msg.NodeVersion) 770 return engine.Connected(ctx, nodeID, msg.NodeVersion) 771 772 case *message.ConnectedSubnet: 773 return h.subnetConnector.ConnectedSubnet(ctx, nodeID, msg.SubnetID) 774 775 case *message.Disconnected: 776 err := h.peerTracker.Disconnected(ctx, nodeID) 777 if err != nil { 778 return err 779 } 780 h.p2pTracker.Disconnected(nodeID) 781 return engine.Disconnected(ctx, nodeID) 782 783 default: 784 return fmt.Errorf( 785 "attempt to submit unhandled sync msg %s from %s", 786 op, nodeID, 787 ) 788 } 789 } 790 791 func (h *handler) handleAsyncMsg(ctx context.Context, msg Message) { 792 h.asyncMessagePool.Go(func() error { 793 if err := h.executeAsyncMsg(ctx, msg); err != nil { 794 h.StopWithError(ctx, fmt.Errorf( 795 "%w while processing async message: %s from %s", 796 err, 797 msg.Op(), 798 msg.NodeID(), 799 )) 800 } 801 return nil 802 }) 803 } 804 805 // Any returned error is treated as fatal 806 func (h *handler) executeAsyncMsg(ctx context.Context, msg Message) error { 807 var ( 808 nodeID = msg.NodeID() 809 op = msg.Op().String() 810 body = msg.Message() 811 startTime = h.clock.Time() 812 ) 813 if h.ctx.Log.Enabled(logging.Verbo) { 814 h.ctx.Log.Verbo("forwarding async message to consensus", 815 zap.Stringer("nodeID", nodeID), 816 zap.String("messageOp", op), 817 zap.Stringer("message", body), 818 ) 819 } else { 820 h.ctx.Log.Debug("forwarding async message to consensus", 821 zap.Stringer("nodeID", nodeID), 822 zap.String("messageOp", op), 823 ) 824 } 825 h.resourceTracker.StartProcessing(nodeID, startTime) 826 defer func() { 827 var ( 828 endTime = h.clock.Time() 829 handlingTime = endTime.Sub(startTime) 830 ) 831 h.resourceTracker.StopProcessing(nodeID, endTime) 832 labels := prometheus.Labels{ 833 opLabel: op, 834 } 835 h.metrics.messages.With(labels).Inc() 836 h.metrics.messageHandlingTime.With(labels).Add(float64(handlingTime)) 837 838 msg.OnFinishedHandling() 839 h.ctx.Log.Debug("finished handling async message", 840 zap.String("messageOp", op), 841 ) 842 }() 843 844 state := h.ctx.State.Get() 845 engine, ok := h.engineManager.Get(state.Type).Get(state.State) 846 if !ok { 847 return fmt.Errorf( 848 "%w %s running %s", 849 errMissingEngine, 850 state.State, 851 state.Type, 852 ) 853 } 854 855 switch m := body.(type) { 856 case *p2ppb.AppRequest: 857 return engine.AppRequest( 858 ctx, 859 nodeID, 860 m.RequestId, 861 msg.Expiration(), 862 m.AppBytes, 863 ) 864 865 case *p2ppb.AppResponse: 866 return engine.AppResponse(ctx, nodeID, m.RequestId, m.AppBytes) 867 868 case *p2ppb.AppError: 869 err := &common.AppError{ 870 Code: m.ErrorCode, 871 Message: m.ErrorMessage, 872 } 873 874 return engine.AppRequestFailed( 875 ctx, 876 nodeID, 877 m.RequestId, 878 err, 879 ) 880 881 case *p2ppb.AppGossip: 882 return engine.AppGossip(ctx, nodeID, m.AppBytes) 883 884 case *message.CrossChainAppRequest: 885 return engine.CrossChainAppRequest( 886 ctx, 887 m.SourceChainID, 888 m.RequestID, 889 msg.Expiration(), 890 m.Message, 891 ) 892 893 case *message.CrossChainAppResponse: 894 return engine.CrossChainAppResponse( 895 ctx, 896 m.SourceChainID, 897 m.RequestID, 898 m.Message, 899 ) 900 901 case *message.CrossChainAppRequestFailed: 902 err := &common.AppError{ 903 Code: m.ErrorCode, 904 Message: m.ErrorMessage, 905 } 906 907 return engine.CrossChainAppRequestFailed( 908 ctx, 909 m.SourceChainID, 910 m.RequestID, 911 err, 912 ) 913 914 default: 915 return fmt.Errorf( 916 "attempt to submit unhandled async msg %s from %s", 917 op, nodeID, 918 ) 919 } 920 } 921 922 // Any returned error is treated as fatal 923 func (h *handler) handleChanMsg(msg message.InboundMessage) error { 924 var ( 925 op = msg.Op().String() 926 body = msg.Message() 927 startTime = h.clock.Time() 928 // Check if the chain is in normal operation at the start of message 929 // execution (may change during execution) 930 isNormalOp = h.ctx.State.Get().State == snow.NormalOp 931 ) 932 if h.ctx.Log.Enabled(logging.Verbo) { 933 h.ctx.Log.Verbo("forwarding chan message to consensus", 934 zap.String("messageOp", op), 935 zap.Stringer("message", body), 936 ) 937 } else { 938 h.ctx.Log.Debug("forwarding chan message to consensus", 939 zap.String("messageOp", op), 940 ) 941 } 942 h.ctx.Lock.Lock() 943 lockAcquiredTime := h.clock.Time() 944 defer func() { 945 h.ctx.Lock.Unlock() 946 947 var ( 948 endTime = h.clock.Time() 949 lockingTime = lockAcquiredTime.Sub(startTime) 950 handlingTime = endTime.Sub(lockAcquiredTime) 951 ) 952 h.metrics.lockingTime.Add(float64(lockingTime)) 953 labels := prometheus.Labels{ 954 opLabel: op, 955 } 956 h.metrics.messages.With(labels).Inc() 957 h.metrics.messageHandlingTime.With(labels).Add(float64(handlingTime)) 958 959 msg.OnFinishedHandling() 960 h.ctx.Log.Debug("finished handling chan message", 961 zap.String("messageOp", op), 962 ) 963 if lockingTime+handlingTime > syncProcessingTimeWarnLimit && isNormalOp { 964 h.ctx.Log.Warn("handling chan message took longer than expected", 965 zap.Duration("lockingTime", lockingTime), 966 zap.Duration("handlingTime", handlingTime), 967 zap.String("messageOp", op), 968 zap.Stringer("message", body), 969 ) 970 } 971 }() 972 973 state := h.ctx.State.Get() 974 engine, ok := h.engineManager.Get(state.Type).Get(state.State) 975 if !ok { 976 return fmt.Errorf( 977 "%w %s running %s", 978 errMissingEngine, 979 state.State, 980 state.Type, 981 ) 982 } 983 984 switch msg := body.(type) { 985 case *message.VMMessage: 986 return engine.Notify(context.TODO(), common.Message(msg.Notification)) 987 988 case *message.GossipRequest: 989 return engine.Gossip(context.TODO()) 990 991 case *message.Timeout: 992 return engine.Timeout(context.TODO()) 993 994 default: 995 return fmt.Errorf( 996 "attempt to submit unhandled chan msg %s", 997 op, 998 ) 999 } 1000 } 1001 1002 func (h *handler) popUnexpiredMsg(queue MessageQueue) (context.Context, Message, bool) { 1003 for { 1004 // Get the next message we should process. If the handler is shutting 1005 // down, we may fail to pop a message. 1006 ctx, msg, ok := queue.Pop() 1007 if !ok { 1008 return nil, Message{}, false 1009 } 1010 1011 // If this message's deadline has passed, don't process it. 1012 if expiration := msg.Expiration(); h.clock.Time().After(expiration) { 1013 op := msg.Op().String() 1014 h.ctx.Log.Debug("dropping message", 1015 zap.String("reason", "timeout"), 1016 zap.Stringer("nodeID", msg.NodeID()), 1017 zap.String("messageOp", op), 1018 ) 1019 span := trace.SpanFromContext(ctx) 1020 span.AddEvent("dropping message", trace.WithAttributes( 1021 attribute.String("reason", "timeout"), 1022 )) 1023 h.metrics.expired.With(prometheus.Labels{ 1024 opLabel: op, 1025 }).Inc() 1026 msg.OnFinishedHandling() 1027 continue 1028 } 1029 1030 return ctx, msg, true 1031 } 1032 } 1033 1034 // Invariant: if closeDispatcher is called, Stop has already been called. 1035 func (h *handler) closeDispatcher(ctx context.Context) { 1036 if h.numDispatchersClosed.Add(1) < numDispatchersToClose { 1037 return 1038 } 1039 1040 h.shutdown(ctx, h.startClosingTime) 1041 } 1042 1043 // Note: shutdown is only called after all message dispatchers have exited or if 1044 // no message dispatchers ever started. 1045 func (h *handler) shutdown(ctx context.Context, startClosingTime time.Time) { 1046 defer func() { 1047 if h.onStopped != nil { 1048 go h.onStopped() 1049 } 1050 1051 h.totalClosingTime = h.clock.Time().Sub(startClosingTime) 1052 close(h.closed) 1053 }() 1054 1055 state := h.ctx.State.Get() 1056 engine, ok := h.engineManager.Get(state.Type).Get(state.State) 1057 if !ok { 1058 h.ctx.Log.Error("failed fetching current engine during shutdown", 1059 zap.Stringer("type", state.Type), 1060 zap.Stringer("state", state.State), 1061 ) 1062 return 1063 } 1064 1065 if err := engine.Shutdown(ctx); err != nil { 1066 h.ctx.Log.Error("failed while shutting down the chain", 1067 zap.Error(err), 1068 ) 1069 } 1070 }