github.com/number571/tendermint@v0.34.11-gost/internal/blockchain/v0/reactor.go

package v0

import (
	"fmt"
	"runtime/debug"
	"sync"
	"time"

	bc "github.com/number571/tendermint/internal/blockchain"
	cons "github.com/number571/tendermint/internal/consensus"
	"github.com/number571/tendermint/internal/p2p"
	"github.com/number571/tendermint/libs/log"
	"github.com/number571/tendermint/libs/service"
	tmSync "github.com/number571/tendermint/libs/sync"
	bcproto "github.com/number571/tendermint/proto/tendermint/blockchain"
	sm "github.com/number571/tendermint/state"
	"github.com/number571/tendermint/store"
	"github.com/number571/tendermint/types"
)

var (
	_ service.Service = (*Reactor)(nil)

	// ChannelShims contains a map of ChannelDescriptorShim objects, where each
	// object wraps a reference to a legacy p2p ChannelDescriptor and the
	// corresponding p2p proto.Message the new p2p Channel is responsible for
	// handling.
	//
	// TODO: Remove once p2p refactor is complete.
	// ref: https://github.com/number571/tendermint/issues/5670
	ChannelShims = map[p2p.ChannelID]*p2p.ChannelDescriptorShim{
		BlockchainChannel: {
			MsgType: new(bcproto.Message),
			Descriptor: &p2p.ChannelDescriptor{
				ID:                  byte(BlockchainChannel),
				Priority:            5,
				SendQueueCapacity:   1000,
				RecvBufferCapacity:  1024,
				RecvMessageCapacity: bc.MaxMsgSize,
				MaxSendBytes:        100,
			},
		},
	}
)

const (
	// BlockchainChannel is a channel for blocks and status updates.
	BlockchainChannel = p2p.ChannelID(0x40)

	trySyncIntervalMS = 10

	// ask for best height every 10s
	statusUpdateIntervalSeconds = 10

	// check if we should switch to consensus reactor
	switchToConsensusIntervalSeconds = 1

	// switch to consensus after this duration of inactivity
	syncTimeout = 60 * time.Second
)

type consensusReactor interface {
	// SwitchToConsensus is called when we switch from the blockchain reactor and
	// fast sync to the consensus machine.
	SwitchToConsensus(state sm.State, skipWAL bool)
}

type peerError struct {
	err    error
	peerID types.NodeID
}

func (e peerError) Error() string {
	return fmt.Sprintf("error with peer %v: %s", e.peerID, e.err.Error())
}

// Reactor handles long-term catchup syncing.
type Reactor struct {
	service.BaseService

	// immutable
	initialState sm.State

	blockExec   *sm.BlockExecutor
	store       *store.BlockStore
	pool        *BlockPool
	consReactor consensusReactor
	fastSync    *tmSync.AtomicBool

	blockchainCh *p2p.Channel
	// blockchainOutBridgeCh defines a channel that acts as a bridge between
	// sending Envelope messages that the reactor will consume in
	// processBlockchainCh and receiving messages from the peer updates channel
	// and other goroutines. We do this instead of directly sending on
	// blockchainCh.Out to avoid race conditions in the case where other
	// goroutines send Envelopes directly to the blockchainCh.Out channel, since
	// processBlockchainCh may close the blockchainCh.Out channel at the same
	// time that other goroutines send to blockchainCh.Out.
	blockchainOutBridgeCh chan p2p.Envelope
	peerUpdates           *p2p.PeerUpdates
	closeCh               chan struct{}

	requestsCh <-chan BlockRequest
	errorsCh   <-chan peerError

	// poolWG is used to synchronize the graceful shutdown of the poolRoutine and
	// requestRoutine spawned goroutines when stopping the reactor and before
	// stopping the p2p Channel(s).
	poolWG sync.WaitGroup

	metrics *cons.Metrics

	syncStartTime time.Time
}
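// Illustrative sketch (not part of the original source): the bridge pattern
// behind blockchainOutBridgeCh. Producer goroutines send on the bridge channel,
// which is never closed, and processBlockchainCh alone forwards to (and may
// close) blockchainCh.Out, so no send can race with the close:
//
//	// any producer goroutine, e.g. requestRoutine:
//	r.blockchainOutBridgeCh <- p2p.Envelope{
//		Broadcast: true,
//		Message:   &bcproto.StatusRequest{},
//	}
//
//	// sole owner of blockchainCh.Out, inside processBlockchainCh:
//	case envelope := <-r.blockchainOutBridgeCh:
//		r.blockchainCh.Out <- envelope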
// NewReactor returns a new reactor instance.
func NewReactor(
	logger log.Logger,
	state sm.State,
	blockExec *sm.BlockExecutor,
	store *store.BlockStore,
	consReactor consensusReactor,
	blockchainCh *p2p.Channel,
	peerUpdates *p2p.PeerUpdates,
	fastSync bool,
	metrics *cons.Metrics,
) (*Reactor, error) {
	if state.LastBlockHeight != store.Height() {
		return nil, fmt.Errorf("state (%v) and store (%v) height mismatch", state.LastBlockHeight, store.Height())
	}

	startHeight := store.Height() + 1
	if startHeight == 1 {
		startHeight = state.InitialHeight
	}

	requestsCh := make(chan BlockRequest, maxTotalRequesters)
	errorsCh := make(chan peerError, maxPeerErrBuffer) // NOTE: The capacity should be larger than the peer count.

	r := &Reactor{
		initialState:          state,
		blockExec:             blockExec,
		store:                 store,
		pool:                  NewBlockPool(startHeight, requestsCh, errorsCh),
		consReactor:           consReactor,
		fastSync:              tmSync.NewBool(fastSync),
		requestsCh:            requestsCh,
		errorsCh:              errorsCh,
		blockchainCh:          blockchainCh,
		blockchainOutBridgeCh: make(chan p2p.Envelope),
		peerUpdates:           peerUpdates,
		closeCh:               make(chan struct{}),
		metrics:               metrics,
		syncStartTime:         time.Time{},
	}

	r.BaseService = *service.NewBaseService(logger, "Blockchain", r)
	return r, nil
}

// OnStart starts separate go routines for each p2p Channel and listens for
// envelopes on each. In addition, it also listens for peer updates and handles
// messages on that p2p channel accordingly. The caller must be sure to execute
// OnStop to ensure the outbound p2p Channels are closed.
//
// If fastSync is enabled, we also start the pool and the pool processing
// goroutine. If the pool fails to start, an error is returned.
func (r *Reactor) OnStart() error {
	if r.fastSync.IsSet() {
		if err := r.pool.Start(); err != nil {
			return err
		}

		r.poolWG.Add(1)
		go r.poolRoutine(false)
	}

	go r.processBlockchainCh()
	go r.processPeerUpdates()

	return nil
}

// OnStop stops the reactor by signaling to all spawned goroutines to exit and
// blocking until they all exit.
func (r *Reactor) OnStop() {
	if r.fastSync.IsSet() {
		if err := r.pool.Stop(); err != nil {
			r.Logger.Error("failed to stop pool", "err", err)
		}
	}

	// wait for the poolRoutine and requestRoutine goroutines to gracefully exit
	r.poolWG.Wait()

	// Close closeCh to signal to all spawned goroutines to gracefully exit. All
	// p2p Channels should execute Close().
	close(r.closeCh)

	// Wait for all p2p Channels to be closed before returning. This ensures we
	// can easily reason about synchronization of all p2p Channels and ensure no
	// panics will occur.
	<-r.blockchainCh.Done()
	<-r.peerUpdates.Done()
}
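// Usage sketch (illustrative, not part of the original source): wiring the
// reactor into a node. The fastSync flag and error handling shown here are
// assumptions; Start and Stop are provided by the embedded service.BaseService
// and invoke OnStart and OnStop:
//
//	r, err := NewReactor(logger, state, blockExec, store, consReactor,
//		blockchainCh, peerUpdates, true /* fastSync */, metrics)
//	if err != nil {
//		return err
//	}
//	if err := r.Start(); err != nil {
//		return err
//	}
//	defer func() {
//		if err := r.Stop(); err != nil {
//			logger.Error("failed to stop reactor", "err", err)
//		}
//	}()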
// respondToPeer loads a block and sends it to the requesting peer, if we have
// it. Otherwise, we respond saying we do not have it.
func (r *Reactor) respondToPeer(msg *bcproto.BlockRequest, peerID types.NodeID) {
	block := r.store.LoadBlock(msg.Height)
	if block != nil {
		blockProto, err := block.ToProto()
		if err != nil {
			r.Logger.Error("failed to convert block to protobuf", "err", err)
			return
		}

		r.blockchainCh.Out <- p2p.Envelope{
			To:      peerID,
			Message: &bcproto.BlockResponse{Block: blockProto},
		}

		return
	}

	r.Logger.Info("peer requesting a block we do not have", "peer", peerID, "height", msg.Height)
	r.blockchainCh.Out <- p2p.Envelope{
		To:      peerID,
		Message: &bcproto.NoBlockResponse{Height: msg.Height},
	}
}

// handleBlockchainMessage handles envelopes sent from peers on the
// BlockchainChannel. It returns an error only if the Envelope.Message is unknown
// for this channel. This should never be called outside of handleMessage.
func (r *Reactor) handleBlockchainMessage(envelope p2p.Envelope) error {
	logger := r.Logger.With("peer", envelope.From)

	switch msg := envelope.Message.(type) {
	case *bcproto.BlockRequest:
		r.respondToPeer(msg, envelope.From)

	case *bcproto.BlockResponse:
		block, err := types.BlockFromProto(msg.Block)
		if err != nil {
			logger.Error("failed to convert block from proto", "err", err)
			return err
		}

		r.pool.AddBlock(envelope.From, block, block.Size())

	case *bcproto.StatusRequest:
		r.blockchainCh.Out <- p2p.Envelope{
			To: envelope.From,
			Message: &bcproto.StatusResponse{
				Height: r.store.Height(),
				Base:   r.store.Base(),
			},
		}

	case *bcproto.StatusResponse:
		r.pool.SetPeerRange(envelope.From, msg.Base, msg.Height)

	case *bcproto.NoBlockResponse:
		logger.Debug("peer does not have the requested block", "height", msg.Height)

	default:
		return fmt.Errorf("received unknown message: %T", msg)
	}

	return nil
}

// handleMessage handles an Envelope sent from a peer on a specific p2p Channel.
// It will handle errors and any possible panics gracefully. A caller can handle
// any error returned by sending a PeerError on the respective channel.
func (r *Reactor) handleMessage(chID p2p.ChannelID, envelope p2p.Envelope) (err error) {
	defer func() {
		if e := recover(); e != nil {
			err = fmt.Errorf("panic in processing message: %v", e)
			r.Logger.Error(
				"recovering from processing message panic",
				"err", err,
				"stack", string(debug.Stack()),
			)
		}
	}()

	r.Logger.Debug("received message", "message", envelope.Message, "peer", envelope.From)

	switch chID {
	case BlockchainChannel:
		err = r.handleBlockchainMessage(envelope)

	default:
		err = fmt.Errorf("unknown channel ID (%d) for envelope (%v)", chID, envelope)
	}

	return err
}
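// Protocol sketch (illustrative, not part of the original source): the message
// flow handled above is a simple request/response exchange over the
// BlockchainChannel. A catching-up peer drives it roughly as follows:
//
//	requester                         responder
//	BlockRequest{Height: 42}  ----->
//	                          <-----  BlockResponse{Block: b}       (block found)
//	                          <-----  NoBlockResponse{Height: 42}   (block missing)
//	StatusRequest{}           ----->
//	                          <-----  StatusResponse{Base, Height}  (peer's range)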
// processBlockchainCh initiates a blocking process where we listen for and
// handle envelopes on the BlockchainChannel and blockchainOutBridgeCh. Any error
// encountered during message execution will result in a PeerError being sent on
// the BlockchainChannel. When the reactor is stopped, we will catch the signal
// and close the p2p Channel gracefully.
func (r *Reactor) processBlockchainCh() {
	defer r.blockchainCh.Close()

	for {
		select {
		case envelope := <-r.blockchainCh.In:
			if err := r.handleMessage(r.blockchainCh.ID, envelope); err != nil {
				r.Logger.Error("failed to process message", "ch_id", r.blockchainCh.ID, "envelope", envelope, "err", err)
				r.blockchainCh.Error <- p2p.PeerError{
					NodeID: envelope.From,
					Err:    err,
				}
			}

		case envelope := <-r.blockchainOutBridgeCh:
			r.blockchainCh.Out <- envelope

		case <-r.closeCh:
			r.Logger.Debug("stopped listening on blockchain channel; closing...")
			return
		}
	}
}

// processPeerUpdate processes a PeerUpdate.
func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) {
	r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status)

	// XXX: Pool#RedoRequest can sometimes give us an empty peer.
	if len(peerUpdate.NodeID) == 0 {
		return
	}

	switch peerUpdate.Status {
	case p2p.PeerStatusUp:
		// send a status update to the newly added peer
		r.blockchainOutBridgeCh <- p2p.Envelope{
			To: peerUpdate.NodeID,
			Message: &bcproto.StatusResponse{
				Base:   r.store.Base(),
				Height: r.store.Height(),
			},
		}

	case p2p.PeerStatusDown:
		r.pool.RemovePeer(peerUpdate.NodeID)
	}
}

// processPeerUpdates initiates a blocking process where we listen for and handle
// PeerUpdate messages. When the reactor is stopped, we will catch the signal and
// close the p2p PeerUpdatesCh gracefully.
func (r *Reactor) processPeerUpdates() {
	defer r.peerUpdates.Close()

	for {
		select {
		case peerUpdate := <-r.peerUpdates.Updates():
			r.processPeerUpdate(peerUpdate)

		case <-r.closeCh:
			r.Logger.Debug("stopped listening on peer updates channel; closing...")
			return
		}
	}
}

// SwitchToFastSync is called by the state sync reactor when switching to fast
// sync.
func (r *Reactor) SwitchToFastSync(state sm.State) error {
	r.fastSync.Set()
	r.initialState = state
	r.pool.height = state.LastBlockHeight + 1

	if err := r.pool.Start(); err != nil {
		return err
	}

	r.syncStartTime = time.Now()

	r.poolWG.Add(1)
	go r.poolRoutine(true)

	return nil
}

// requestRoutine forwards block requests and peer errors coming from the pool,
// and periodically broadcasts a StatusRequest to all peers.
func (r *Reactor) requestRoutine() {
	statusUpdateTicker := time.NewTicker(statusUpdateIntervalSeconds * time.Second)
	defer statusUpdateTicker.Stop()

	r.poolWG.Add(1)
	defer r.poolWG.Done()

	for {
		select {
		case <-r.closeCh:
			return

		case <-r.pool.Quit():
			return

		case request := <-r.requestsCh:
			r.blockchainOutBridgeCh <- p2p.Envelope{
				To:      request.PeerID,
				Message: &bcproto.BlockRequest{Height: request.Height},
			}

		case pErr := <-r.errorsCh:
			r.blockchainCh.Error <- p2p.PeerError{
				NodeID: pErr.peerID,
				Err:    pErr.err,
			}

		case <-statusUpdateTicker.C:
			r.poolWG.Add(1)

			go func() {
				defer r.poolWG.Done()

				r.blockchainOutBridgeCh <- p2p.Envelope{
					Broadcast: true,
					Message:   &bcproto.StatusRequest{},
				}
			}()
		}
	}
}
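// Verification sketch (illustrative, not part of the original source): in
// poolRoutine below, block N is committed only after checking the LastCommit
// carried by block N+1, which is why PeekTwoBlocks must yield both blocks
// before the first can be applied:
//
//	first, second := r.pool.PeekTwoBlocks() // blocks N and N+1
//	firstParts := first.MakePartSet(types.BlockPartSizeBytes)
//	firstID := types.BlockID{Hash: first.Hash(), PartSetHeader: firstParts.Header()}
//	// +2/3 of the validator set must have signed block N:
//	err := state.Validators.VerifyCommitLight(chainID, firstID, first.Height, second.LastCommit)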
// poolRoutine handles messages from the block pool telling the reactor what to
// do.
//
// NOTE: Don't sleep in the FOR_LOOP or otherwise slow it down!
func (r *Reactor) poolRoutine(stateSynced bool) {
	var (
		trySyncTicker           = time.NewTicker(trySyncIntervalMS * time.Millisecond)
		switchToConsensusTicker = time.NewTicker(switchToConsensusIntervalSeconds * time.Second)

		blocksSynced = uint64(0)

		chainID = r.initialState.ChainID
		state   = r.initialState

		lastHundred = time.Now()
		lastRate    = 0.0

		didProcessCh = make(chan struct{}, 1)
	)

	defer trySyncTicker.Stop()
	defer switchToConsensusTicker.Stop()

	go r.requestRoutine()

	defer r.poolWG.Done()

FOR_LOOP:
	for {
		select {
		case <-switchToConsensusTicker.C:
			var (
				height, numPending, lenRequesters = r.pool.GetStatus()
				lastAdvance                       = r.pool.LastAdvance()
			)

			r.Logger.Debug(
				"consensus ticker",
				"num_pending", numPending,
				"total", lenRequesters,
				"height", height,
			)

			switch {
			case r.pool.IsCaughtUp():
				r.Logger.Info("switching to consensus reactor", "height", height)

			case time.Since(lastAdvance) > syncTimeout:
				r.Logger.Error("no progress since last advance", "last_advance", lastAdvance)

			default:
				r.Logger.Info(
					"not caught up yet",
					"height", height,
					"max_peer_height", r.pool.MaxPeerHeight(),
					"timeout_in", syncTimeout-time.Since(lastAdvance),
				)
				continue
			}

			if err := r.pool.Stop(); err != nil {
				r.Logger.Error("failed to stop pool", "err", err)
			}

			r.fastSync.UnSet()

			if r.consReactor != nil {
				r.consReactor.SwitchToConsensus(state, blocksSynced > 0 || stateSynced)
			}

			break FOR_LOOP

		case <-trySyncTicker.C:
			select {
			case didProcessCh <- struct{}{}:
			default:
			}

		case <-didProcessCh:
			// NOTE: It is a subtle mistake to process more than a single block at a
			// time (e.g. 10) here, because we only send one BlockRequest per loop
			// iteration. The ratio mismatch can result in starving of blocks, i.e. a
			// sudden burst of requests and responses, and repeat. Consequently, it is
			// better to split these routines rather than coupling them as it is
			// written here.
			//
			// TODO: Uncouple from request routine.

			// see if there are any blocks to sync
			first, second := r.pool.PeekTwoBlocks()
			if first == nil || second == nil {
				// we need both to sync the first block
				continue FOR_LOOP
			} else {
				// try again quickly next loop
				didProcessCh <- struct{}{}
			}

			var (
				firstParts         = first.MakePartSet(types.BlockPartSizeBytes)
				firstPartSetHeader = firstParts.Header()
				firstID            = types.BlockID{Hash: first.Hash(), PartSetHeader: firstPartSetHeader}
			)

			// Finally, verify the first block using the second's commit.
			//
			// NOTE: We can probably make this more efficient, but note that calling
			// first.Hash() doesn't verify the tx contents, so MakePartSet() is
			// currently necessary.
			err := state.Validators.VerifyCommitLight(chainID, firstID, first.Height, second.LastCommit)
			if err != nil {
				err = fmt.Errorf("invalid last commit: %w", err)
				r.Logger.Error(
					err.Error(),
					"last_commit", second.LastCommit,
					"block_id", firstID,
					"height", first.Height,
				)

				// NOTE: We've already removed the peer's request, but we still need
				// to clean up the rest.
				peerID := r.pool.RedoRequest(first.Height)
				r.blockchainCh.Error <- p2p.PeerError{
					NodeID: peerID,
					Err:    err,
				}

				peerID2 := r.pool.RedoRequest(second.Height)
				if peerID2 != peerID {
					r.blockchainCh.Error <- p2p.PeerError{
						NodeID: peerID2,
						Err:    err,
					}
				}

				continue FOR_LOOP
			} else {
				r.pool.PopRequest()

				// TODO: batch saves so we do not persist to disk every block
				r.store.SaveBlock(first, firstParts, second.LastCommit)

				var err error

				// TODO: Same thing for app - but we would need a way to get the hash
				// without persisting the state.
				state, err = r.blockExec.ApplyBlock(state, firstID, first)
				if err != nil {
					// TODO: This is bad, are we zombie?
					panic(fmt.Sprintf("failed to process committed block (%d:%X): %v", first.Height, first.Hash(), err))
				}

				r.metrics.RecordConsMetrics(first)

				blocksSynced++

				if blocksSynced%100 == 0 {
					lastRate = 0.9*lastRate + 0.1*(100/time.Since(lastHundred).Seconds())
					r.Logger.Info(
						"fast sync rate",
						"height", r.pool.height,
						"max_peer_height", r.pool.MaxPeerHeight(),
						"blocks/s", lastRate,
					)

					lastHundred = time.Now()
				}
			}

			continue FOR_LOOP

		case <-r.closeCh:
			break FOR_LOOP
		}
	}
}

// GetMaxPeerBlockHeight returns the highest block height reported by any
// connected peer.
func (r *Reactor) GetMaxPeerBlockHeight() int64 {
	return r.pool.MaxPeerHeight()
}

// GetTotalSyncedTime returns how long fast sync has been running, or zero if
// fast sync is not active or has not started.
func (r *Reactor) GetTotalSyncedTime() time.Duration {
	if !r.fastSync.IsSet() || r.syncStartTime.IsZero() {
		return time.Duration(0)
	}
	return time.Since(r.syncStartTime)
}

// GetRemainingSyncTime estimates the time left to finish fast sync, based on
// the number of blocks still to fetch and the most recent sync rate.
func (r *Reactor) GetRemainingSyncTime() time.Duration {
	if !r.fastSync.IsSet() {
		return time.Duration(0)
	}

	targetSyncs := r.pool.targetSyncBlocks()
	currentSyncs := r.store.Height() - r.pool.startHeight + 1
	lastSyncRate := r.pool.getLastSyncRate()
	if currentSyncs < 0 || lastSyncRate < 0.001 {
		return time.Duration(0)
	}

	remain := float64(targetSyncs-currentSyncs) / lastSyncRate

	return time.Duration(int64(remain * float64(time.Second)))
}
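// Worked example (illustrative, not part of the original source) for
// GetRemainingSyncTime above, with assumed numbers: if the pool's target is
// 1,000 blocks, 400 have been synced so far, and the last measured sync rate
// is 50 blocks/s, the estimate is (1000-400)/50 = 12 seconds:
//
//	remain := float64(1000-400) / 50.0                         // 12.0
//	eta := time.Duration(int64(remain * float64(time.Second))) // 12s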