github.com/okex/exchain@v1.8.0/libs/tendermint/blockchain/v2/scheduler.go

package v2

import (
	"bytes"
	"fmt"
	"math"
	"sort"
	"time"

	"github.com/okex/exchain/libs/tendermint/p2p"
	"github.com/okex/exchain/libs/tendermint/types"
)

// Events generated by the scheduler:
// all blocks have been processed
type scFinishedEv struct {
	priorityNormal
	reason string
}

// send a blockRequest message
type scBlockRequest struct {
	priorityNormal
	peerID p2p.ID
	height int64
}

// a block has been received and validated by the scheduler
type scBlockReceived struct {
	priorityNormal
	peerID p2p.ID
	block  *types.Block
}

// scheduler detected a peer error
type scPeerError struct {
	priorityHigh
	peerID p2p.ID
	reason error
}

func (e scPeerError) String() string {
	return fmt.Sprintf("scPeerError - peerID %s, err %s", e.peerID, e.reason)
}

// scheduler removed a set of peers (timed out or slow peers)
type scPeersPruned struct {
	priorityHigh
	peers []p2p.ID
}

// XXX: make this fatal?
// scheduler encountered a fatal error
type scSchedulerFail struct {
	priorityHigh
	reason error
}

type blockState int

const (
	blockStateUnknown   blockState = iota + 1 // no known peer has this block
	blockStateNew                             // indicates that a peer has reported having this block
	blockStatePending                         // indicates that this block has been requested from a peer
	blockStateReceived                        // indicates that this block has been received from a peer
	blockStateProcessed                       // indicates that this block has been applied
)

func (e blockState) String() string {
	switch e {
	case blockStateUnknown:
		return "Unknown"
	case blockStateNew:
		return "New"
	case blockStatePending:
		return "Pending"
	case blockStateReceived:
		return "Received"
	case blockStateProcessed:
		return "Processed"
	default:
		return fmt.Sprintf("invalid blockState: %d", e)
	}
}

type peerState int

const (
	peerStateNew peerState = iota + 1
	peerStateReady
	peerStateRemoved
)

func (e peerState) String() string {
	switch e {
	case peerStateNew:
		return "New"
	case peerStateReady:
		return "Ready"
	case peerStateRemoved:
		return "Removed"
	default:
		panic(fmt.Sprintf("unknown peerState: %d", e))
	}
}

type scPeer struct {
	peerID p2p.ID

	// initialized as New when the peer is added, updated to Ready when a
	// status update is received, and updated to Removed when the peer is removed
	state peerState

	base        int64 // updated when a statusResponse is received
	height      int64 // updated when a statusResponse is received
	lastTouched time.Time
	lastRate    int64 // last receive rate in bytes

}

func (p scPeer) String() string {
	return fmt.Sprintf("{state %v, base %d, height %d, lastTouched %v, lastRate %d, id %v}",
		p.state, p.base, p.height, p.lastTouched, p.lastRate, p.peerID)
}

func newScPeer(peerID p2p.ID) *scPeer {
	return &scPeer{
		peerID:      peerID,
		state:       peerStateNew,
		base:        -1,
		height:      -1,
		lastTouched: time.Time{},
	}
}
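// A peer moves New -> Ready -> Removed: it becomes Ready once a status
// response fixes its base and height, and Removed once it errors out or is
// pruned. A minimal sketch of that lifecycle (illustrative only; the peer id
// "p1" and the heights are hypothetical):
//
//	sc := newScheduler(0, time.Now())
//	_ = sc.addPeer("p1")              // state: peerStateNew
//	_ = sc.setPeerRange("p1", 0, 100) // state: peerStateReady
//	_ = sc.removePeer("p1")           // state: peerStateRemoved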
// The scheduler keeps track of the state of each block and each peer. The
// scheduler will attempt to schedule new block requests with `trySchedule`
// events and remove slow peers with `tryPrune` events.
type scheduler struct {
	initHeight int64

	// next block that needs to be processed. All blocks with smaller height are
	// in Processed state.
	height int64

	// lastAdvance tracks the last time a block execution happened.
	// syncTimeout is the maximum time the scheduler waits to advance in the fast
	// sync process before finishing. This covers the cases where there are no
	// peers or all peers have a lower height.
	lastAdvance time.Time
	syncTimeout time.Duration

	// a map of peerID to scheduler specific peer struct `scPeer` used to keep
	// track of peer specific state
	peers       map[p2p.ID]*scPeer
	peerTimeout time.Duration // maximum response time from a peer, otherwise prune
	minRecvRate int64         // minimum receive rate from a peer, otherwise prune

	// the maximum number of blocks that should be New, Received or Pending at any
	// point in time. This is used to enforce a limit on the blockStates map.
	targetPending int
	// a list of blocks to be scheduled (New), Pending or Received. Its length should
	// be smaller than targetPending.
	blockStates map[int64]blockState

	// a map of heights to the peer we are waiting for a response from
	pendingBlocks map[int64]p2p.ID

	// the time at which a block was put in blockStatePending
	pendingTime map[int64]time.Time

	// a map of heights to the peers that put the block in blockStateReceived
	receivedBlocks map[int64]p2p.ID
}

func (sc scheduler) String() string {
	return fmt.Sprintf("ih: %d, bst: %v, peers: %v, pblks: %v, ptm %v, rblks: %v",
		sc.initHeight, sc.blockStates, sc.peers, sc.pendingBlocks, sc.pendingTime, sc.receivedBlocks)
}

func newScheduler(initHeight int64, startTime time.Time) *scheduler {
	sc := scheduler{
		initHeight:     initHeight,
		lastAdvance:    startTime,
		syncTimeout:    60 * time.Second,
		height:         initHeight + 1,
		blockStates:    make(map[int64]blockState),
		peers:          make(map[p2p.ID]*scPeer),
		pendingBlocks:  make(map[int64]p2p.ID),
		pendingTime:    make(map[int64]time.Time),
		receivedBlocks: make(map[int64]p2p.ID),
		targetPending:  10,               // TODO - pass as param
		peerTimeout:    15 * time.Second, // TODO - pass as param
		minRecvRate:    0,                // int64(7680), TODO - pass as param
	}

	return &sc
}
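// A minimal construction sketch (illustrative; the initial height 100 is
// hypothetical): the scheduler starts requesting from initHeight+1 and
// finishes if no block is executed within syncTimeout.
//
//	sc := newScheduler(100, time.Now())
//	// sc.height == 101; blocks 101, 102, ... will be scheduled as peers
//	// report their heights via bcStatusResponse events.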
func (sc *scheduler) addPeer(peerID p2p.ID) error {
	if _, ok := sc.peers[peerID]; ok {
		// In the future we should be able to add a previously removed peer
		return fmt.Errorf("cannot add duplicate peer %s", peerID)
	}
	sc.peers[peerID] = newScPeer(peerID)
	return nil
}

func (sc *scheduler) touchPeer(peerID p2p.ID, time time.Time) error {
	peer, ok := sc.peers[peerID]
	if !ok {
		return fmt.Errorf("couldn't find peer %s", peerID)
	}

	if peer.state != peerStateReady {
		return fmt.Errorf("tried to touch peer in state %s, must be Ready", peer.state)
	}

	peer.lastTouched = time

	return nil
}

func (sc *scheduler) removePeer(peerID p2p.ID) error {
	peer, ok := sc.peers[peerID]
	if !ok {
		return fmt.Errorf("couldn't find peer %s", peerID)
	}

	if peer.state == peerStateRemoved {
		return fmt.Errorf("tried to remove peer %s in peerStateRemoved", peerID)
	}

	// requeue any blocks this peer was supposed to deliver
	for height, pendingPeerID := range sc.pendingBlocks {
		if pendingPeerID == peerID {
			sc.setStateAtHeight(height, blockStateNew)
			delete(sc.pendingTime, height)
			delete(sc.pendingBlocks, height)
		}
	}

	for height, rcvPeerID := range sc.receivedBlocks {
		if rcvPeerID == peerID {
			sc.setStateAtHeight(height, blockStateNew)
			delete(sc.receivedBlocks, height)
		}
	}

	// remove the blocks from blockStates if the peer removal causes the max peer height to be lower.
	peer.state = peerStateRemoved
	maxPeerHeight := int64(0)
	for _, otherPeer := range sc.peers {
		if otherPeer.state != peerStateReady {
			continue
		}
		if otherPeer.peerID != peer.peerID && otherPeer.height > maxPeerHeight {
			maxPeerHeight = otherPeer.height
		}
	}
	for h := range sc.blockStates {
		if h > maxPeerHeight {
			delete(sc.blockStates, h)
		}
	}

	return nil
}

// check if the block pool is running low and add new blocks in New state to be requested.
// This function is called when there is an increase in the maximum peer height or when
// blocks are processed.
func (sc *scheduler) addNewBlocks() {
	if len(sc.blockStates) >= sc.targetPending {
		return
	}

	for i := sc.height; i < int64(sc.targetPending)+sc.height; i++ {
		if i > sc.maxHeight() {
			break
		}
		if sc.getStateAtHeight(i) == blockStateUnknown {
			sc.setStateAtHeight(i, blockStateNew)
		}
	}
}
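// The window logic above can be read as: starting at sc.height, mark up to
// targetPending consecutive heights New, but never beyond the best peer
// height. A worked example with hypothetical values:
//
//	// sc.height == 5, targetPending == 10, sc.maxHeight() == 8
//	// before: blockStates == {}
//	sc.addNewBlocks()
//	// after:  blockStates == {5: New, 6: New, 7: New, 8: New}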
func (sc *scheduler) setPeerRange(peerID p2p.ID, base int64, height int64) error {
	peer, ok := sc.peers[peerID]
	if !ok {
		return fmt.Errorf("cannot find peer %s", peerID)
	}

	if peer.state == peerStateRemoved {
		return fmt.Errorf("cannot set peer height for a peer in peerStateRemoved")
	}

	if height < peer.height {
		_ = sc.removePeer(peerID)
		return fmt.Errorf("cannot move peer height lower. from %d to %d", peer.height, height)
	}

	if base > height {
		return fmt.Errorf("cannot set peer base higher than its height")
	}

	peer.base = base
	peer.height = height
	peer.state = peerStateReady

	sc.addNewBlocks()
	return nil
}

func (sc *scheduler) getStateAtHeight(height int64) blockState {
	if height < sc.height {
		return blockStateProcessed
	} else if state, ok := sc.blockStates[height]; ok {
		return state
	} else {
		return blockStateUnknown
	}
}

// getPeersWithHeight returns the Ready peers whose range [base, height] covers
// the given height.
func (sc *scheduler) getPeersWithHeight(height int64) []p2p.ID {
	peers := make([]p2p.ID, 0)
	for _, peer := range sc.peers {
		if peer.state != peerStateReady {
			continue
		}
		if peer.base <= height && peer.height >= height {
			peers = append(peers, peer.peerID)
		}
	}
	return peers
}

func (sc *scheduler) prunablePeers(peerTimeout time.Duration, minRecvRate int64, now time.Time) []p2p.ID {
	prunable := make([]p2p.ID, 0)
	for peerID, peer := range sc.peers {
		if peer.state != peerStateReady {
			continue
		}
		if now.Sub(peer.lastTouched) > peerTimeout || peer.lastRate < minRecvRate {
			prunable = append(prunable, peerID)
		}
	}
	// Tests for handleTryPrunePeer() may fail without sort due to range non-determinism
	sort.Sort(PeerByID(prunable))
	return prunable
}

func (sc *scheduler) setStateAtHeight(height int64, state blockState) {
	sc.blockStates[height] = state
}

func (sc *scheduler) markReceived(peerID p2p.ID, height int64, size int64, now time.Time) error {
	peer, ok := sc.peers[peerID]
	if !ok {
		return fmt.Errorf("couldn't find peer %s", peerID)
	}

	if peer.state != peerStateReady {
		return fmt.Errorf("cannot receive blocks from not ready peer %s", peerID)
	}

	if state := sc.getStateAtHeight(height); state != blockStatePending || sc.pendingBlocks[height] != peerID {
		return fmt.Errorf("received block %d from peer %s without being requested", height, peerID)
	}

	pendingTime, ok := sc.pendingTime[height]
	if !ok || now.Sub(pendingTime) <= 0 {
		return fmt.Errorf("clock error: block %d received at %s but requested at %s",
			height, now, pendingTime)
	}

	peer.lastRate = size / now.Sub(pendingTime).Nanoseconds()

	sc.setStateAtHeight(height, blockStateReceived)
	delete(sc.pendingBlocks, height)
	delete(sc.pendingTime, height)

	sc.receivedBlocks[height] = peerID

	return nil
}

func (sc *scheduler) markPending(peerID p2p.ID, height int64, time time.Time) error {
	state := sc.getStateAtHeight(height)
	if state != blockStateNew {
		return fmt.Errorf("block %d should be in blockStateNew but is %s", height, state)
	}

	peer, ok := sc.peers[peerID]
	if !ok {
		return fmt.Errorf("cannot find peer %s", peerID)
	}

	if peer.state != peerStateReady {
		return fmt.Errorf("cannot schedule %d from %s in %s", height, peerID, peer.state)
	}

	if height > peer.height {
		return fmt.Errorf("cannot request height %d from peer %s that is at height %d",
			height, peerID, peer.height)
	}

	if height < peer.base {
		return fmt.Errorf("cannot request height %d for peer %s with base %d",
			height, peerID, peer.base)
	}

	sc.setStateAtHeight(height, blockStatePending)
	sc.pendingBlocks[height] = peerID
	sc.pendingTime[height] = time

	return nil
}
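// A block's happy path through the maps above is markPending -> markReceived
// -> markProcessed. A condensed sketch (illustrative; peer "p1", height 5,
// the block size and the timestamps t0 < t1 are hypothetical, and errors are
// ignored for brevity):
//
//	_ = sc.markPending("p1", 5, t0)        // Pending; pendingBlocks[5] == "p1"
//	_ = sc.markReceived("p1", 5, 1024, t1) // Received; lastRate updated
//	_ = sc.markProcessed(5)                // sc.height == 6; maps cleaned up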
func (sc *scheduler) markProcessed(height int64) error {
	// It is possible that a peer error or timeout is handled after the processor
	// has processed the block but before the scheduler received this event, so
	// when the pcBlockProcessed event is received, the block may have been
	// requested again => don't check the block state.
	sc.lastAdvance = time.Now()
	sc.height = height + 1
	delete(sc.pendingBlocks, height)
	delete(sc.pendingTime, height)
	delete(sc.receivedBlocks, height)
	delete(sc.blockStates, height)
	sc.addNewBlocks()

	return nil
}

func (sc *scheduler) allBlocksProcessed() bool {
	if len(sc.peers) == 0 {
		return false
	}
	return sc.height >= sc.maxHeight()
}

// maxHeight returns the maximum height of all Ready peers, or the height of the
// last processed block (sc.height - 1) if that is higher.
func (sc *scheduler) maxHeight() int64 {
	max := sc.height - 1
	for _, peer := range sc.peers {
		if peer.state != peerStateReady {
			continue
		}
		if max < peer.height {
			max = peer.height
		}
	}
	return max
}

// nextHeightToSchedule returns the lowest height in sc.blockStates with state
// blockStateNew, or -1 if there are no new blocks.
func (sc *scheduler) nextHeightToSchedule() int64 {
	var min int64 = math.MaxInt64
	for height, state := range sc.blockStates {
		if state == blockStateNew && height < min {
			min = height
		}
	}
	if min == math.MaxInt64 {
		min = -1
	}
	return min
}

func (sc *scheduler) pendingFrom(peerID p2p.ID) []int64 {
	var heights []int64
	for height, pendingPeerID := range sc.pendingBlocks {
		if pendingPeerID == peerID {
			heights = append(heights, height)
		}
	}
	return heights
}

// selectPeer picks, among the peers that can serve the given height, one with
// the smallest number of pending requests, breaking ties by lowest peerID.
func (sc *scheduler) selectPeer(height int64) (p2p.ID, error) {
	peers := sc.getPeersWithHeight(height)
	if len(peers) == 0 {
		return "", fmt.Errorf("cannot find peer for height %d", height)
	}

	// create a map from number of pending requests to a list
	// of peers having that number of pending requests.
	pendingFrom := make(map[int][]p2p.ID)
	for _, peerID := range peers {
		numPending := len(sc.pendingFrom(peerID))
		pendingFrom[numPending] = append(pendingFrom[numPending], peerID)
	}

	// find the set of peers with the minimum number of pending requests.
	var minPending int64 = math.MaxInt64
	for mp := range pendingFrom {
		if int64(mp) < minPending {
			minPending = int64(mp)
		}
	}

	sort.Sort(PeerByID(pendingFrom[int(minPending)]))
	return pendingFrom[int(minPending)][0], nil
}

// PeerByID is a list of peers sorted by peerID; it implements sort.Interface.
type PeerByID []p2p.ID

func (peers PeerByID) Len() int {
	return len(peers)
}

func (peers PeerByID) Less(i, j int) bool {
	return bytes.Compare([]byte(peers[i]), []byte(peers[j])) == -1
}

func (peers PeerByID) Swap(i, j int) {
	peers[i], peers[j] = peers[j], peers[i]
}
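// PeerByID satisfying sort.Interface is what lets selectPeer and
// prunablePeers produce deterministic orderings. For instance:
//
//	ids := PeerByID{"c", "a", "b"}
//	sort.Sort(ids) // ids is now ["a", "b", "c"], compared as raw bytes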
// Handlers

// This handler gets the block, performs some validation and then passes it on
// to the processor.
func (sc *scheduler) handleBlockResponse(event bcBlockResponse) (Event, error) {
	err := sc.touchPeer(event.peerID, event.time)
	if err != nil {
		return scPeerError{peerID: event.peerID, reason: err}, nil
	}

	err = sc.markReceived(event.peerID, event.block.Height, event.size, event.time)
	if err != nil {
		_ = sc.removePeer(event.peerID)
		return scPeerError{peerID: event.peerID, reason: err}, nil
	}

	return scBlockReceived{peerID: event.peerID, block: event.block}, nil
}

func (sc *scheduler) handleNoBlockResponse(event bcNoBlockResponse) (Event, error) {
	if len(sc.peers) == 0 {
		return noOp, nil
	}

	peer, ok := sc.peers[event.peerID]
	if !ok || peer.state == peerStateRemoved {
		return noOp, nil
	}
	// The peer may have just been removed due to errors, low speed or timeouts.
	_ = sc.removePeer(event.peerID)

	return scPeerError{peerID: event.peerID,
		reason: fmt.Errorf("peer %v with base %d height %d claims no block for %d",
			event.peerID, peer.base, peer.height, event.height)}, nil
}

func (sc *scheduler) handleBlockProcessed(event pcBlockProcessed) (Event, error) {
	if event.height != sc.height {
		panic(fmt.Sprintf("processed height %d but expected height %d", event.height, sc.height))
	}
	err := sc.markProcessed(event.height)
	if err != nil {
		// It is possible that a peer error or timeout is handled after the processor
		// has processed the block but before the scheduler received this event,
		// so when the pcBlockProcessed event is received the block may have been
		// requested again.
		return scSchedulerFail{reason: err}, nil
	}

	if sc.allBlocksProcessed() {
		return scFinishedEv{reason: "processed all blocks"}, nil
	}

	return noOp, nil
}

// Handles an error from the processor. The processor has already cleaned the blocks
// from the peers included in this event. Just attempt to remove the peers.
func (sc *scheduler) handleBlockProcessError(event pcBlockVerificationFailure) (Event, error) {
	if len(sc.peers) == 0 {
		return noOp, nil
	}
	// The peers may have just been removed due to errors, low speed or timeouts.
	_ = sc.removePeer(event.firstPeerID)
	if event.firstPeerID != event.secondPeerID {
		_ = sc.removePeer(event.secondPeerID)
	}

	if sc.allBlocksProcessed() {
		return scFinishedEv{reason: "error on last block"}, nil
	}

	return noOp, nil
}

func (sc *scheduler) handleAddNewPeer(event bcAddNewPeer) (Event, error) {
	err := sc.addPeer(event.peerID)
	if err != nil {
		return scSchedulerFail{reason: err}, nil
	}
	return noOp, nil
}

func (sc *scheduler) handleRemovePeer(event bcRemovePeer) (Event, error) {
	err := sc.removePeer(event.peerID)
	if err != nil {
		// XXX - It is possible that removePeer fails here for legitimate reasons,
		// for example if a peer timeout or error was handled just before this.
		return scSchedulerFail{reason: err}, nil
	}
	if sc.allBlocksProcessed() {
		return scFinishedEv{reason: "removed peer"}, nil
	}
	return noOp, nil
}

func (sc *scheduler) handleTryPrunePeer(event rTryPrunePeer) (Event, error) {
	// Check the behavior of the peer responsible for delivering the block at sc.height.
	timeHeightAsked, ok := sc.pendingTime[sc.height]
	if ok && time.Since(timeHeightAsked) > sc.peerTimeout {
		// A request was sent to a peer for the block at sc.height but a response was not
		// received from that peer within sc.peerTimeout. Remove the peer. This ensures
		// that a peer will be timed out even if it sends blocks at higher heights but
		// prevents progress by not sending the block at the current height.
		_ = sc.removePeer(sc.pendingBlocks[sc.height])
	}

	prunablePeers := sc.prunablePeers(sc.peerTimeout, sc.minRecvRate, event.time)
	if len(prunablePeers) == 0 {
		return noOp, nil
	}
	for _, peerID := range prunablePeers {
		err := sc.removePeer(peerID)
		if err != nil {
			// Should never happen as prunablePeers() returns only existing peers in Ready state.
			panic("scheduler data corruption")
		}
	}

	// If all blocks are processed we should finish.
	if sc.allBlocksProcessed() {
		return scFinishedEv{reason: "after try prune"}, nil
	}

	return scPeersPruned{peers: prunablePeers}, nil
}

func (sc *scheduler) handleTrySchedule(event rTrySchedule) (Event, error) {
	if time.Since(sc.lastAdvance) > sc.syncTimeout {
		return scFinishedEv{reason: "timeout, no advance"}, nil
	}

	nextHeight := sc.nextHeightToSchedule()
	if nextHeight == -1 {
		return noOp, nil
	}

	bestPeerID, err := sc.selectPeer(nextHeight)
	if err != nil {
		return scSchedulerFail{reason: err}, nil
	}
	if err := sc.markPending(bestPeerID, nextHeight, event.time); err != nil {
		return scSchedulerFail{reason: err}, nil // XXX: peerError might be more appropriate
	}
	return scBlockRequest{peerID: bestPeerID, height: nextHeight}, nil
}

func (sc *scheduler) handleStatusResponse(event bcStatusResponse) (Event, error) {
	err := sc.setPeerRange(event.peerID, event.base, event.height)
	if err != nil {
		return scPeerError{peerID: event.peerID, reason: err}, nil
	}
	return noOp, nil
}
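// Note on the handler contract: handlers never return a Go error for peer or
// scheduling problems; failures are wrapped into events (scPeerError,
// scSchedulerFail, scFinishedEv) so the surrounding routine can react. A
// sketch of how a caller might distinguish them (the event types are from
// this file; the handling itself is hypothetical):
//
//	switch ev := nextEvent.(type) {
//	case scPeerError:
//		// report ev.peerID to the switch for disconnection
//	case scFinishedEv:
//		// stop fast sync; ev.reason says why
//	}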
func (sc *scheduler) handle(event Event) (Event, error) {
	switch event := event.(type) {
	case bcStatusResponse:
		return sc.handleStatusResponse(event)
	case bcBlockResponse:
		return sc.handleBlockResponse(event)
	case bcNoBlockResponse:
		return sc.handleNoBlockResponse(event)
	case rTrySchedule:
		return sc.handleTrySchedule(event)
	case bcAddNewPeer:
		return sc.handleAddNewPeer(event)
	case bcRemovePeer:
		return sc.handleRemovePeer(event)
	case rTryPrunePeer:
		return sc.handleTryPrunePeer(event)
	case pcBlockProcessed:
		return sc.handleBlockProcessed(event)
	case pcBlockVerificationFailure:
		return sc.handleBlockProcessError(event)
	default:
		return scSchedulerFail{reason: fmt.Errorf("unknown event %v", event)}, nil
	}
}
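// A minimal driver sketch for the state machine above (illustrative only; the
// event loop in the real reactor is more involved):
//
//	sc := newScheduler(100, time.Now())
//	nextEvent, _ := sc.handle(bcAddNewPeer{peerID: "p1"})
//	// feed nextEvent back into the routine, then repeat for status and
//	// block responses until a scFinishedEv is returned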