github.com/Team-Kujira/tendermint@v0.34.24-indexer/statesync/syncer.go (about) 1 package statesync 2 3 import ( 4 "bytes" 5 "context" 6 "errors" 7 "fmt" 8 "time" 9 10 abci "github.com/tendermint/tendermint/abci/types" 11 "github.com/tendermint/tendermint/config" 12 "github.com/tendermint/tendermint/libs/log" 13 tmsync "github.com/tendermint/tendermint/libs/sync" 14 "github.com/tendermint/tendermint/light" 15 "github.com/tendermint/tendermint/p2p" 16 ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync" 17 "github.com/tendermint/tendermint/proxy" 18 sm "github.com/tendermint/tendermint/state" 19 "github.com/tendermint/tendermint/types" 20 ) 21 22 const ( 23 // chunkTimeout is the timeout while waiting for the next chunk from the chunk queue. 24 chunkTimeout = 2 * time.Minute 25 26 // minimumDiscoveryTime is the lowest allowable time for a 27 // SyncAny discovery time. 28 minimumDiscoveryTime = 5 * time.Second 29 ) 30 31 var ( 32 // errAbort is returned by Sync() when snapshot restoration is aborted. 33 errAbort = errors.New("state sync aborted") 34 // errRetrySnapshot is returned by Sync() when the snapshot should be retried. 35 errRetrySnapshot = errors.New("retry snapshot") 36 // errRejectSnapshot is returned by Sync() when the snapshot is rejected. 37 errRejectSnapshot = errors.New("snapshot was rejected") 38 // errRejectFormat is returned by Sync() when the snapshot format is rejected. 39 errRejectFormat = errors.New("snapshot format was rejected") 40 // errRejectSender is returned by Sync() when the snapshot sender is rejected. 41 errRejectSender = errors.New("snapshot sender was rejected") 42 // errVerifyFailed is returned by Sync() when app hash or last height verification fails. 43 errVerifyFailed = errors.New("verification failed") 44 // errTimeout is returned by Sync() when we've waited too long to receive a chunk. 45 errTimeout = errors.New("timed out waiting for chunk") 46 // errNoSnapshots is returned by SyncAny() if no snapshots are found and discovery is disabled. 47 errNoSnapshots = errors.New("no suitable snapshots found") 48 ) 49 50 // syncer runs a state sync against an ABCI app. Use either SyncAny() to automatically attempt to 51 // sync all snapshots in the pool (pausing to discover new ones), or Sync() to sync a specific 52 // snapshot. Snapshots and chunks are fed via AddSnapshot() and AddChunk() as appropriate. 53 type syncer struct { 54 logger log.Logger 55 stateProvider StateProvider 56 conn proxy.AppConnSnapshot 57 connQuery proxy.AppConnQuery 58 snapshots *snapshotPool 59 tempDir string 60 chunkFetchers int32 61 retryTimeout time.Duration 62 63 mtx tmsync.RWMutex 64 chunks *chunkQueue 65 } 66 67 // newSyncer creates a new syncer. 68 func newSyncer( 69 cfg config.StateSyncConfig, 70 logger log.Logger, 71 conn proxy.AppConnSnapshot, 72 connQuery proxy.AppConnQuery, 73 stateProvider StateProvider, 74 tempDir string, 75 ) *syncer { 76 77 return &syncer{ 78 logger: logger, 79 stateProvider: stateProvider, 80 conn: conn, 81 connQuery: connQuery, 82 snapshots: newSnapshotPool(), 83 tempDir: tempDir, 84 chunkFetchers: cfg.ChunkFetchers, 85 retryTimeout: cfg.ChunkRequestTimeout, 86 } 87 } 88 89 // AddChunk adds a chunk to the chunk queue, if any. It returns false if the chunk has already 90 // been added to the queue, or an error if there's no sync in progress. 91 func (s *syncer) AddChunk(chunk *chunk) (bool, error) { 92 s.mtx.RLock() 93 defer s.mtx.RUnlock() 94 if s.chunks == nil { 95 return false, errors.New("no state sync in progress") 96 } 97 added, err := s.chunks.Add(chunk) 98 if err != nil { 99 return false, err 100 } 101 if added { 102 s.logger.Debug("Added chunk to queue", "height", chunk.Height, "format", chunk.Format, 103 "chunk", chunk.Index) 104 } else { 105 s.logger.Debug("Ignoring duplicate chunk in queue", "height", chunk.Height, "format", chunk.Format, 106 "chunk", chunk.Index) 107 } 108 return added, nil 109 } 110 111 // AddSnapshot adds a snapshot to the snapshot pool. It returns true if a new, previously unseen 112 // snapshot was accepted and added. 113 func (s *syncer) AddSnapshot(peer p2p.Peer, snapshot *snapshot) (bool, error) { 114 added, err := s.snapshots.Add(peer, snapshot) 115 if err != nil { 116 return false, err 117 } 118 if added { 119 s.logger.Info("Discovered new snapshot", "height", snapshot.Height, "format", snapshot.Format, 120 "hash", snapshot.Hash) 121 } 122 return added, nil 123 } 124 125 // AddPeer adds a peer to the pool. For now we just keep it simple and send a single request 126 // to discover snapshots, later we may want to do retries and stuff. 127 func (s *syncer) AddPeer(peer p2p.Peer) { 128 s.logger.Debug("Requesting snapshots from peer", "peer", peer.ID()) 129 e := p2p.Envelope{ 130 ChannelID: SnapshotChannel, 131 Message: &ssproto.SnapshotsRequest{}, 132 } 133 p2p.SendEnvelopeShim(peer, e, s.logger) //nolint: staticcheck 134 } 135 136 // RemovePeer removes a peer from the pool. 137 func (s *syncer) RemovePeer(peer p2p.Peer) { 138 s.logger.Debug("Removing peer from sync", "peer", peer.ID()) 139 s.snapshots.RemovePeer(peer.ID()) 140 } 141 142 // SyncAny tries to sync any of the snapshots in the snapshot pool, waiting to discover further 143 // snapshots if none were found and discoveryTime > 0. It returns the latest state and block commit 144 // which the caller must use to bootstrap the node. 145 func (s *syncer) SyncAny(discoveryTime time.Duration, retryHook func()) (sm.State, *types.Commit, error) { 146 if discoveryTime != 0 && discoveryTime < minimumDiscoveryTime { 147 discoveryTime = 5 * minimumDiscoveryTime 148 } 149 150 if discoveryTime > 0 { 151 s.logger.Info("sync any", "msg", log.NewLazySprintf("Discovering snapshots for %v", discoveryTime)) 152 time.Sleep(discoveryTime) 153 } 154 155 // The app may ask us to retry a snapshot restoration, in which case we need to reuse 156 // the snapshot and chunk queue from the previous loop iteration. 157 var ( 158 snapshot *snapshot 159 chunks *chunkQueue 160 err error 161 ) 162 for { 163 // If not nil, we're going to retry restoration of the same snapshot. 164 if snapshot == nil { 165 snapshot = s.snapshots.Best() 166 chunks = nil 167 } 168 if snapshot == nil { 169 if discoveryTime == 0 { 170 return sm.State{}, nil, errNoSnapshots 171 } 172 retryHook() 173 s.logger.Info("sync any", "msg", log.NewLazySprintf("Discovering snapshots for %v", discoveryTime)) 174 time.Sleep(discoveryTime) 175 continue 176 } 177 if chunks == nil { 178 chunks, err = newChunkQueue(snapshot, s.tempDir) 179 if err != nil { 180 return sm.State{}, nil, fmt.Errorf("failed to create chunk queue: %w", err) 181 } 182 defer chunks.Close() // in case we forget to close it elsewhere 183 } 184 185 newState, commit, err := s.Sync(snapshot, chunks) 186 switch { 187 case err == nil: 188 return newState, commit, nil 189 190 case errors.Is(err, errAbort): 191 return sm.State{}, nil, err 192 193 case errors.Is(err, errRetrySnapshot): 194 chunks.RetryAll() 195 s.logger.Info("Retrying snapshot", "height", snapshot.Height, "format", snapshot.Format, 196 "hash", snapshot.Hash) 197 continue 198 199 case errors.Is(err, errTimeout): 200 s.snapshots.Reject(snapshot) 201 s.logger.Error("Timed out waiting for snapshot chunks, rejected snapshot", 202 "height", snapshot.Height, "format", snapshot.Format, "hash", snapshot.Hash) 203 204 case errors.Is(err, errRejectSnapshot): 205 s.snapshots.Reject(snapshot) 206 s.logger.Info("Snapshot rejected", "height", snapshot.Height, "format", snapshot.Format, 207 "hash", snapshot.Hash) 208 209 case errors.Is(err, errRejectFormat): 210 s.snapshots.RejectFormat(snapshot.Format) 211 s.logger.Info("Snapshot format rejected", "format", snapshot.Format) 212 213 case errors.Is(err, errRejectSender): 214 s.logger.Info("Snapshot senders rejected", "height", snapshot.Height, "format", snapshot.Format, 215 "hash", snapshot.Hash) 216 for _, peer := range s.snapshots.GetPeers(snapshot) { 217 s.snapshots.RejectPeer(peer.ID()) 218 s.logger.Info("Snapshot sender rejected", "peer", peer.ID()) 219 } 220 221 case errors.Is(err, context.DeadlineExceeded): 222 s.logger.Info("Timed out validating snapshot, rejecting", "height", snapshot.Height, "err", err) 223 s.snapshots.Reject(snapshot) 224 225 default: 226 return sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err) 227 } 228 229 // Discard snapshot and chunks for next iteration 230 err = chunks.Close() 231 if err != nil { 232 s.logger.Error("Failed to clean up chunk queue", "err", err) 233 } 234 snapshot = nil 235 chunks = nil 236 } 237 } 238 239 // Sync executes a sync for a specific snapshot, returning the latest state and block commit which 240 // the caller must use to bootstrap the node. 241 func (s *syncer) Sync(snapshot *snapshot, chunks *chunkQueue) (sm.State, *types.Commit, error) { 242 s.mtx.Lock() 243 if s.chunks != nil { 244 s.mtx.Unlock() 245 return sm.State{}, nil, errors.New("a state sync is already in progress") 246 } 247 s.chunks = chunks 248 s.mtx.Unlock() 249 defer func() { 250 s.mtx.Lock() 251 s.chunks = nil 252 s.mtx.Unlock() 253 }() 254 255 hctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second) 256 defer cancel() 257 258 appHash, err := s.stateProvider.AppHash(hctx, snapshot.Height) 259 if err != nil { 260 s.logger.Info("failed to fetch and verify app hash", "err", err) 261 if err == light.ErrNoWitnesses { 262 return sm.State{}, nil, err 263 } 264 return sm.State{}, nil, errRejectSnapshot 265 } 266 snapshot.trustedAppHash = appHash 267 268 // Offer snapshot to ABCI app. 269 err = s.offerSnapshot(snapshot) 270 if err != nil { 271 return sm.State{}, nil, err 272 } 273 274 // Spawn chunk fetchers. They will terminate when the chunk queue is closed or context cancelled. 275 fetchCtx, cancel := context.WithCancel(context.TODO()) 276 defer cancel() 277 for i := int32(0); i < s.chunkFetchers; i++ { 278 go s.fetchChunks(fetchCtx, snapshot, chunks) 279 } 280 281 pctx, pcancel := context.WithTimeout(context.TODO(), 30*time.Second) 282 defer pcancel() 283 284 // Optimistically build new state, so we don't discover any light client failures at the end. 285 state, err := s.stateProvider.State(pctx, snapshot.Height) 286 if err != nil { 287 s.logger.Info("failed to fetch and verify tendermint state", "err", err) 288 if err == light.ErrNoWitnesses { 289 return sm.State{}, nil, err 290 } 291 return sm.State{}, nil, errRejectSnapshot 292 } 293 commit, err := s.stateProvider.Commit(pctx, snapshot.Height) 294 if err != nil { 295 s.logger.Info("failed to fetch and verify commit", "err", err) 296 if err == light.ErrNoWitnesses { 297 return sm.State{}, nil, err 298 } 299 return sm.State{}, nil, errRejectSnapshot 300 } 301 302 // Restore snapshot 303 err = s.applyChunks(chunks) 304 if err != nil { 305 return sm.State{}, nil, err 306 } 307 308 // Verify app and app version 309 if err := s.verifyApp(snapshot, state.Version.Consensus.App); err != nil { 310 return sm.State{}, nil, err 311 } 312 313 // Done! 🎉 314 s.logger.Info("Snapshot restored", "height", snapshot.Height, "format", snapshot.Format, 315 "hash", snapshot.Hash) 316 317 return state, commit, nil 318 } 319 320 // offerSnapshot offers a snapshot to the app. It returns various errors depending on the app's 321 // response, or nil if the snapshot was accepted. 322 func (s *syncer) offerSnapshot(snapshot *snapshot) error { 323 s.logger.Info("Offering snapshot to ABCI app", "height", snapshot.Height, 324 "format", snapshot.Format, "hash", snapshot.Hash) 325 resp, err := s.conn.OfferSnapshotSync(abci.RequestOfferSnapshot{ 326 Snapshot: &abci.Snapshot{ 327 Height: snapshot.Height, 328 Format: snapshot.Format, 329 Chunks: snapshot.Chunks, 330 Hash: snapshot.Hash, 331 Metadata: snapshot.Metadata, 332 }, 333 AppHash: snapshot.trustedAppHash, 334 }) 335 if err != nil { 336 return fmt.Errorf("failed to offer snapshot: %w", err) 337 } 338 switch resp.Result { 339 case abci.ResponseOfferSnapshot_ACCEPT: 340 s.logger.Info("Snapshot accepted, restoring", "height", snapshot.Height, 341 "format", snapshot.Format, "hash", snapshot.Hash) 342 return nil 343 case abci.ResponseOfferSnapshot_ABORT: 344 return errAbort 345 case abci.ResponseOfferSnapshot_REJECT: 346 return errRejectSnapshot 347 case abci.ResponseOfferSnapshot_REJECT_FORMAT: 348 return errRejectFormat 349 case abci.ResponseOfferSnapshot_REJECT_SENDER: 350 return errRejectSender 351 default: 352 return fmt.Errorf("unknown ResponseOfferSnapshot result %v", resp.Result) 353 } 354 } 355 356 // applyChunks applies chunks to the app. It returns various errors depending on the app's 357 // response, or nil once the snapshot is fully restored. 358 func (s *syncer) applyChunks(chunks *chunkQueue) error { 359 for { 360 chunk, err := chunks.Next() 361 if err == errDone { 362 return nil 363 } else if err != nil { 364 return fmt.Errorf("failed to fetch chunk: %w", err) 365 } 366 367 resp, err := s.conn.ApplySnapshotChunkSync(abci.RequestApplySnapshotChunk{ 368 Index: chunk.Index, 369 Chunk: chunk.Chunk, 370 Sender: string(chunk.Sender), 371 }) 372 if err != nil { 373 return fmt.Errorf("failed to apply chunk %v: %w", chunk.Index, err) 374 } 375 s.logger.Info("Applied snapshot chunk to ABCI app", "height", chunk.Height, 376 "format", chunk.Format, "chunk", chunk.Index, "total", chunks.Size()) 377 378 // Discard and refetch any chunks as requested by the app 379 for _, index := range resp.RefetchChunks { 380 err := chunks.Discard(index) 381 if err != nil { 382 return fmt.Errorf("failed to discard chunk %v: %w", index, err) 383 } 384 } 385 386 // Reject any senders as requested by the app 387 for _, sender := range resp.RejectSenders { 388 if sender != "" { 389 s.snapshots.RejectPeer(p2p.ID(sender)) 390 err := chunks.DiscardSender(p2p.ID(sender)) 391 if err != nil { 392 return fmt.Errorf("failed to reject sender: %w", err) 393 } 394 } 395 } 396 397 switch resp.Result { 398 case abci.ResponseApplySnapshotChunk_ACCEPT: 399 case abci.ResponseApplySnapshotChunk_ABORT: 400 return errAbort 401 case abci.ResponseApplySnapshotChunk_RETRY: 402 chunks.Retry(chunk.Index) 403 case abci.ResponseApplySnapshotChunk_RETRY_SNAPSHOT: 404 return errRetrySnapshot 405 case abci.ResponseApplySnapshotChunk_REJECT_SNAPSHOT: 406 return errRejectSnapshot 407 default: 408 return fmt.Errorf("unknown ResponseApplySnapshotChunk result %v", resp.Result) 409 } 410 } 411 } 412 413 // fetchChunks requests chunks from peers, receiving allocations from the chunk queue. Chunks 414 // will be received from the reactor via syncer.AddChunks() to chunkQueue.Add(). 415 func (s *syncer) fetchChunks(ctx context.Context, snapshot *snapshot, chunks *chunkQueue) { 416 var ( 417 next = true 418 index uint32 419 err error 420 ) 421 422 for { 423 if next { 424 index, err = chunks.Allocate() 425 if errors.Is(err, errDone) { 426 // Keep checking until the context is canceled (restore is done), in case any 427 // chunks need to be refetched. 428 select { 429 case <-ctx.Done(): 430 return 431 default: 432 } 433 time.Sleep(2 * time.Second) 434 continue 435 } 436 if err != nil { 437 s.logger.Error("Failed to allocate chunk from queue", "err", err) 438 return 439 } 440 } 441 s.logger.Info("Fetching snapshot chunk", "height", snapshot.Height, 442 "format", snapshot.Format, "chunk", index, "total", chunks.Size()) 443 444 ticker := time.NewTicker(s.retryTimeout) 445 defer ticker.Stop() 446 447 s.requestChunk(snapshot, index) 448 449 select { 450 case <-chunks.WaitFor(index): 451 next = true 452 453 case <-ticker.C: 454 next = false 455 456 case <-ctx.Done(): 457 return 458 } 459 460 ticker.Stop() 461 } 462 } 463 464 // requestChunk requests a chunk from a peer. 465 func (s *syncer) requestChunk(snapshot *snapshot, chunk uint32) { 466 peer := s.snapshots.GetPeer(snapshot) 467 if peer == nil { 468 s.logger.Error("No valid peers found for snapshot", "height", snapshot.Height, 469 "format", snapshot.Format, "hash", snapshot.Hash) 470 return 471 } 472 s.logger.Debug("Requesting snapshot chunk", "height", snapshot.Height, 473 "format", snapshot.Format, "chunk", chunk, "peer", peer.ID()) 474 p2p.SendEnvelopeShim(peer, p2p.Envelope{ //nolint: staticcheck 475 ChannelID: ChunkChannel, 476 Message: &ssproto.ChunkRequest{ 477 Height: snapshot.Height, 478 Format: snapshot.Format, 479 Index: chunk, 480 }, 481 }, s.logger) 482 } 483 484 // verifyApp verifies the sync, checking the app hash, last block height and app version 485 func (s *syncer) verifyApp(snapshot *snapshot, appVersion uint64) error { 486 resp, err := s.connQuery.InfoSync(proxy.RequestInfo) 487 if err != nil { 488 return fmt.Errorf("failed to query ABCI app for appHash: %w", err) 489 } 490 491 // sanity check that the app version in the block matches the application's own record 492 // of its version 493 if resp.AppVersion != appVersion { 494 // An error here most likely means that the app hasn't inplemented state sync 495 // or the Info call correctly 496 return fmt.Errorf("app version mismatch. Expected: %d, got: %d", 497 appVersion, resp.AppVersion) 498 } 499 if !bytes.Equal(snapshot.trustedAppHash, resp.LastBlockAppHash) { 500 s.logger.Error("appHash verification failed", 501 "expected", snapshot.trustedAppHash, 502 "actual", resp.LastBlockAppHash) 503 return errVerifyFailed 504 } 505 if uint64(resp.LastBlockHeight) != snapshot.Height { 506 s.logger.Error( 507 "ABCI app reported unexpected last block height", 508 "expected", snapshot.Height, 509 "actual", resp.LastBlockHeight, 510 ) 511 return errVerifyFailed 512 } 513 514 s.logger.Info("Verified ABCI app", "height", snapshot.Height, "appHash", snapshot.trustedAppHash) 515 return nil 516 }