// github.com/Finschia/ostracon@v1.1.5/statesync/syncer.go

package statesync

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"time"

	abci "github.com/tendermint/tendermint/abci/types"
	ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync"

	"github.com/Finschia/ostracon/config"
	"github.com/Finschia/ostracon/libs/log"
	tmsync "github.com/Finschia/ostracon/libs/sync"
	"github.com/Finschia/ostracon/light"
	"github.com/Finschia/ostracon/p2p"
	"github.com/Finschia/ostracon/proxy"
	sm "github.com/Finschia/ostracon/state"
	"github.com/Finschia/ostracon/types"
)

const (
	// chunkTimeout is the timeout while waiting for the next chunk from the chunk queue.
	chunkTimeout = 2 * time.Minute

	// minimumDiscoveryTime is the lowest allowable time for a
	// SyncAny discovery time.
	minimumDiscoveryTime = 5 * time.Second
)

var (
	// errAbort is returned by Sync() when snapshot restoration is aborted.
	errAbort = errors.New("state sync aborted")
	// errRetrySnapshot is returned by Sync() when the snapshot should be retried.
	errRetrySnapshot = errors.New("retry snapshot")
	// errRejectSnapshot is returned by Sync() when the snapshot is rejected.
	errRejectSnapshot = errors.New("snapshot was rejected")
	// errRejectFormat is returned by Sync() when the snapshot format is rejected.
	errRejectFormat = errors.New("snapshot format was rejected")
	// errRejectSender is returned by Sync() when the snapshot sender is rejected.
	errRejectSender = errors.New("snapshot sender was rejected")
	// errVerifyFailed is returned by Sync() when app hash or last height verification fails.
	errVerifyFailed = errors.New("verification failed")
	// errTimeout is returned by Sync() when we've waited too long to receive a chunk.
	errTimeout = errors.New("timed out waiting for chunk")
	// errNoSnapshots is returned by SyncAny() if no snapshots are found and discovery is disabled.
	errNoSnapshots = errors.New("no suitable snapshots found")
)

// syncer runs a state sync against an ABCI app. Use either SyncAny() to automatically attempt to
// sync all snapshots in the pool (pausing to discover new ones), or Sync() to sync a specific
// snapshot. Snapshots and chunks are fed via AddSnapshot() and AddChunk() as appropriate.
type syncer struct {
	logger        log.Logger
	stateProvider StateProvider
	conn          proxy.AppConnSnapshot
	connQuery     proxy.AppConnQuery
	snapshots     *snapshotPool
	tempDir       string
	chunkFetchers int32
	retryTimeout  time.Duration

	mtx    tmsync.RWMutex
	chunks *chunkQueue
}

// newSyncer creates a new syncer.
func newSyncer(
	cfg config.StateSyncConfig,
	logger log.Logger,
	conn proxy.AppConnSnapshot,
	connQuery proxy.AppConnQuery,
	stateProvider StateProvider,
	tempDir string,
) *syncer {
	return &syncer{
		logger:        logger,
		stateProvider: stateProvider,
		conn:          conn,
		connQuery:     connQuery,
		snapshots:     newSnapshotPool(),
		tempDir:       tempDir,
		chunkFetchers: cfg.ChunkFetchers,
		retryTimeout:  cfg.ChunkRequestTimeout,
	}
}
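
// exampleWireSyncer is an illustrative sketch (not referenced anywhere in the
// package) of how a reactor might construct and drive a syncer. The arguments
// are assumed to be supplied by the node's setup code; the temp dir and
// discovery time are arbitrary.
func exampleWireSyncer(
	cfg config.StateSyncConfig,
	logger log.Logger,
	conn proxy.AppConnSnapshot,
	connQuery proxy.AppConnQuery,
	provider StateProvider,
) (sm.State, sm.State, *types.Commit, error) {
	s := newSyncer(cfg, logger, conn, connQuery, provider, "/tmp")
	// Snapshots and chunks arrive asynchronously from the network: the reactor
	// calls AddSnapshot and AddChunk from its Receive handler. SyncAny then
	// blocks until some snapshot restores successfully or fails permanently.
	return s.SyncAny(10*time.Second, func() {})
}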
// AddChunk adds a chunk to the chunk queue, if one exists. It returns false if the chunk has
// already been added to the queue, or an error if there's no sync in progress.
func (s *syncer) AddChunk(chunk *chunk) (bool, error) {
	s.mtx.RLock()
	defer s.mtx.RUnlock()
	if s.chunks == nil {
		return false, errors.New("no state sync in progress")
	}
	added, err := s.chunks.Add(chunk)
	if err != nil {
		return false, err
	}
	if added {
		s.logger.Debug("Added chunk to queue", "height", chunk.Height, "format", chunk.Format,
			"chunk", chunk.Index)
	} else {
		s.logger.Debug("Ignoring duplicate chunk in queue", "height", chunk.Height, "format", chunk.Format,
			"chunk", chunk.Index)
	}
	return added, nil
}

// AddSnapshot adds a snapshot to the snapshot pool. It returns true if a new, previously unseen
// snapshot was accepted and added.
func (s *syncer) AddSnapshot(peer p2p.Peer, snapshot *snapshot) (bool, error) {
	added, err := s.snapshots.Add(peer, snapshot)
	if err != nil {
		return false, err
	}
	if added {
		s.logger.Info("Discovered new snapshot", "height", snapshot.Height, "format", snapshot.Format,
			"hash", snapshot.Hash)
	}
	return added, nil
}

// AddPeer adds a peer to the pool. For now we keep it simple and send a single request
// to discover snapshots; later we may want to add retries.
func (s *syncer) AddPeer(peer p2p.Peer) {
	s.logger.Debug("Requesting snapshots from peer", "peer", peer.ID())
	e := p2p.Envelope{
		ChannelID: SnapshotChannel,
		Message:   &ssproto.SnapshotsRequest{},
	}
	p2p.SendEnvelopeShim(peer, e, s.logger) //nolint: staticcheck
}

// RemovePeer removes a peer from the pool.
func (s *syncer) RemovePeer(peer p2p.Peer) {
	s.logger.Debug("Removing peer from sync", "peer", peer.ID())
	s.snapshots.RemovePeer(peer.ID())
}
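
// Discovery flow (a sketch of the surrounding protocol, as implemented by the
// reactor rather than this file): AddPeer sends a single SnapshotsRequest on
// the SnapshotChannel, the peer replies with one SnapshotsResponse per
// snapshot it can serve, and the reactor turns each response into an
// AddSnapshot call here. There are no retries, so a peer that misses the
// request contributes no snapshots until it reconnects.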
// SyncAny tries to sync any of the snapshots in the snapshot pool, waiting to discover further
// snapshots if none were found and discoveryTime > 0. It returns the latest state, previous state
// and block commit, which the caller must use to bootstrap the node.
func (s *syncer) SyncAny(discoveryTime time.Duration, retryHook func()) (sm.State, sm.State, *types.Commit, error) {
	if discoveryTime != 0 && discoveryTime < minimumDiscoveryTime {
		discoveryTime = 5 * minimumDiscoveryTime
	}

	if discoveryTime > 0 {
		s.logger.Info("sync any", "msg", log.NewLazySprintf("Discovering snapshots for %v", discoveryTime))
		time.Sleep(discoveryTime)
	}

	// The app may ask us to retry a snapshot restoration, in which case we need to reuse
	// the snapshot and chunk queue from the previous loop iteration.
	var (
		snapshot *snapshot
		chunks   *chunkQueue
		err      error
	)
	for {
		// If snapshot is not nil, we're retrying restoration of the same snapshot.
		if snapshot == nil {
			snapshot = s.snapshots.Best()
			chunks = nil
		}
		if snapshot == nil {
			if discoveryTime == 0 {
				return sm.State{}, sm.State{}, nil, errNoSnapshots
			}
			retryHook()
			s.logger.Info("sync any", "msg", log.NewLazySprintf("Discovering snapshots for %v", discoveryTime))
			time.Sleep(discoveryTime)
			continue
		}
		if chunks == nil {
			chunks, err = newChunkQueue(snapshot, s.tempDir)
			if err != nil {
				return sm.State{}, sm.State{}, nil, fmt.Errorf("failed to create chunk queue: %w", err)
			}
			defer chunks.Close() // in case we forget to close it elsewhere
		}

		newState, previousState, commit, err := s.Sync(snapshot, chunks)
		switch {
		case err == nil:
			return newState, previousState, commit, nil

		case errors.Is(err, errAbort):
			return sm.State{}, sm.State{}, nil, err

		case errors.Is(err, errRetrySnapshot):
			chunks.RetryAll()
			s.logger.Info("Retrying snapshot", "height", snapshot.Height, "format", snapshot.Format,
				"hash", snapshot.Hash)
			continue

		case errors.Is(err, errTimeout):
			s.snapshots.Reject(snapshot)
			s.logger.Error("Timed out waiting for snapshot chunks, rejected snapshot",
				"height", snapshot.Height, "format", snapshot.Format, "hash", snapshot.Hash)

		case errors.Is(err, errRejectSnapshot):
			s.snapshots.Reject(snapshot)
			s.logger.Info("Snapshot rejected", "height", snapshot.Height, "format", snapshot.Format,
				"hash", snapshot.Hash)

		case errors.Is(err, errRejectFormat):
			s.snapshots.RejectFormat(snapshot.Format)
			s.logger.Info("Snapshot format rejected", "format", snapshot.Format)

		case errors.Is(err, errRejectSender):
			s.logger.Info("Snapshot senders rejected", "height", snapshot.Height, "format", snapshot.Format,
				"hash", snapshot.Hash)
			for _, peer := range s.snapshots.GetPeers(snapshot) {
				s.snapshots.RejectPeer(peer.ID())
				s.logger.Info("Snapshot sender rejected", "peer", peer.ID())
			}

		case errors.Is(err, context.DeadlineExceeded):
			s.logger.Info("Timed out validating snapshot, rejecting", "height", snapshot.Height, "err", err)
			s.snapshots.Reject(snapshot)

		default:
			return sm.State{}, sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err)
		}

		// Discard the snapshot and chunks for the next iteration.
		err = chunks.Close()
		if err != nil {
			s.logger.Error("Failed to clean up chunk queue", "err", err)
		}
		snapshot = nil
		chunks = nil
	}
}
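
// For orientation, Sync below performs the restore in this order (a summary of
// the code that follows, not additional behavior):
//
//	1. stateProvider.AppHash      — light-client-verified app hash for the snapshot height
//	2. offerSnapshot              — the ABCI app accepts or rejects the snapshot
//	3. fetchChunks                — s.chunkFetchers goroutines request chunks from peers
//	4. stateProvider.State/Commit — optimistically build the state used to bootstrap the node
//	5. applyChunks                — stream the fetched chunks into the app
//	6. verifyApp                  — cross-check app hash, last height and app version via ABCI Info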
// Sync executes a sync for a specific snapshot, returning the latest state, previous state and
// block commit, which the caller must use to bootstrap the node.
func (s *syncer) Sync(snapshot *snapshot, chunks *chunkQueue) (sm.State, sm.State, *types.Commit, error) {
	s.mtx.Lock()
	if s.chunks != nil {
		s.mtx.Unlock()
		return sm.State{}, sm.State{}, nil, errors.New("a state sync is already in progress")
	}
	s.chunks = chunks
	s.mtx.Unlock()
	defer func() {
		s.mtx.Lock()
		s.chunks = nil
		s.mtx.Unlock()
	}()

	hctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
	defer cancel()

	appHash, err := s.stateProvider.AppHash(hctx, snapshot.Height)
	if err != nil {
		s.logger.Info("failed to fetch and verify app hash", "err", err)
		if err == light.ErrNoWitnesses {
			return sm.State{}, sm.State{}, nil, err
		}
		return sm.State{}, sm.State{}, nil, errRejectSnapshot
	}
	snapshot.trustedAppHash = appHash

	// Offer the snapshot to the ABCI app.
	err = s.offerSnapshot(snapshot)
	if err != nil {
		return sm.State{}, sm.State{}, nil, err
	}

	// Spawn chunk fetchers. They will terminate when the chunk queue is closed or the context
	// is cancelled.
	fetchCtx, cancel := context.WithCancel(context.TODO())
	defer cancel()
	for i := int32(0); i < s.chunkFetchers; i++ {
		go s.fetchChunks(fetchCtx, snapshot, chunks)
	}

	pctx, pcancel := context.WithTimeout(context.TODO(), 30*time.Second)
	defer pcancel()

	// Optimistically build the new state, so we don't discover any light client failures at the end.
	state, err := s.stateProvider.State(pctx, snapshot.Height)
	if err != nil {
		s.logger.Info("failed to fetch and verify ostracon state", "err", err)
		if err == light.ErrNoWitnesses {
			return sm.State{}, sm.State{}, nil, err
		}
		return sm.State{}, sm.State{}, nil, errRejectSnapshot
	}
	var previousState sm.State
	if snapshot.Height-1 < 1 {
		previousState = sm.State{}
	} else {
		previousState, err = s.stateProvider.State(pctx, snapshot.Height-1)
		if err != nil {
			s.logger.Info("failed to fetch and verify ostracon previous state", "err", err)
			if err == light.ErrNoWitnesses {
				return sm.State{}, sm.State{}, nil, err
			}
			return sm.State{}, sm.State{}, nil, errRejectSnapshot
		}
	}
	commit, err := s.stateProvider.Commit(pctx, snapshot.Height)
	if err != nil {
		s.logger.Info("failed to fetch and verify commit", "err", err)
		if err == light.ErrNoWitnesses {
			return sm.State{}, sm.State{}, nil, err
		}
		return sm.State{}, sm.State{}, nil, errRejectSnapshot
	}

	// Restore the snapshot.
	err = s.applyChunks(chunks)
	if err != nil {
		return sm.State{}, sm.State{}, nil, err
	}

	// Verify the app hash, last block height and app version.
	if err := s.verifyApp(snapshot, state.Version.Consensus.App); err != nil {
		return sm.State{}, sm.State{}, nil, err
	}

	// Done! 🎉
	s.logger.Info("Snapshot restored", "height", snapshot.Height, "format", snapshot.Format,
		"hash", snapshot.Hash)

	return state, previousState, commit, nil
}
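
// For context, the application-side counterpart of offerSnapshot below might
// look like this (a sketch; the App type and supportedFormat field are
// illustrative, while the method signature is the standard abci.Application
// one):
//
//	func (app *App) OfferSnapshot(req abci.RequestOfferSnapshot) abci.ResponseOfferSnapshot {
//		// Reject formats this binary cannot restore; the syncer will then
//		// skip every snapshot advertising the same format.
//		if req.Snapshot == nil || req.Snapshot.Format != app.supportedFormat {
//			return abci.ResponseOfferSnapshot{Result: abci.ResponseOfferSnapshot_REJECT_FORMAT}
//		}
//		return abci.ResponseOfferSnapshot{Result: abci.ResponseOfferSnapshot_ACCEPT}
//	}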
// offerSnapshot offers a snapshot to the app. It returns various errors depending on the app's
// response, or nil if the snapshot was accepted.
func (s *syncer) offerSnapshot(snapshot *snapshot) error {
	s.logger.Info("Offering snapshot to ABCI app", "height", snapshot.Height,
		"format", snapshot.Format, "hash", snapshot.Hash)
	resp, err := s.conn.OfferSnapshotSync(abci.RequestOfferSnapshot{
		Snapshot: &abci.Snapshot{
			Height:   snapshot.Height,
			Format:   snapshot.Format,
			Chunks:   snapshot.Chunks,
			Hash:     snapshot.Hash,
			Metadata: snapshot.Metadata,
		},
		AppHash: snapshot.trustedAppHash,
	})
	if err != nil {
		return fmt.Errorf("failed to offer snapshot: %w", err)
	}
	switch resp.Result {
	case abci.ResponseOfferSnapshot_ACCEPT:
		s.logger.Info("Snapshot accepted, restoring", "height", snapshot.Height,
			"format", snapshot.Format, "hash", snapshot.Hash)
		return nil
	case abci.ResponseOfferSnapshot_ABORT:
		return errAbort
	case abci.ResponseOfferSnapshot_REJECT:
		return errRejectSnapshot
	case abci.ResponseOfferSnapshot_REJECT_FORMAT:
		return errRejectFormat
	case abci.ResponseOfferSnapshot_REJECT_SENDER:
		return errRejectSender
	default:
		return fmt.Errorf("unknown ResponseOfferSnapshot result %v", resp.Result)
	}
}

// applyChunks applies chunks to the app. It returns various errors depending on the app's
// response, or nil once the snapshot is fully restored.
func (s *syncer) applyChunks(chunks *chunkQueue) error {
	for {
		chunk, err := chunks.Next()
		if err == errDone {
			return nil
		} else if err != nil {
			return fmt.Errorf("failed to fetch chunk: %w", err)
		}

		resp, err := s.conn.ApplySnapshotChunkSync(abci.RequestApplySnapshotChunk{
			Index:  chunk.Index,
			Chunk:  chunk.Chunk,
			Sender: string(chunk.Sender),
		})
		if err != nil {
			return fmt.Errorf("failed to apply chunk %v: %w", chunk.Index, err)
		}
		s.logger.Info("Applied snapshot chunk to ABCI app", "height", chunk.Height,
			"format", chunk.Format, "chunk", chunk.Index, "total", chunks.Size())

		// Discard and refetch any chunks as requested by the app.
		for _, index := range resp.RefetchChunks {
			err := chunks.Discard(index)
			if err != nil {
				return fmt.Errorf("failed to discard chunk %v: %w", index, err)
			}
		}

		// Reject any senders as requested by the app.
		for _, sender := range resp.RejectSenders {
			if sender != "" {
				s.snapshots.RejectPeer(p2p.ID(sender))
				err := chunks.DiscardSender(p2p.ID(sender))
				if err != nil {
					return fmt.Errorf("failed to reject sender: %w", err)
				}
			}
		}

		switch resp.Result {
		case abci.ResponseApplySnapshotChunk_ACCEPT:
		case abci.ResponseApplySnapshotChunk_ABORT:
			return errAbort
		case abci.ResponseApplySnapshotChunk_RETRY:
			chunks.Retry(chunk.Index)
		case abci.ResponseApplySnapshotChunk_RETRY_SNAPSHOT:
			return errRetrySnapshot
		case abci.ResponseApplySnapshotChunk_REJECT_SNAPSHOT:
			return errRejectSnapshot
		default:
			return fmt.Errorf("unknown ResponseApplySnapshotChunk result %v", resp.Result)
		}
	}
}
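
// Likewise, the application-side counterpart of applyChunks might look like
// this (a sketch; app.restorer and its Feed method are illustrative):
//
//	func (app *App) ApplySnapshotChunk(req abci.RequestApplySnapshotChunk) abci.ResponseApplySnapshotChunk {
//		if err := app.restorer.Feed(req.Chunk); err != nil {
//			// Ask the syncer to refetch this chunk and ban the peer that sent it.
//			return abci.ResponseApplySnapshotChunk{
//				Result:        abci.ResponseApplySnapshotChunk_RETRY,
//				RefetchChunks: []uint32{req.Index},
//				RejectSenders: []string{req.Sender},
//			}
//		}
//		return abci.ResponseApplySnapshotChunk{Result: abci.ResponseApplySnapshotChunk_ACCEPT}
//	}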
// fetchChunks requests chunks from peers, receiving allocations from the chunk queue. Chunks
// will be received from the reactor via syncer.AddChunk() to chunkQueue.Add().
func (s *syncer) fetchChunks(ctx context.Context, snapshot *snapshot, chunks *chunkQueue) {
	var (
		next  = true
		index uint32
		err   error
	)

	for {
		if next {
			index, err = chunks.Allocate()
			if errors.Is(err, errDone) {
				// Keep checking until the context is canceled (restore is done), in case any
				// chunks need to be refetched.
				select {
				case <-ctx.Done():
					return
				default:
				}
				time.Sleep(2 * time.Second)
				continue
			}
			if err != nil {
				s.logger.Error("Failed to allocate chunk from queue", "err", err)
				return
			}
		}
		s.logger.Info("Fetching snapshot chunk", "height", snapshot.Height,
			"format", snapshot.Format, "chunk", index, "total", chunks.Size())

		ticker := time.NewTicker(s.retryTimeout)
		defer ticker.Stop()

		s.requestChunk(snapshot, index)

		select {
		case <-chunks.WaitFor(index):
			next = true

		case <-ticker.C:
			next = false

		case <-ctx.Done():
			return
		}

		ticker.Stop()
	}
}

// requestChunk requests a chunk from a peer.
func (s *syncer) requestChunk(snapshot *snapshot, chunk uint32) {
	peer := s.snapshots.GetPeer(snapshot)
	if peer == nil {
		s.logger.Error("No valid peers found for snapshot", "height", snapshot.Height,
			"format", snapshot.Format, "hash", snapshot.Hash)
		return
	}
	s.logger.Debug("Requesting snapshot chunk", "height", snapshot.Height,
		"format", snapshot.Format, "chunk", chunk, "peer", peer.ID())
	p2p.SendEnvelopeShim(peer, p2p.Envelope{ //nolint: staticcheck
		ChannelID: ChunkChannel,
		Message: &ssproto.ChunkRequest{
			Height: snapshot.Height,
			Format: snapshot.Format,
			Index:  chunk,
		},
	}, s.logger)
}

// verifyApp verifies the sync, checking the app hash, last block height and app version.
func (s *syncer) verifyApp(snapshot *snapshot, appVersion uint64) error {
	// XXX Is it okay to check with "ABCI.Info" only,
	// since AppVersion will be updated by "ABCI.InitChain" and "ABCI.EndBlock"?
	resp, err := s.connQuery.InfoSync(proxy.RequestInfo)
	if err != nil {
		return fmt.Errorf("failed to query ABCI app for appHash: %w", err)
	}

	// Sanity check that the app version in the block matches the application's own record
	// of its version.
	if resp.AppVersion != appVersion {
		// An error here most likely means that the app hasn't implemented state sync
		// or the Info call correctly.
		return fmt.Errorf("app version mismatch. Expected: %d, got: %d",
			appVersion, resp.AppVersion)
	}
	if !bytes.Equal(snapshot.trustedAppHash, resp.LastBlockAppHash) {
		s.logger.Error("appHash verification failed",
			"expected", snapshot.trustedAppHash,
			"actual", resp.LastBlockAppHash)
		return errVerifyFailed
	}
	if uint64(resp.LastBlockHeight) != snapshot.Height {
		s.logger.Error(
			"ABCI app reported unexpected last block height",
			"expected", snapshot.Height,
			"actual", resp.LastBlockHeight,
		)
		return errVerifyFailed
	}

	s.logger.Info("Verified ABCI app", "height", snapshot.Height, "appHash", snapshot.trustedAppHash)
	return nil
}
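
// To restate the invariants verifyApp enforces (a summary of the checks above,
// with resp being the ABCI Info response):
//
//	resp.AppVersion              == appVersion (state.Version.Consensus.App)
//	resp.LastBlockAppHash        == snapshot.trustedAppHash (light-client verified)
//	uint64(resp.LastBlockHeight) == snapshot.Height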