github.com/adoriasoft/tendermint@v0.34.0-dev1.0.20200722151356-96d84601a75a/statesync/syncer.go (about) 1 package statesync 2 3 import ( 4 "bytes" 5 "context" 6 "errors" 7 "fmt" 8 "time" 9 10 abci "github.com/tendermint/tendermint/abci/types" 11 "github.com/tendermint/tendermint/libs/log" 12 tmsync "github.com/tendermint/tendermint/libs/sync" 13 "github.com/tendermint/tendermint/p2p" 14 ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync" 15 "github.com/tendermint/tendermint/proxy" 16 sm "github.com/tendermint/tendermint/state" 17 "github.com/tendermint/tendermint/types" 18 ) 19 20 const ( 21 // defaultDiscoveryTime is the time to spend discovering snapshots. 22 defaultDiscoveryTime = 20 * time.Second 23 // chunkFetchers is the number of concurrent chunk fetchers to run. 24 chunkFetchers = 4 25 // chunkTimeout is the timeout while waiting for the next chunk from the chunk queue. 26 chunkTimeout = 2 * time.Minute 27 // requestTimeout is the timeout before rerequesting a chunk, possibly from a different peer. 28 chunkRequestTimeout = 10 * time.Second 29 ) 30 31 var ( 32 // errAbort is returned by Sync() when snapshot restoration is aborted. 33 errAbort = errors.New("state sync aborted") 34 // errRetrySnapshot is returned by Sync() when the snapshot should be retried. 35 errRetrySnapshot = errors.New("retry snapshot") 36 // errRejectSnapshot is returned by Sync() when the snapshot is rejected. 37 errRejectSnapshot = errors.New("snapshot was rejected") 38 // errRejectFormat is returned by Sync() when the snapshot format is rejected. 39 errRejectFormat = errors.New("snapshot format was rejected") 40 // errRejectSender is returned by Sync() when the snapshot sender is rejected. 41 errRejectSender = errors.New("snapshot sender was rejected") 42 // errVerifyFailed is returned by Sync() when app hash or last height verification fails. 43 errVerifyFailed = errors.New("verification failed") 44 // errTimeout is returned by Sync() when we've waited too long to receive a chunk. 45 errTimeout = errors.New("timed out waiting for chunk") 46 // errNoSnapshots is returned by SyncAny() if no snapshots are found and discovery is disabled. 47 errNoSnapshots = errors.New("no suitable snapshots found") 48 ) 49 50 // syncer runs a state sync against an ABCI app. Use either SyncAny() to automatically attempt to 51 // sync all snapshots in the pool (pausing to discover new ones), or Sync() to sync a specific 52 // snapshot. Snapshots and chunks are fed via AddSnapshot() and AddChunk() as appropriate. 53 type syncer struct { 54 logger log.Logger 55 stateProvider StateProvider 56 conn proxy.AppConnSnapshot 57 connQuery proxy.AppConnQuery 58 snapshots *snapshotPool 59 tempDir string 60 61 mtx tmsync.RWMutex 62 chunks *chunkQueue 63 } 64 65 // newSyncer creates a new syncer. 66 func newSyncer(logger log.Logger, conn proxy.AppConnSnapshot, connQuery proxy.AppConnQuery, 67 stateProvider StateProvider, tempDir string) *syncer { 68 return &syncer{ 69 logger: logger, 70 stateProvider: stateProvider, 71 conn: conn, 72 connQuery: connQuery, 73 snapshots: newSnapshotPool(stateProvider), 74 tempDir: tempDir, 75 } 76 } 77 78 // AddChunk adds a chunk to the chunk queue, if any. It returns false if the chunk has already 79 // been added to the queue, or an error if there's no sync in progress. 80 func (s *syncer) AddChunk(chunk *chunk) (bool, error) { 81 s.mtx.RLock() 82 defer s.mtx.RUnlock() 83 if s.chunks == nil { 84 return false, errors.New("no state sync in progress") 85 } 86 added, err := s.chunks.Add(chunk) 87 if err != nil { 88 return false, err 89 } 90 if added { 91 s.logger.Debug("Added chunk to queue", "height", chunk.Height, "format", chunk.Format, 92 "chunk", chunk.Index) 93 } else { 94 s.logger.Debug("Ignoring duplicate chunk in queue", "height", chunk.Height, "format", chunk.Format, 95 "chunk", chunk.Index) 96 } 97 return added, nil 98 } 99 100 // AddSnapshot adds a snapshot to the snapshot pool. It returns true if a new, previously unseen 101 // snapshot was accepted and added. 102 func (s *syncer) AddSnapshot(peer p2p.Peer, snapshot *snapshot) (bool, error) { 103 added, err := s.snapshots.Add(peer, snapshot) 104 if err != nil { 105 return false, err 106 } 107 if added { 108 s.logger.Info("Discovered new snapshot", "height", snapshot.Height, "format", snapshot.Format, 109 "hash", fmt.Sprintf("%X", snapshot.Hash)) 110 } 111 return added, nil 112 } 113 114 // AddPeer adds a peer to the pool. For now we just keep it simple and send a single request 115 // to discover snapshots, later we may want to do retries and stuff. 116 func (s *syncer) AddPeer(peer p2p.Peer) { 117 s.logger.Debug("Requesting snapshots from peer", "peer", peer.ID()) 118 peer.Send(SnapshotChannel, mustEncodeMsg(&ssproto.SnapshotsRequest{})) 119 } 120 121 // RemovePeer removes a peer from the pool. 122 func (s *syncer) RemovePeer(peer p2p.Peer) { 123 s.logger.Debug("Removing peer from sync", "peer", peer.ID()) 124 s.snapshots.RemovePeer(peer.ID()) 125 } 126 127 // SyncAny tries to sync any of the snapshots in the snapshot pool, waiting to discover further 128 // snapshots if none were found and discoveryTime > 0. It returns the latest state and block commit 129 // which the caller must use to bootstrap the node. 130 func (s *syncer) SyncAny(discoveryTime time.Duration) (sm.State, *types.Commit, error) { 131 if discoveryTime > 0 { 132 s.logger.Info(fmt.Sprintf("Discovering snapshots for %v", discoveryTime)) 133 time.Sleep(discoveryTime) 134 } 135 136 // The app may ask us to retry a snapshot restoration, in which case we need to reuse 137 // the snapshot and chunk queue from the previous loop iteration. 138 var ( 139 snapshot *snapshot 140 chunks *chunkQueue 141 err error 142 ) 143 for { 144 // If not nil, we're going to retry restoration of the same snapshot. 145 if snapshot == nil { 146 snapshot = s.snapshots.Best() 147 chunks = nil 148 } 149 if snapshot == nil { 150 if discoveryTime == 0 { 151 return sm.State{}, nil, errNoSnapshots 152 } 153 s.logger.Info(fmt.Sprintf("Discovering snapshots for %v", discoveryTime)) 154 time.Sleep(discoveryTime) 155 continue 156 } 157 if chunks == nil { 158 chunks, err = newChunkQueue(snapshot, s.tempDir) 159 if err != nil { 160 return sm.State{}, nil, fmt.Errorf("failed to create chunk queue: %w", err) 161 } 162 defer chunks.Close() // in case we forget to close it elsewhere 163 } 164 165 newState, commit, err := s.Sync(snapshot, chunks) 166 switch { 167 case err == nil: 168 return newState, commit, nil 169 170 case errors.Is(err, errAbort): 171 return sm.State{}, nil, err 172 173 case errors.Is(err, errRetrySnapshot): 174 chunks.RetryAll() 175 s.logger.Info("Retrying snapshot", "height", snapshot.Height, "format", snapshot.Format, 176 "hash", fmt.Sprintf("%X", snapshot.Hash)) 177 continue 178 179 case errors.Is(err, errTimeout): 180 s.snapshots.Reject(snapshot) 181 s.logger.Error("Timed out waiting for snapshot chunks, rejected snapshot", 182 "height", snapshot.Height, "format", snapshot.Format, "hash", fmt.Sprintf("%X", snapshot.Hash)) 183 184 case errors.Is(err, errRejectSnapshot): 185 s.snapshots.Reject(snapshot) 186 s.logger.Info("Snapshot rejected", "height", snapshot.Height, "format", snapshot.Format, 187 "hash", fmt.Sprintf("%X", snapshot.Hash)) 188 189 case errors.Is(err, errRejectFormat): 190 s.snapshots.RejectFormat(snapshot.Format) 191 s.logger.Info("Snapshot format rejected", "format", snapshot.Format) 192 193 case errors.Is(err, errRejectSender): 194 s.logger.Info("Snapshot senders rejected", "height", snapshot.Height, "format", snapshot.Format, 195 "hash", fmt.Sprintf("%X", snapshot.Hash)) 196 for _, peer := range s.snapshots.GetPeers(snapshot) { 197 s.snapshots.RejectPeer(peer.ID()) 198 s.logger.Info("Snapshot sender rejected", "peer", peer.ID()) 199 } 200 201 default: 202 return sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err) 203 } 204 205 // Discard snapshot and chunks for next iteration 206 err = chunks.Close() 207 if err != nil { 208 s.logger.Error("Failed to clean up chunk queue", "err", err) 209 } 210 snapshot = nil 211 chunks = nil 212 } 213 } 214 215 // Sync executes a sync for a specific snapshot, returning the latest state and block commit which 216 // the caller must use to bootstrap the node. 217 func (s *syncer) Sync(snapshot *snapshot, chunks *chunkQueue) (sm.State, *types.Commit, error) { 218 s.mtx.Lock() 219 if s.chunks != nil { 220 s.mtx.Unlock() 221 return sm.State{}, nil, errors.New("a state sync is already in progress") 222 } 223 s.chunks = chunks 224 s.mtx.Unlock() 225 defer func() { 226 s.mtx.Lock() 227 s.chunks = nil 228 s.mtx.Unlock() 229 }() 230 231 // Offer snapshot to ABCI app. 232 err := s.offerSnapshot(snapshot) 233 if err != nil { 234 return sm.State{}, nil, err 235 } 236 237 // Spawn chunk fetchers. They will terminate when the chunk queue is closed or context cancelled. 238 ctx, cancel := context.WithCancel(context.Background()) 239 defer cancel() 240 for i := int32(0); i < chunkFetchers; i++ { 241 go s.fetchChunks(ctx, snapshot, chunks) 242 } 243 244 // Optimistically build new state, so we don't discover any light client failures at the end. 245 state, err := s.stateProvider.State(snapshot.Height) 246 if err != nil { 247 return sm.State{}, nil, fmt.Errorf("failed to build new state: %w", err) 248 } 249 commit, err := s.stateProvider.Commit(snapshot.Height) 250 if err != nil { 251 return sm.State{}, nil, fmt.Errorf("failed to fetch commit: %w", err) 252 } 253 254 // Restore snapshot 255 err = s.applyChunks(chunks) 256 if err != nil { 257 return sm.State{}, nil, err 258 } 259 260 // Verify app and update app version 261 appVersion, err := s.verifyApp(snapshot) 262 if err != nil { 263 return sm.State{}, nil, err 264 } 265 state.Version.Consensus.App = appVersion 266 267 // Done! 🎉 268 s.logger.Info("Snapshot restored", "height", snapshot.Height, "format", snapshot.Format, 269 "hash", fmt.Sprintf("%X", snapshot.Hash)) 270 271 return state, commit, nil 272 } 273 274 // offerSnapshot offers a snapshot to the app. It returns various errors depending on the app's 275 // response, or nil if the snapshot was accepted. 276 func (s *syncer) offerSnapshot(snapshot *snapshot) error { 277 s.logger.Info("Offering snapshot to ABCI app", "height", snapshot.Height, 278 "format", snapshot.Format, "hash", fmt.Sprintf("%X", snapshot.Hash)) 279 resp, err := s.conn.OfferSnapshotSync(abci.RequestOfferSnapshot{ 280 Snapshot: &abci.Snapshot{ 281 Height: snapshot.Height, 282 Format: snapshot.Format, 283 Chunks: snapshot.Chunks, 284 Hash: snapshot.Hash, 285 Metadata: snapshot.Metadata, 286 }, 287 AppHash: snapshot.trustedAppHash, 288 }) 289 if err != nil { 290 return fmt.Errorf("failed to offer snapshot: %w", err) 291 } 292 switch resp.Result { 293 case abci.ResponseOfferSnapshot_ACCEPT: 294 s.logger.Info("Snapshot accepted, restoring", "height", snapshot.Height, 295 "format", snapshot.Format, "hash", fmt.Sprintf("%X", snapshot.Hash)) 296 return nil 297 case abci.ResponseOfferSnapshot_ABORT: 298 return errAbort 299 case abci.ResponseOfferSnapshot_REJECT: 300 return errRejectSnapshot 301 case abci.ResponseOfferSnapshot_REJECT_FORMAT: 302 return errRejectFormat 303 case abci.ResponseOfferSnapshot_REJECT_SENDER: 304 return errRejectSender 305 default: 306 return fmt.Errorf("unknown ResponseOfferSnapshot result %v", resp.Result) 307 } 308 } 309 310 // applyChunks applies chunks to the app. It returns various errors depending on the app's 311 // response, or nil once the snapshot is fully restored. 312 func (s *syncer) applyChunks(chunks *chunkQueue) error { 313 for { 314 chunk, err := chunks.Next() 315 if err == errDone { 316 return nil 317 } else if err != nil { 318 return fmt.Errorf("failed to fetch chunk: %w", err) 319 } 320 321 resp, err := s.conn.ApplySnapshotChunkSync(abci.RequestApplySnapshotChunk{ 322 Index: chunk.Index, 323 Chunk: chunk.Chunk, 324 Sender: string(chunk.Sender), 325 }) 326 if err != nil { 327 return fmt.Errorf("failed to apply chunk %v: %w", chunk.Index, err) 328 } 329 s.logger.Info("Applied snapshot chunk to ABCI app", "height", chunk.Height, 330 "format", chunk.Format, "chunk", chunk.Index, "total", chunks.Size()) 331 332 // Discard and refetch any chunks as requested by the app 333 for _, index := range resp.RefetchChunks { 334 err := chunks.Discard(index) 335 if err != nil { 336 return fmt.Errorf("failed to discard chunk %v: %w", index, err) 337 } 338 } 339 340 // Reject any senders as requested by the app 341 for _, sender := range resp.RejectSenders { 342 if sender != "" { 343 s.snapshots.RejectPeer(p2p.ID(sender)) 344 err := chunks.DiscardSender(p2p.ID(sender)) 345 if err != nil { 346 return fmt.Errorf("failed to reject sender: %w", err) 347 } 348 } 349 } 350 351 switch resp.Result { 352 case abci.ResponseApplySnapshotChunk_ACCEPT: 353 case abci.ResponseApplySnapshotChunk_ABORT: 354 return errAbort 355 case abci.ResponseApplySnapshotChunk_RETRY: 356 chunks.Retry(chunk.Index) 357 case abci.ResponseApplySnapshotChunk_RETRY_SNAPSHOT: 358 return errRetrySnapshot 359 case abci.ResponseApplySnapshotChunk_REJECT_SNAPSHOT: 360 return errRejectSnapshot 361 default: 362 return fmt.Errorf("unknown ResponseApplySnapshotChunk result %v", resp.Result) 363 } 364 } 365 } 366 367 // fetchChunks requests chunks from peers, receiving allocations from the chunk queue. Chunks 368 // will be received from the reactor via syncer.AddChunks() to chunkQueue.Add(). 369 func (s *syncer) fetchChunks(ctx context.Context, snapshot *snapshot, chunks *chunkQueue) { 370 for { 371 index, err := chunks.Allocate() 372 if err == errDone { 373 // Keep checking until the context is cancelled (restore is done), in case any 374 // chunks need to be refetched. 375 select { 376 case <-ctx.Done(): 377 return 378 default: 379 } 380 time.Sleep(2 * time.Second) 381 continue 382 } 383 if err != nil { 384 s.logger.Error("Failed to allocate chunk from queue", "err", err) 385 return 386 } 387 s.logger.Info("Fetching snapshot chunk", "height", snapshot.Height, 388 "format", snapshot.Format, "chunk", index, "total", chunks.Size()) 389 390 ticker := time.NewTicker(chunkRequestTimeout) 391 defer ticker.Stop() 392 s.requestChunk(snapshot, index) 393 select { 394 case <-chunks.WaitFor(index): 395 case <-ticker.C: 396 s.requestChunk(snapshot, index) 397 case <-ctx.Done(): 398 return 399 } 400 ticker.Stop() 401 } 402 } 403 404 // requestChunk requests a chunk from a peer. 405 func (s *syncer) requestChunk(snapshot *snapshot, chunk uint32) { 406 peer := s.snapshots.GetPeer(snapshot) 407 if peer == nil { 408 s.logger.Error("No valid peers found for snapshot", "height", snapshot.Height, 409 "format", snapshot.Format, "hash", snapshot.Hash) 410 return 411 } 412 s.logger.Debug("Requesting snapshot chunk", "height", snapshot.Height, 413 "format", snapshot.Format, "chunk", chunk, "peer", peer.ID()) 414 peer.Send(ChunkChannel, mustEncodeMsg(&ssproto.ChunkRequest{ 415 Height: snapshot.Height, 416 Format: snapshot.Format, 417 Index: chunk, 418 })) 419 } 420 421 // verifyApp verifies the sync, checking the app hash and last block height. It returns the 422 // app version, which should be returned as part of the initial state. 423 func (s *syncer) verifyApp(snapshot *snapshot) (uint64, error) { 424 resp, err := s.connQuery.InfoSync(proxy.RequestInfo) 425 if err != nil { 426 return 0, fmt.Errorf("failed to query ABCI app for appHash: %w", err) 427 } 428 if !bytes.Equal(snapshot.trustedAppHash, resp.LastBlockAppHash) { 429 s.logger.Error("appHash verification failed", 430 "expected", fmt.Sprintf("%X", snapshot.trustedAppHash), 431 "actual", fmt.Sprintf("%X", resp.LastBlockAppHash)) 432 return 0, errVerifyFailed 433 } 434 if uint64(resp.LastBlockHeight) != snapshot.Height { 435 s.logger.Error("ABCI app reported unexpected last block height", 436 "expected", snapshot.Height, "actual", resp.LastBlockHeight) 437 return 0, errVerifyFailed 438 } 439 s.logger.Info("Verified ABCI app", "height", snapshot.Height, 440 "appHash", fmt.Sprintf("%X", snapshot.trustedAppHash)) 441 return resp.AppVersion, nil 442 }