github.com/adoriasoft/tendermint@v0.34.0-dev1.0.20200722151356-96d84601a75a/statesync/syncer.go (about)

     1  package statesync
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"errors"
     7  	"fmt"
     8  	"time"
     9  
    10  	abci "github.com/tendermint/tendermint/abci/types"
    11  	"github.com/tendermint/tendermint/libs/log"
    12  	tmsync "github.com/tendermint/tendermint/libs/sync"
    13  	"github.com/tendermint/tendermint/p2p"
    14  	ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync"
    15  	"github.com/tendermint/tendermint/proxy"
    16  	sm "github.com/tendermint/tendermint/state"
    17  	"github.com/tendermint/tendermint/types"
    18  )
    19  
    20  const (
    21  	// defaultDiscoveryTime is the time to spend discovering snapshots.
    22  	defaultDiscoveryTime = 20 * time.Second
    23  	// chunkFetchers is the number of concurrent chunk fetchers to run.
    24  	chunkFetchers = 4
    25  	// chunkTimeout is the timeout while waiting for the next chunk from the chunk queue.
    26  	chunkTimeout = 2 * time.Minute
    27  	// requestTimeout is the timeout before rerequesting a chunk, possibly from a different peer.
    28  	chunkRequestTimeout = 10 * time.Second
    29  )
    30  
    31  var (
    32  	// errAbort is returned by Sync() when snapshot restoration is aborted.
    33  	errAbort = errors.New("state sync aborted")
    34  	// errRetrySnapshot is returned by Sync() when the snapshot should be retried.
    35  	errRetrySnapshot = errors.New("retry snapshot")
    36  	// errRejectSnapshot is returned by Sync() when the snapshot is rejected.
    37  	errRejectSnapshot = errors.New("snapshot was rejected")
    38  	// errRejectFormat is returned by Sync() when the snapshot format is rejected.
    39  	errRejectFormat = errors.New("snapshot format was rejected")
    40  	// errRejectSender is returned by Sync() when the snapshot sender is rejected.
    41  	errRejectSender = errors.New("snapshot sender was rejected")
    42  	// errVerifyFailed is returned by Sync() when app hash or last height verification fails.
    43  	errVerifyFailed = errors.New("verification failed")
    44  	// errTimeout is returned by Sync() when we've waited too long to receive a chunk.
    45  	errTimeout = errors.New("timed out waiting for chunk")
    46  	// errNoSnapshots is returned by SyncAny() if no snapshots are found and discovery is disabled.
    47  	errNoSnapshots = errors.New("no suitable snapshots found")
    48  )
    49  
    50  // syncer runs a state sync against an ABCI app. Use either SyncAny() to automatically attempt to
    51  // sync all snapshots in the pool (pausing to discover new ones), or Sync() to sync a specific
    52  // snapshot. Snapshots and chunks are fed via AddSnapshot() and AddChunk() as appropriate.
    53  type syncer struct {
    54  	logger        log.Logger
    55  	stateProvider StateProvider
    56  	conn          proxy.AppConnSnapshot
    57  	connQuery     proxy.AppConnQuery
    58  	snapshots     *snapshotPool
    59  	tempDir       string
    60  
    61  	mtx    tmsync.RWMutex
    62  	chunks *chunkQueue
    63  }
    64  
    65  // newSyncer creates a new syncer.
    66  func newSyncer(logger log.Logger, conn proxy.AppConnSnapshot, connQuery proxy.AppConnQuery,
    67  	stateProvider StateProvider, tempDir string) *syncer {
    68  	return &syncer{
    69  		logger:        logger,
    70  		stateProvider: stateProvider,
    71  		conn:          conn,
    72  		connQuery:     connQuery,
    73  		snapshots:     newSnapshotPool(stateProvider),
    74  		tempDir:       tempDir,
    75  	}
    76  }
    77  
    78  // AddChunk adds a chunk to the chunk queue, if any. It returns false if the chunk has already
    79  // been added to the queue, or an error if there's no sync in progress.
    80  func (s *syncer) AddChunk(chunk *chunk) (bool, error) {
    81  	s.mtx.RLock()
    82  	defer s.mtx.RUnlock()
    83  	if s.chunks == nil {
    84  		return false, errors.New("no state sync in progress")
    85  	}
    86  	added, err := s.chunks.Add(chunk)
    87  	if err != nil {
    88  		return false, err
    89  	}
    90  	if added {
    91  		s.logger.Debug("Added chunk to queue", "height", chunk.Height, "format", chunk.Format,
    92  			"chunk", chunk.Index)
    93  	} else {
    94  		s.logger.Debug("Ignoring duplicate chunk in queue", "height", chunk.Height, "format", chunk.Format,
    95  			"chunk", chunk.Index)
    96  	}
    97  	return added, nil
    98  }
    99  
   100  // AddSnapshot adds a snapshot to the snapshot pool. It returns true if a new, previously unseen
   101  // snapshot was accepted and added.
   102  func (s *syncer) AddSnapshot(peer p2p.Peer, snapshot *snapshot) (bool, error) {
   103  	added, err := s.snapshots.Add(peer, snapshot)
   104  	if err != nil {
   105  		return false, err
   106  	}
   107  	if added {
   108  		s.logger.Info("Discovered new snapshot", "height", snapshot.Height, "format", snapshot.Format,
   109  			"hash", fmt.Sprintf("%X", snapshot.Hash))
   110  	}
   111  	return added, nil
   112  }
   113  
   114  // AddPeer adds a peer to the pool. For now we just keep it simple and send a single request
   115  // to discover snapshots, later we may want to do retries and stuff.
   116  func (s *syncer) AddPeer(peer p2p.Peer) {
   117  	s.logger.Debug("Requesting snapshots from peer", "peer", peer.ID())
   118  	peer.Send(SnapshotChannel, mustEncodeMsg(&ssproto.SnapshotsRequest{}))
   119  }
   120  
   121  // RemovePeer removes a peer from the pool.
   122  func (s *syncer) RemovePeer(peer p2p.Peer) {
   123  	s.logger.Debug("Removing peer from sync", "peer", peer.ID())
   124  	s.snapshots.RemovePeer(peer.ID())
   125  }
   126  
   127  // SyncAny tries to sync any of the snapshots in the snapshot pool, waiting to discover further
   128  // snapshots if none were found and discoveryTime > 0. It returns the latest state and block commit
   129  // which the caller must use to bootstrap the node.
   130  func (s *syncer) SyncAny(discoveryTime time.Duration) (sm.State, *types.Commit, error) {
   131  	if discoveryTime > 0 {
   132  		s.logger.Info(fmt.Sprintf("Discovering snapshots for %v", discoveryTime))
   133  		time.Sleep(discoveryTime)
   134  	}
   135  
   136  	// The app may ask us to retry a snapshot restoration, in which case we need to reuse
   137  	// the snapshot and chunk queue from the previous loop iteration.
   138  	var (
   139  		snapshot *snapshot
   140  		chunks   *chunkQueue
   141  		err      error
   142  	)
   143  	for {
   144  		// If not nil, we're going to retry restoration of the same snapshot.
   145  		if snapshot == nil {
   146  			snapshot = s.snapshots.Best()
   147  			chunks = nil
   148  		}
   149  		if snapshot == nil {
   150  			if discoveryTime == 0 {
   151  				return sm.State{}, nil, errNoSnapshots
   152  			}
   153  			s.logger.Info(fmt.Sprintf("Discovering snapshots for %v", discoveryTime))
   154  			time.Sleep(discoveryTime)
   155  			continue
   156  		}
   157  		if chunks == nil {
   158  			chunks, err = newChunkQueue(snapshot, s.tempDir)
   159  			if err != nil {
   160  				return sm.State{}, nil, fmt.Errorf("failed to create chunk queue: %w", err)
   161  			}
   162  			defer chunks.Close() // in case we forget to close it elsewhere
   163  		}
   164  
   165  		newState, commit, err := s.Sync(snapshot, chunks)
   166  		switch {
   167  		case err == nil:
   168  			return newState, commit, nil
   169  
   170  		case errors.Is(err, errAbort):
   171  			return sm.State{}, nil, err
   172  
   173  		case errors.Is(err, errRetrySnapshot):
   174  			chunks.RetryAll()
   175  			s.logger.Info("Retrying snapshot", "height", snapshot.Height, "format", snapshot.Format,
   176  				"hash", fmt.Sprintf("%X", snapshot.Hash))
   177  			continue
   178  
   179  		case errors.Is(err, errTimeout):
   180  			s.snapshots.Reject(snapshot)
   181  			s.logger.Error("Timed out waiting for snapshot chunks, rejected snapshot",
   182  				"height", snapshot.Height, "format", snapshot.Format, "hash", fmt.Sprintf("%X", snapshot.Hash))
   183  
   184  		case errors.Is(err, errRejectSnapshot):
   185  			s.snapshots.Reject(snapshot)
   186  			s.logger.Info("Snapshot rejected", "height", snapshot.Height, "format", snapshot.Format,
   187  				"hash", fmt.Sprintf("%X", snapshot.Hash))
   188  
   189  		case errors.Is(err, errRejectFormat):
   190  			s.snapshots.RejectFormat(snapshot.Format)
   191  			s.logger.Info("Snapshot format rejected", "format", snapshot.Format)
   192  
   193  		case errors.Is(err, errRejectSender):
   194  			s.logger.Info("Snapshot senders rejected", "height", snapshot.Height, "format", snapshot.Format,
   195  				"hash", fmt.Sprintf("%X", snapshot.Hash))
   196  			for _, peer := range s.snapshots.GetPeers(snapshot) {
   197  				s.snapshots.RejectPeer(peer.ID())
   198  				s.logger.Info("Snapshot sender rejected", "peer", peer.ID())
   199  			}
   200  
   201  		default:
   202  			return sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err)
   203  		}
   204  
   205  		// Discard snapshot and chunks for next iteration
   206  		err = chunks.Close()
   207  		if err != nil {
   208  			s.logger.Error("Failed to clean up chunk queue", "err", err)
   209  		}
   210  		snapshot = nil
   211  		chunks = nil
   212  	}
   213  }
   214  
   215  // Sync executes a sync for a specific snapshot, returning the latest state and block commit which
   216  // the caller must use to bootstrap the node.
   217  func (s *syncer) Sync(snapshot *snapshot, chunks *chunkQueue) (sm.State, *types.Commit, error) {
   218  	s.mtx.Lock()
   219  	if s.chunks != nil {
   220  		s.mtx.Unlock()
   221  		return sm.State{}, nil, errors.New("a state sync is already in progress")
   222  	}
   223  	s.chunks = chunks
   224  	s.mtx.Unlock()
   225  	defer func() {
   226  		s.mtx.Lock()
   227  		s.chunks = nil
   228  		s.mtx.Unlock()
   229  	}()
   230  
   231  	// Offer snapshot to ABCI app.
   232  	err := s.offerSnapshot(snapshot)
   233  	if err != nil {
   234  		return sm.State{}, nil, err
   235  	}
   236  
   237  	// Spawn chunk fetchers. They will terminate when the chunk queue is closed or context cancelled.
   238  	ctx, cancel := context.WithCancel(context.Background())
   239  	defer cancel()
   240  	for i := int32(0); i < chunkFetchers; i++ {
   241  		go s.fetchChunks(ctx, snapshot, chunks)
   242  	}
   243  
   244  	// Optimistically build new state, so we don't discover any light client failures at the end.
   245  	state, err := s.stateProvider.State(snapshot.Height)
   246  	if err != nil {
   247  		return sm.State{}, nil, fmt.Errorf("failed to build new state: %w", err)
   248  	}
   249  	commit, err := s.stateProvider.Commit(snapshot.Height)
   250  	if err != nil {
   251  		return sm.State{}, nil, fmt.Errorf("failed to fetch commit: %w", err)
   252  	}
   253  
   254  	// Restore snapshot
   255  	err = s.applyChunks(chunks)
   256  	if err != nil {
   257  		return sm.State{}, nil, err
   258  	}
   259  
   260  	// Verify app and update app version
   261  	appVersion, err := s.verifyApp(snapshot)
   262  	if err != nil {
   263  		return sm.State{}, nil, err
   264  	}
   265  	state.Version.Consensus.App = appVersion
   266  
   267  	// Done! 🎉
   268  	s.logger.Info("Snapshot restored", "height", snapshot.Height, "format", snapshot.Format,
   269  		"hash", fmt.Sprintf("%X", snapshot.Hash))
   270  
   271  	return state, commit, nil
   272  }
   273  
   274  // offerSnapshot offers a snapshot to the app. It returns various errors depending on the app's
   275  // response, or nil if the snapshot was accepted.
   276  func (s *syncer) offerSnapshot(snapshot *snapshot) error {
   277  	s.logger.Info("Offering snapshot to ABCI app", "height", snapshot.Height,
   278  		"format", snapshot.Format, "hash", fmt.Sprintf("%X", snapshot.Hash))
   279  	resp, err := s.conn.OfferSnapshotSync(abci.RequestOfferSnapshot{
   280  		Snapshot: &abci.Snapshot{
   281  			Height:   snapshot.Height,
   282  			Format:   snapshot.Format,
   283  			Chunks:   snapshot.Chunks,
   284  			Hash:     snapshot.Hash,
   285  			Metadata: snapshot.Metadata,
   286  		},
   287  		AppHash: snapshot.trustedAppHash,
   288  	})
   289  	if err != nil {
   290  		return fmt.Errorf("failed to offer snapshot: %w", err)
   291  	}
   292  	switch resp.Result {
   293  	case abci.ResponseOfferSnapshot_ACCEPT:
   294  		s.logger.Info("Snapshot accepted, restoring", "height", snapshot.Height,
   295  			"format", snapshot.Format, "hash", fmt.Sprintf("%X", snapshot.Hash))
   296  		return nil
   297  	case abci.ResponseOfferSnapshot_ABORT:
   298  		return errAbort
   299  	case abci.ResponseOfferSnapshot_REJECT:
   300  		return errRejectSnapshot
   301  	case abci.ResponseOfferSnapshot_REJECT_FORMAT:
   302  		return errRejectFormat
   303  	case abci.ResponseOfferSnapshot_REJECT_SENDER:
   304  		return errRejectSender
   305  	default:
   306  		return fmt.Errorf("unknown ResponseOfferSnapshot result %v", resp.Result)
   307  	}
   308  }
   309  
   310  // applyChunks applies chunks to the app. It returns various errors depending on the app's
   311  // response, or nil once the snapshot is fully restored.
   312  func (s *syncer) applyChunks(chunks *chunkQueue) error {
   313  	for {
   314  		chunk, err := chunks.Next()
   315  		if err == errDone {
   316  			return nil
   317  		} else if err != nil {
   318  			return fmt.Errorf("failed to fetch chunk: %w", err)
   319  		}
   320  
   321  		resp, err := s.conn.ApplySnapshotChunkSync(abci.RequestApplySnapshotChunk{
   322  			Index:  chunk.Index,
   323  			Chunk:  chunk.Chunk,
   324  			Sender: string(chunk.Sender),
   325  		})
   326  		if err != nil {
   327  			return fmt.Errorf("failed to apply chunk %v: %w", chunk.Index, err)
   328  		}
   329  		s.logger.Info("Applied snapshot chunk to ABCI app", "height", chunk.Height,
   330  			"format", chunk.Format, "chunk", chunk.Index, "total", chunks.Size())
   331  
   332  		// Discard and refetch any chunks as requested by the app
   333  		for _, index := range resp.RefetchChunks {
   334  			err := chunks.Discard(index)
   335  			if err != nil {
   336  				return fmt.Errorf("failed to discard chunk %v: %w", index, err)
   337  			}
   338  		}
   339  
   340  		// Reject any senders as requested by the app
   341  		for _, sender := range resp.RejectSenders {
   342  			if sender != "" {
   343  				s.snapshots.RejectPeer(p2p.ID(sender))
   344  				err := chunks.DiscardSender(p2p.ID(sender))
   345  				if err != nil {
   346  					return fmt.Errorf("failed to reject sender: %w", err)
   347  				}
   348  			}
   349  		}
   350  
   351  		switch resp.Result {
   352  		case abci.ResponseApplySnapshotChunk_ACCEPT:
   353  		case abci.ResponseApplySnapshotChunk_ABORT:
   354  			return errAbort
   355  		case abci.ResponseApplySnapshotChunk_RETRY:
   356  			chunks.Retry(chunk.Index)
   357  		case abci.ResponseApplySnapshotChunk_RETRY_SNAPSHOT:
   358  			return errRetrySnapshot
   359  		case abci.ResponseApplySnapshotChunk_REJECT_SNAPSHOT:
   360  			return errRejectSnapshot
   361  		default:
   362  			return fmt.Errorf("unknown ResponseApplySnapshotChunk result %v", resp.Result)
   363  		}
   364  	}
   365  }
   366  
   367  // fetchChunks requests chunks from peers, receiving allocations from the chunk queue. Chunks
   368  // will be received from the reactor via syncer.AddChunks() to chunkQueue.Add().
   369  func (s *syncer) fetchChunks(ctx context.Context, snapshot *snapshot, chunks *chunkQueue) {
   370  	for {
   371  		index, err := chunks.Allocate()
   372  		if err == errDone {
   373  			// Keep checking until the context is cancelled (restore is done), in case any
   374  			// chunks need to be refetched.
   375  			select {
   376  			case <-ctx.Done():
   377  				return
   378  			default:
   379  			}
   380  			time.Sleep(2 * time.Second)
   381  			continue
   382  		}
   383  		if err != nil {
   384  			s.logger.Error("Failed to allocate chunk from queue", "err", err)
   385  			return
   386  		}
   387  		s.logger.Info("Fetching snapshot chunk", "height", snapshot.Height,
   388  			"format", snapshot.Format, "chunk", index, "total", chunks.Size())
   389  
   390  		ticker := time.NewTicker(chunkRequestTimeout)
   391  		defer ticker.Stop()
   392  		s.requestChunk(snapshot, index)
   393  		select {
   394  		case <-chunks.WaitFor(index):
   395  		case <-ticker.C:
   396  			s.requestChunk(snapshot, index)
   397  		case <-ctx.Done():
   398  			return
   399  		}
   400  		ticker.Stop()
   401  	}
   402  }
   403  
   404  // requestChunk requests a chunk from a peer.
   405  func (s *syncer) requestChunk(snapshot *snapshot, chunk uint32) {
   406  	peer := s.snapshots.GetPeer(snapshot)
   407  	if peer == nil {
   408  		s.logger.Error("No valid peers found for snapshot", "height", snapshot.Height,
   409  			"format", snapshot.Format, "hash", snapshot.Hash)
   410  		return
   411  	}
   412  	s.logger.Debug("Requesting snapshot chunk", "height", snapshot.Height,
   413  		"format", snapshot.Format, "chunk", chunk, "peer", peer.ID())
   414  	peer.Send(ChunkChannel, mustEncodeMsg(&ssproto.ChunkRequest{
   415  		Height: snapshot.Height,
   416  		Format: snapshot.Format,
   417  		Index:  chunk,
   418  	}))
   419  }
   420  
   421  // verifyApp verifies the sync, checking the app hash and last block height. It returns the
   422  // app version, which should be returned as part of the initial state.
   423  func (s *syncer) verifyApp(snapshot *snapshot) (uint64, error) {
   424  	resp, err := s.connQuery.InfoSync(proxy.RequestInfo)
   425  	if err != nil {
   426  		return 0, fmt.Errorf("failed to query ABCI app for appHash: %w", err)
   427  	}
   428  	if !bytes.Equal(snapshot.trustedAppHash, resp.LastBlockAppHash) {
   429  		s.logger.Error("appHash verification failed",
   430  			"expected", fmt.Sprintf("%X", snapshot.trustedAppHash),
   431  			"actual", fmt.Sprintf("%X", resp.LastBlockAppHash))
   432  		return 0, errVerifyFailed
   433  	}
   434  	if uint64(resp.LastBlockHeight) != snapshot.Height {
   435  		s.logger.Error("ABCI app reported unexpected last block height",
   436  			"expected", snapshot.Height, "actual", resp.LastBlockHeight)
   437  		return 0, errVerifyFailed
   438  	}
   439  	s.logger.Info("Verified ABCI app", "height", snapshot.Height,
   440  		"appHash", fmt.Sprintf("%X", snapshot.trustedAppHash))
   441  	return resp.AppVersion, nil
   442  }