github.com/vipernet-xyz/tm@v0.34.24/statesync/syncer.go (about)

     1  package statesync
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"errors"
     7  	"fmt"
     8  	"time"
     9  
    10  	abci "github.com/vipernet-xyz/tm/abci/types"
    11  	"github.com/vipernet-xyz/tm/config"
    12  	"github.com/vipernet-xyz/tm/libs/log"
    13  	tmsync "github.com/vipernet-xyz/tm/libs/sync"
    14  	"github.com/vipernet-xyz/tm/light"
    15  	"github.com/vipernet-xyz/tm/p2p"
    16  	ssproto "github.com/vipernet-xyz/tm/proto/tendermint/statesync"
    17  	"github.com/vipernet-xyz/tm/proxy"
    18  	sm "github.com/vipernet-xyz/tm/state"
    19  	"github.com/vipernet-xyz/tm/types"
    20  )
    21  
    22  const (
    23  	// chunkTimeout is the timeout while waiting for the next chunk from the chunk queue.
    24  	chunkTimeout = 2 * time.Minute
    25  
    26  	// minimumDiscoveryTime is the lowest allowable time for a
    27  	// SyncAny discovery time.
    28  	minimumDiscoveryTime = 5 * time.Second
    29  )
    30  
    31  var (
    32  	// errAbort is returned by Sync() when snapshot restoration is aborted.
    33  	errAbort = errors.New("state sync aborted")
    34  	// errRetrySnapshot is returned by Sync() when the snapshot should be retried.
    35  	errRetrySnapshot = errors.New("retry snapshot")
    36  	// errRejectSnapshot is returned by Sync() when the snapshot is rejected.
    37  	errRejectSnapshot = errors.New("snapshot was rejected")
    38  	// errRejectFormat is returned by Sync() when the snapshot format is rejected.
    39  	errRejectFormat = errors.New("snapshot format was rejected")
    40  	// errRejectSender is returned by Sync() when the snapshot sender is rejected.
    41  	errRejectSender = errors.New("snapshot sender was rejected")
    42  	// errVerifyFailed is returned by Sync() when app hash or last height verification fails.
    43  	errVerifyFailed = errors.New("verification failed")
    44  	// errTimeout is returned by Sync() when we've waited too long to receive a chunk.
    45  	errTimeout = errors.New("timed out waiting for chunk")
    46  	// errNoSnapshots is returned by SyncAny() if no snapshots are found and discovery is disabled.
    47  	errNoSnapshots = errors.New("no suitable snapshots found")
    48  )
    49  
    50  // syncer runs a state sync against an ABCI app. Use either SyncAny() to automatically attempt to
    51  // sync all snapshots in the pool (pausing to discover new ones), or Sync() to sync a specific
    52  // snapshot. Snapshots and chunks are fed via AddSnapshot() and AddChunk() as appropriate.
    53  type syncer struct {
    54  	logger        log.Logger
    55  	stateProvider StateProvider
    56  	conn          proxy.AppConnSnapshot
    57  	connQuery     proxy.AppConnQuery
    58  	snapshots     *snapshotPool
    59  	tempDir       string
    60  	chunkFetchers int32
    61  	retryTimeout  time.Duration
    62  
    63  	mtx    tmsync.RWMutex
    64  	chunks *chunkQueue
    65  }
    66  
    67  // newSyncer creates a new syncer.
    68  func newSyncer(
    69  	cfg config.StateSyncConfig,
    70  	logger log.Logger,
    71  	conn proxy.AppConnSnapshot,
    72  	connQuery proxy.AppConnQuery,
    73  	stateProvider StateProvider,
    74  	tempDir string,
    75  ) *syncer {
    76  
    77  	return &syncer{
    78  		logger:        logger,
    79  		stateProvider: stateProvider,
    80  		conn:          conn,
    81  		connQuery:     connQuery,
    82  		snapshots:     newSnapshotPool(),
    83  		tempDir:       tempDir,
    84  		chunkFetchers: cfg.ChunkFetchers,
    85  		retryTimeout:  cfg.ChunkRequestTimeout,
    86  	}
    87  }
    88  
    89  // AddChunk adds a chunk to the chunk queue, if any. It returns false if the chunk has already
    90  // been added to the queue, or an error if there's no sync in progress.
    91  func (s *syncer) AddChunk(chunk *chunk) (bool, error) {
    92  	s.mtx.RLock()
    93  	defer s.mtx.RUnlock()
    94  	if s.chunks == nil {
    95  		return false, errors.New("no state sync in progress")
    96  	}
    97  	added, err := s.chunks.Add(chunk)
    98  	if err != nil {
    99  		return false, err
   100  	}
   101  	if added {
   102  		s.logger.Debug("Added chunk to queue", "height", chunk.Height, "format", chunk.Format,
   103  			"chunk", chunk.Index)
   104  	} else {
   105  		s.logger.Debug("Ignoring duplicate chunk in queue", "height", chunk.Height, "format", chunk.Format,
   106  			"chunk", chunk.Index)
   107  	}
   108  	return added, nil
   109  }
   110  
   111  // AddSnapshot adds a snapshot to the snapshot pool. It returns true if a new, previously unseen
   112  // snapshot was accepted and added.
   113  func (s *syncer) AddSnapshot(peer p2p.Peer, snapshot *snapshot) (bool, error) {
   114  	added, err := s.snapshots.Add(peer, snapshot)
   115  	if err != nil {
   116  		return false, err
   117  	}
   118  	if added {
   119  		s.logger.Info("Discovered new snapshot", "height", snapshot.Height, "format", snapshot.Format,
   120  			"hash", snapshot.Hash)
   121  	}
   122  	return added, nil
   123  }
   124  
   125  // AddPeer adds a peer to the pool. For now we just keep it simple and send a single request
   126  // to discover snapshots, later we may want to do retries and stuff.
   127  func (s *syncer) AddPeer(peer p2p.Peer) {
   128  	s.logger.Debug("Requesting snapshots from peer", "peer", peer.ID())
   129  	e := p2p.Envelope{
   130  		ChannelID: SnapshotChannel,
   131  		Message:   &ssproto.SnapshotsRequest{},
   132  	}
   133  	p2p.SendEnvelopeShim(peer, e, s.logger) //nolint: staticcheck
   134  }
   135  
   136  // RemovePeer removes a peer from the pool.
   137  func (s *syncer) RemovePeer(peer p2p.Peer) {
   138  	s.logger.Debug("Removing peer from sync", "peer", peer.ID())
   139  	s.snapshots.RemovePeer(peer.ID())
   140  }
   141  
   142  // SyncAny tries to sync any of the snapshots in the snapshot pool, waiting to discover further
   143  // snapshots if none were found and discoveryTime > 0. It returns the latest state and block commit
   144  // which the caller must use to bootstrap the node.
   145  func (s *syncer) SyncAny(discoveryTime time.Duration, retryHook func()) (sm.State, *types.Commit, error) {
   146  	if discoveryTime != 0 && discoveryTime < minimumDiscoveryTime {
   147  		discoveryTime = 5 * minimumDiscoveryTime
   148  	}
   149  
   150  	if discoveryTime > 0 {
   151  		s.logger.Info("sync any", "msg", log.NewLazySprintf("Discovering snapshots for %v", discoveryTime))
   152  		time.Sleep(discoveryTime)
   153  	}
   154  
   155  	// The app may ask us to retry a snapshot restoration, in which case we need to reuse
   156  	// the snapshot and chunk queue from the previous loop iteration.
   157  	var (
   158  		snapshot *snapshot
   159  		chunks   *chunkQueue
   160  		err      error
   161  	)
   162  	for {
   163  		// If not nil, we're going to retry restoration of the same snapshot.
   164  		if snapshot == nil {
   165  			snapshot = s.snapshots.Best()
   166  			chunks = nil
   167  		}
   168  		if snapshot == nil {
   169  			if discoveryTime == 0 {
   170  				return sm.State{}, nil, errNoSnapshots
   171  			}
   172  			retryHook()
   173  			s.logger.Info("sync any", "msg", log.NewLazySprintf("Discovering snapshots for %v", discoveryTime))
   174  			time.Sleep(discoveryTime)
   175  			continue
   176  		}
   177  		if chunks == nil {
   178  			chunks, err = newChunkQueue(snapshot, s.tempDir)
   179  			if err != nil {
   180  				return sm.State{}, nil, fmt.Errorf("failed to create chunk queue: %w", err)
   181  			}
   182  			defer chunks.Close() // in case we forget to close it elsewhere
   183  		}
   184  
   185  		newState, commit, err := s.Sync(snapshot, chunks)
   186  		switch {
   187  		case err == nil:
   188  			return newState, commit, nil
   189  
   190  		case errors.Is(err, errAbort):
   191  			return sm.State{}, nil, err
   192  
   193  		case errors.Is(err, errRetrySnapshot):
   194  			chunks.RetryAll()
   195  			s.logger.Info("Retrying snapshot", "height", snapshot.Height, "format", snapshot.Format,
   196  				"hash", snapshot.Hash)
   197  			continue
   198  
   199  		case errors.Is(err, errTimeout):
   200  			s.snapshots.Reject(snapshot)
   201  			s.logger.Error("Timed out waiting for snapshot chunks, rejected snapshot",
   202  				"height", snapshot.Height, "format", snapshot.Format, "hash", snapshot.Hash)
   203  
   204  		case errors.Is(err, errRejectSnapshot):
   205  			s.snapshots.Reject(snapshot)
   206  			s.logger.Info("Snapshot rejected", "height", snapshot.Height, "format", snapshot.Format,
   207  				"hash", snapshot.Hash)
   208  
   209  		case errors.Is(err, errRejectFormat):
   210  			s.snapshots.RejectFormat(snapshot.Format)
   211  			s.logger.Info("Snapshot format rejected", "format", snapshot.Format)
   212  
   213  		case errors.Is(err, errRejectSender):
   214  			s.logger.Info("Snapshot senders rejected", "height", snapshot.Height, "format", snapshot.Format,
   215  				"hash", snapshot.Hash)
   216  			for _, peer := range s.snapshots.GetPeers(snapshot) {
   217  				s.snapshots.RejectPeer(peer.ID())
   218  				s.logger.Info("Snapshot sender rejected", "peer", peer.ID())
   219  			}
   220  
   221  		case errors.Is(err, context.DeadlineExceeded):
   222  			s.logger.Info("Timed out validating snapshot, rejecting", "height", snapshot.Height, "err", err)
   223  			s.snapshots.Reject(snapshot)
   224  
   225  		default:
   226  			return sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err)
   227  		}
   228  
   229  		// Discard snapshot and chunks for next iteration
   230  		err = chunks.Close()
   231  		if err != nil {
   232  			s.logger.Error("Failed to clean up chunk queue", "err", err)
   233  		}
   234  		snapshot = nil
   235  		chunks = nil
   236  	}
   237  }
   238  
   239  // Sync executes a sync for a specific snapshot, returning the latest state and block commit which
   240  // the caller must use to bootstrap the node.
   241  func (s *syncer) Sync(snapshot *snapshot, chunks *chunkQueue) (sm.State, *types.Commit, error) {
   242  	s.mtx.Lock()
   243  	if s.chunks != nil {
   244  		s.mtx.Unlock()
   245  		return sm.State{}, nil, errors.New("a state sync is already in progress")
   246  	}
   247  	s.chunks = chunks
   248  	s.mtx.Unlock()
   249  	defer func() {
   250  		s.mtx.Lock()
   251  		s.chunks = nil
   252  		s.mtx.Unlock()
   253  	}()
   254  
   255  	hctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
   256  	defer cancel()
   257  
   258  	appHash, err := s.stateProvider.AppHash(hctx, snapshot.Height)
   259  	if err != nil {
   260  		s.logger.Info("failed to fetch and verify app hash", "err", err)
   261  		if err == light.ErrNoWitnesses {
   262  			return sm.State{}, nil, err
   263  		}
   264  		return sm.State{}, nil, errRejectSnapshot
   265  	}
   266  	snapshot.trustedAppHash = appHash
   267  
   268  	// Offer snapshot to ABCI app.
   269  	err = s.offerSnapshot(snapshot)
   270  	if err != nil {
   271  		return sm.State{}, nil, err
   272  	}
   273  
   274  	// Spawn chunk fetchers. They will terminate when the chunk queue is closed or context cancelled.
   275  	fetchCtx, cancel := context.WithCancel(context.TODO())
   276  	defer cancel()
   277  	for i := int32(0); i < s.chunkFetchers; i++ {
   278  		go s.fetchChunks(fetchCtx, snapshot, chunks)
   279  	}
   280  
   281  	pctx, pcancel := context.WithTimeout(context.TODO(), 30*time.Second)
   282  	defer pcancel()
   283  
   284  	// Optimistically build new state, so we don't discover any light client failures at the end.
   285  	state, err := s.stateProvider.State(pctx, snapshot.Height)
   286  	if err != nil {
   287  		s.logger.Info("failed to fetch and verify tendermint state", "err", err)
   288  		if err == light.ErrNoWitnesses {
   289  			return sm.State{}, nil, err
   290  		}
   291  		return sm.State{}, nil, errRejectSnapshot
   292  	}
   293  	commit, err := s.stateProvider.Commit(pctx, snapshot.Height)
   294  	if err != nil {
   295  		s.logger.Info("failed to fetch and verify commit", "err", err)
   296  		if err == light.ErrNoWitnesses {
   297  			return sm.State{}, nil, err
   298  		}
   299  		return sm.State{}, nil, errRejectSnapshot
   300  	}
   301  
   302  	// Restore snapshot
   303  	err = s.applyChunks(chunks)
   304  	if err != nil {
   305  		return sm.State{}, nil, err
   306  	}
   307  
   308  	// Verify app and app version
   309  	if err := s.verifyApp(snapshot, state.Version.Consensus.App); err != nil {
   310  		return sm.State{}, nil, err
   311  	}
   312  
   313  	// Done! 🎉
   314  	s.logger.Info("Snapshot restored", "height", snapshot.Height, "format", snapshot.Format,
   315  		"hash", snapshot.Hash)
   316  
   317  	return state, commit, nil
   318  }
   319  
   320  // offerSnapshot offers a snapshot to the app. It returns various errors depending on the app's
   321  // response, or nil if the snapshot was accepted.
   322  func (s *syncer) offerSnapshot(snapshot *snapshot) error {
   323  	s.logger.Info("Offering snapshot to ABCI app", "height", snapshot.Height,
   324  		"format", snapshot.Format, "hash", snapshot.Hash)
   325  	resp, err := s.conn.OfferSnapshotSync(abci.RequestOfferSnapshot{
   326  		Snapshot: &abci.Snapshot{
   327  			Height:   snapshot.Height,
   328  			Format:   snapshot.Format,
   329  			Chunks:   snapshot.Chunks,
   330  			Hash:     snapshot.Hash,
   331  			Metadata: snapshot.Metadata,
   332  		},
   333  		AppHash: snapshot.trustedAppHash,
   334  	})
   335  	if err != nil {
   336  		return fmt.Errorf("failed to offer snapshot: %w", err)
   337  	}
   338  	switch resp.Result {
   339  	case abci.ResponseOfferSnapshot_ACCEPT:
   340  		s.logger.Info("Snapshot accepted, restoring", "height", snapshot.Height,
   341  			"format", snapshot.Format, "hash", snapshot.Hash)
   342  		return nil
   343  	case abci.ResponseOfferSnapshot_ABORT:
   344  		return errAbort
   345  	case abci.ResponseOfferSnapshot_REJECT:
   346  		return errRejectSnapshot
   347  	case abci.ResponseOfferSnapshot_REJECT_FORMAT:
   348  		return errRejectFormat
   349  	case abci.ResponseOfferSnapshot_REJECT_SENDER:
   350  		return errRejectSender
   351  	default:
   352  		return fmt.Errorf("unknown ResponseOfferSnapshot result %v", resp.Result)
   353  	}
   354  }
   355  
   356  // applyChunks applies chunks to the app. It returns various errors depending on the app's
   357  // response, or nil once the snapshot is fully restored.
   358  func (s *syncer) applyChunks(chunks *chunkQueue) error {
   359  	for {
   360  		chunk, err := chunks.Next()
   361  		if err == errDone {
   362  			return nil
   363  		} else if err != nil {
   364  			return fmt.Errorf("failed to fetch chunk: %w", err)
   365  		}
   366  
   367  		resp, err := s.conn.ApplySnapshotChunkSync(abci.RequestApplySnapshotChunk{
   368  			Index:  chunk.Index,
   369  			Chunk:  chunk.Chunk,
   370  			Sender: string(chunk.Sender),
   371  		})
   372  		if err != nil {
   373  			return fmt.Errorf("failed to apply chunk %v: %w", chunk.Index, err)
   374  		}
   375  		s.logger.Info("Applied snapshot chunk to ABCI app", "height", chunk.Height,
   376  			"format", chunk.Format, "chunk", chunk.Index, "total", chunks.Size())
   377  
   378  		// Discard and refetch any chunks as requested by the app
   379  		for _, index := range resp.RefetchChunks {
   380  			err := chunks.Discard(index)
   381  			if err != nil {
   382  				return fmt.Errorf("failed to discard chunk %v: %w", index, err)
   383  			}
   384  		}
   385  
   386  		// Reject any senders as requested by the app
   387  		for _, sender := range resp.RejectSenders {
   388  			if sender != "" {
   389  				s.snapshots.RejectPeer(p2p.ID(sender))
   390  				err := chunks.DiscardSender(p2p.ID(sender))
   391  				if err != nil {
   392  					return fmt.Errorf("failed to reject sender: %w", err)
   393  				}
   394  			}
   395  		}
   396  
   397  		switch resp.Result {
   398  		case abci.ResponseApplySnapshotChunk_ACCEPT:
   399  		case abci.ResponseApplySnapshotChunk_ABORT:
   400  			return errAbort
   401  		case abci.ResponseApplySnapshotChunk_RETRY:
   402  			chunks.Retry(chunk.Index)
   403  		case abci.ResponseApplySnapshotChunk_RETRY_SNAPSHOT:
   404  			return errRetrySnapshot
   405  		case abci.ResponseApplySnapshotChunk_REJECT_SNAPSHOT:
   406  			return errRejectSnapshot
   407  		default:
   408  			return fmt.Errorf("unknown ResponseApplySnapshotChunk result %v", resp.Result)
   409  		}
   410  	}
   411  }
   412  
   413  // fetchChunks requests chunks from peers, receiving allocations from the chunk queue. Chunks
   414  // will be received from the reactor via syncer.AddChunks() to chunkQueue.Add().
   415  func (s *syncer) fetchChunks(ctx context.Context, snapshot *snapshot, chunks *chunkQueue) {
   416  	var (
   417  		next  = true
   418  		index uint32
   419  		err   error
   420  	)
   421  
   422  	for {
   423  		if next {
   424  			index, err = chunks.Allocate()
   425  			if errors.Is(err, errDone) {
   426  				// Keep checking until the context is canceled (restore is done), in case any
   427  				// chunks need to be refetched.
   428  				select {
   429  				case <-ctx.Done():
   430  					return
   431  				default:
   432  				}
   433  				time.Sleep(2 * time.Second)
   434  				continue
   435  			}
   436  			if err != nil {
   437  				s.logger.Error("Failed to allocate chunk from queue", "err", err)
   438  				return
   439  			}
   440  		}
   441  		s.logger.Info("Fetching snapshot chunk", "height", snapshot.Height,
   442  			"format", snapshot.Format, "chunk", index, "total", chunks.Size())
   443  
   444  		ticker := time.NewTicker(s.retryTimeout)
   445  		defer ticker.Stop()
   446  
   447  		s.requestChunk(snapshot, index)
   448  
   449  		select {
   450  		case <-chunks.WaitFor(index):
   451  			next = true
   452  
   453  		case <-ticker.C:
   454  			next = false
   455  
   456  		case <-ctx.Done():
   457  			return
   458  		}
   459  
   460  		ticker.Stop()
   461  	}
   462  }
   463  
   464  // requestChunk requests a chunk from a peer.
   465  func (s *syncer) requestChunk(snapshot *snapshot, chunk uint32) {
   466  	peer := s.snapshots.GetPeer(snapshot)
   467  	if peer == nil {
   468  		s.logger.Error("No valid peers found for snapshot", "height", snapshot.Height,
   469  			"format", snapshot.Format, "hash", snapshot.Hash)
   470  		return
   471  	}
   472  	s.logger.Debug("Requesting snapshot chunk", "height", snapshot.Height,
   473  		"format", snapshot.Format, "chunk", chunk, "peer", peer.ID())
   474  	p2p.SendEnvelopeShim(peer, p2p.Envelope{ //nolint: staticcheck
   475  		ChannelID: ChunkChannel,
   476  		Message: &ssproto.ChunkRequest{
   477  			Height: snapshot.Height,
   478  			Format: snapshot.Format,
   479  			Index:  chunk,
   480  		},
   481  	}, s.logger)
   482  }
   483  
   484  // verifyApp verifies the sync, checking the app hash, last block height and app version
   485  func (s *syncer) verifyApp(snapshot *snapshot, appVersion uint64) error {
   486  	resp, err := s.connQuery.InfoSync(proxy.RequestInfo)
   487  	if err != nil {
   488  		return fmt.Errorf("failed to query ABCI app for appHash: %w", err)
   489  	}
   490  
   491  	// sanity check that the app version in the block matches the application's own record
   492  	// of its version
   493  	if resp.AppVersion != appVersion {
   494  		// An error here most likely means that the app hasn't inplemented state sync
   495  		// or the Info call correctly
   496  		return fmt.Errorf("app version mismatch. Expected: %d, got: %d",
   497  			appVersion, resp.AppVersion)
   498  	}
   499  	if !bytes.Equal(snapshot.trustedAppHash, resp.LastBlockAppHash) {
   500  		s.logger.Error("appHash verification failed",
   501  			"expected", snapshot.trustedAppHash,
   502  			"actual", resp.LastBlockAppHash)
   503  		return errVerifyFailed
   504  	}
   505  	if uint64(resp.LastBlockHeight) != snapshot.Height {
   506  		s.logger.Error(
   507  			"ABCI app reported unexpected last block height",
   508  			"expected", snapshot.Height,
   509  			"actual", resp.LastBlockHeight,
   510  		)
   511  		return errVerifyFailed
   512  	}
   513  
   514  	s.logger.Info("Verified ABCI app", "height", snapshot.Height, "appHash", snapshot.trustedAppHash)
   515  	return nil
   516  }