github.com/Finschia/ostracon@v1.1.5/statesync/syncer.go (about)

     1  package statesync
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"errors"
     7  	"fmt"
     8  	"time"
     9  
    10  	abci "github.com/tendermint/tendermint/abci/types"
    11  	ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync"
    12  
    13  	"github.com/Finschia/ostracon/config"
    14  	"github.com/Finschia/ostracon/libs/log"
    15  	tmsync "github.com/Finschia/ostracon/libs/sync"
    16  	"github.com/Finschia/ostracon/light"
    17  	"github.com/Finschia/ostracon/p2p"
    18  	"github.com/Finschia/ostracon/proxy"
    19  	sm "github.com/Finschia/ostracon/state"
    20  	"github.com/Finschia/ostracon/types"
    21  )
    22  
    23  const (
    24  	// chunkTimeout is the timeout while waiting for the next chunk from the chunk queue.
    25  	chunkTimeout = 2 * time.Minute
    26  
    27  	// minimumDiscoveryTime is the lowest allowable time for a
    28  	// SyncAny discovery time.
    29  	minimumDiscoveryTime = 5 * time.Second
    30  )
    31  
    32  var (
    33  	// errAbort is returned by Sync() when snapshot restoration is aborted.
    34  	errAbort = errors.New("state sync aborted")
    35  	// errRetrySnapshot is returned by Sync() when the snapshot should be retried.
    36  	errRetrySnapshot = errors.New("retry snapshot")
    37  	// errRejectSnapshot is returned by Sync() when the snapshot is rejected.
    38  	errRejectSnapshot = errors.New("snapshot was rejected")
    39  	// errRejectFormat is returned by Sync() when the snapshot format is rejected.
    40  	errRejectFormat = errors.New("snapshot format was rejected")
    41  	// errRejectSender is returned by Sync() when the snapshot sender is rejected.
    42  	errRejectSender = errors.New("snapshot sender was rejected")
    43  	// errVerifyFailed is returned by Sync() when app hash or last height verification fails.
    44  	errVerifyFailed = errors.New("verification failed")
    45  	// errTimeout is returned by Sync() when we've waited too long to receive a chunk.
    46  	errTimeout = errors.New("timed out waiting for chunk")
    47  	// errNoSnapshots is returned by SyncAny() if no snapshots are found and discovery is disabled.
    48  	errNoSnapshots = errors.New("no suitable snapshots found")
    49  )
    50  
    51  // syncer runs a state sync against an ABCI app. Use either SyncAny() to automatically attempt to
    52  // sync all snapshots in the pool (pausing to discover new ones), or Sync() to sync a specific
    53  // snapshot. Snapshots and chunks are fed via AddSnapshot() and AddChunk() as appropriate.
    54  type syncer struct {
    55  	logger        log.Logger
    56  	stateProvider StateProvider
    57  	conn          proxy.AppConnSnapshot
    58  	connQuery     proxy.AppConnQuery
    59  	snapshots     *snapshotPool
    60  	tempDir       string
    61  	chunkFetchers int32
    62  	retryTimeout  time.Duration
    63  
    64  	mtx    tmsync.RWMutex
    65  	chunks *chunkQueue
    66  }
    67  
    68  // newSyncer creates a new syncer.
    69  func newSyncer(
    70  	cfg config.StateSyncConfig,
    71  	logger log.Logger,
    72  	conn proxy.AppConnSnapshot,
    73  	connQuery proxy.AppConnQuery,
    74  	stateProvider StateProvider,
    75  	tempDir string,
    76  ) *syncer {
    77  
    78  	return &syncer{
    79  		logger:        logger,
    80  		stateProvider: stateProvider,
    81  		conn:          conn,
    82  		connQuery:     connQuery,
    83  		snapshots:     newSnapshotPool(),
    84  		tempDir:       tempDir,
    85  		chunkFetchers: cfg.ChunkFetchers,
    86  		retryTimeout:  cfg.ChunkRequestTimeout,
    87  	}
    88  }
    89  
    90  // AddChunk adds a chunk to the chunk queue, if any. It returns false if the chunk has already
    91  // been added to the queue, or an error if there's no sync in progress.
    92  func (s *syncer) AddChunk(chunk *chunk) (bool, error) {
    93  	s.mtx.RLock()
    94  	defer s.mtx.RUnlock()
    95  	if s.chunks == nil {
    96  		return false, errors.New("no state sync in progress")
    97  	}
    98  	added, err := s.chunks.Add(chunk)
    99  	if err != nil {
   100  		return false, err
   101  	}
   102  	if added {
   103  		s.logger.Debug("Added chunk to queue", "height", chunk.Height, "format", chunk.Format,
   104  			"chunk", chunk.Index)
   105  	} else {
   106  		s.logger.Debug("Ignoring duplicate chunk in queue", "height", chunk.Height, "format", chunk.Format,
   107  			"chunk", chunk.Index)
   108  	}
   109  	return added, nil
   110  }
   111  
   112  // AddSnapshot adds a snapshot to the snapshot pool. It returns true if a new, previously unseen
   113  // snapshot was accepted and added.
   114  func (s *syncer) AddSnapshot(peer p2p.Peer, snapshot *snapshot) (bool, error) {
   115  	added, err := s.snapshots.Add(peer, snapshot)
   116  	if err != nil {
   117  		return false, err
   118  	}
   119  	if added {
   120  		s.logger.Info("Discovered new snapshot", "height", snapshot.Height, "format", snapshot.Format,
   121  			"hash", snapshot.Hash)
   122  	}
   123  	return added, nil
   124  }
   125  
   126  // AddPeer adds a peer to the pool. For now we just keep it simple and send a single request
   127  // to discover snapshots, later we may want to do retries and stuff.
   128  func (s *syncer) AddPeer(peer p2p.Peer) {
   129  	s.logger.Debug("Requesting snapshots from peer", "peer", peer.ID())
   130  	e := p2p.Envelope{
   131  		ChannelID: SnapshotChannel,
   132  		Message:   &ssproto.SnapshotsRequest{},
   133  	}
   134  	p2p.SendEnvelopeShim(peer, e, s.logger) //nolint: staticcheck
   135  }
   136  
   137  // RemovePeer removes a peer from the pool.
   138  func (s *syncer) RemovePeer(peer p2p.Peer) {
   139  	s.logger.Debug("Removing peer from sync", "peer", peer.ID())
   140  	s.snapshots.RemovePeer(peer.ID())
   141  }
   142  
   143  // SyncAny tries to sync any of the snapshots in the snapshot pool, waiting to discover further
   144  // snapshots if none were found and discoveryTime > 0. It returns the latest state, previous state and block commit
   145  // which the caller must use to bootstrap the node.
   146  func (s *syncer) SyncAny(discoveryTime time.Duration, retryHook func()) (sm.State, sm.State, *types.Commit, error) {
   147  	if discoveryTime != 0 && discoveryTime < minimumDiscoveryTime {
   148  		discoveryTime = 5 * minimumDiscoveryTime
   149  	}
   150  
   151  	if discoveryTime > 0 {
   152  		s.logger.Info("sync any", "msg", log.NewLazySprintf("Discovering snapshots for %v", discoveryTime))
   153  		time.Sleep(discoveryTime)
   154  	}
   155  
   156  	// The app may ask us to retry a snapshot restoration, in which case we need to reuse
   157  	// the snapshot and chunk queue from the previous loop iteration.
   158  	var (
   159  		snapshot *snapshot
   160  		chunks   *chunkQueue
   161  		err      error
   162  	)
   163  	for {
   164  		// If not nil, we're going to retry restoration of the same snapshot.
   165  		if snapshot == nil {
   166  			snapshot = s.snapshots.Best()
   167  			chunks = nil
   168  		}
   169  		if snapshot == nil {
   170  			if discoveryTime == 0 {
   171  				return sm.State{}, sm.State{}, nil, errNoSnapshots
   172  			}
   173  			retryHook()
   174  			s.logger.Info("sync any", "msg", log.NewLazySprintf("Discovering snapshots for %v", discoveryTime))
   175  			time.Sleep(discoveryTime)
   176  			continue
   177  		}
   178  		if chunks == nil {
   179  			chunks, err = newChunkQueue(snapshot, s.tempDir)
   180  			if err != nil {
   181  				return sm.State{}, sm.State{}, nil, fmt.Errorf("failed to create chunk queue: %w", err)
   182  			}
   183  			defer chunks.Close() // in case we forget to close it elsewhere
   184  		}
   185  
   186  		newState, previousState, commit, err := s.Sync(snapshot, chunks)
   187  		switch {
   188  		case err == nil:
   189  			return newState, previousState, commit, nil
   190  
   191  		case errors.Is(err, errAbort):
   192  			return sm.State{}, sm.State{}, nil, err
   193  
   194  		case errors.Is(err, errRetrySnapshot):
   195  			chunks.RetryAll()
   196  			s.logger.Info("Retrying snapshot", "height", snapshot.Height, "format", snapshot.Format,
   197  				"hash", snapshot.Hash)
   198  			continue
   199  
   200  		case errors.Is(err, errTimeout):
   201  			s.snapshots.Reject(snapshot)
   202  			s.logger.Error("Timed out waiting for snapshot chunks, rejected snapshot",
   203  				"height", snapshot.Height, "format", snapshot.Format, "hash", snapshot.Hash)
   204  
   205  		case errors.Is(err, errRejectSnapshot):
   206  			s.snapshots.Reject(snapshot)
   207  			s.logger.Info("Snapshot rejected", "height", snapshot.Height, "format", snapshot.Format,
   208  				"hash", snapshot.Hash)
   209  
   210  		case errors.Is(err, errRejectFormat):
   211  			s.snapshots.RejectFormat(snapshot.Format)
   212  			s.logger.Info("Snapshot format rejected", "format", snapshot.Format)
   213  
   214  		case errors.Is(err, errRejectSender):
   215  			s.logger.Info("Snapshot senders rejected", "height", snapshot.Height, "format", snapshot.Format,
   216  				"hash", snapshot.Hash)
   217  			for _, peer := range s.snapshots.GetPeers(snapshot) {
   218  				s.snapshots.RejectPeer(peer.ID())
   219  				s.logger.Info("Snapshot sender rejected", "peer", peer.ID())
   220  			}
   221  
   222  		case errors.Is(err, context.DeadlineExceeded):
   223  			s.logger.Info("Timed out validating snapshot, rejecting", "height", snapshot.Height, "err", err)
   224  			s.snapshots.Reject(snapshot)
   225  
   226  		default:
   227  			return sm.State{}, sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err)
   228  		}
   229  
   230  		// Discard snapshot and chunks for next iteration
   231  		err = chunks.Close()
   232  		if err != nil {
   233  			s.logger.Error("Failed to clean up chunk queue", "err", err)
   234  		}
   235  		snapshot = nil
   236  		chunks = nil
   237  	}
   238  }
   239  
   240  // Sync executes a sync for a specific snapshot, returning the latest state, previous state and block commit which
   241  // the caller must use to bootstrap the node.
   242  func (s *syncer) Sync(snapshot *snapshot, chunks *chunkQueue) (sm.State, sm.State, *types.Commit, error) {
   243  	s.mtx.Lock()
   244  	if s.chunks != nil {
   245  		s.mtx.Unlock()
   246  		return sm.State{}, sm.State{}, nil, errors.New("a state sync is already in progress")
   247  	}
   248  	s.chunks = chunks
   249  	s.mtx.Unlock()
   250  	defer func() {
   251  		s.mtx.Lock()
   252  		s.chunks = nil
   253  		s.mtx.Unlock()
   254  	}()
   255  
   256  	hctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
   257  	defer cancel()
   258  
   259  	appHash, err := s.stateProvider.AppHash(hctx, snapshot.Height)
   260  	if err != nil {
   261  		s.logger.Info("failed to fetch and verify app hash", "err", err)
   262  		if err == light.ErrNoWitnesses {
   263  			return sm.State{}, sm.State{}, nil, err
   264  		}
   265  		return sm.State{}, sm.State{}, nil, errRejectSnapshot
   266  	}
   267  	snapshot.trustedAppHash = appHash
   268  
   269  	// Offer snapshot to ABCI app.
   270  	err = s.offerSnapshot(snapshot)
   271  	if err != nil {
   272  		return sm.State{}, sm.State{}, nil, err
   273  	}
   274  
   275  	// Spawn chunk fetchers. They will terminate when the chunk queue is closed or context cancelled.
   276  	fetchCtx, cancel := context.WithCancel(context.TODO())
   277  	defer cancel()
   278  	for i := int32(0); i < s.chunkFetchers; i++ {
   279  		go s.fetchChunks(fetchCtx, snapshot, chunks)
   280  	}
   281  
   282  	pctx, pcancel := context.WithTimeout(context.TODO(), 30*time.Second)
   283  	defer pcancel()
   284  
   285  	// Optimistically build new state, so we don't discover any light client failures at the end.
   286  	state, err := s.stateProvider.State(pctx, snapshot.Height)
   287  	if err != nil {
   288  		s.logger.Info("failed to fetch and verify ostracon state", "err", err)
   289  		if err == light.ErrNoWitnesses {
   290  			return sm.State{}, sm.State{}, nil, err
   291  		}
   292  		return sm.State{}, sm.State{}, nil, errRejectSnapshot
   293  	}
   294  	var previousState sm.State
   295  	if snapshot.Height-1 < 1 {
   296  		previousState = sm.State{}
   297  	} else {
   298  		previousState, err = s.stateProvider.State(pctx, snapshot.Height-1)
   299  		if err != nil {
   300  			s.logger.Info("failed to fetch and verify ostracon previous state", "err", err)
   301  			if err == light.ErrNoWitnesses {
   302  				return sm.State{}, sm.State{}, nil, err
   303  			}
   304  			return sm.State{}, sm.State{}, nil, errRejectSnapshot
   305  		}
   306  	}
   307  	commit, err := s.stateProvider.Commit(pctx, snapshot.Height)
   308  	if err != nil {
   309  		s.logger.Info("failed to fetch and verify commit", "err", err)
   310  		if err == light.ErrNoWitnesses {
   311  			return sm.State{}, sm.State{}, nil, err
   312  		}
   313  		return sm.State{}, sm.State{}, nil, errRejectSnapshot
   314  	}
   315  
   316  	// Restore snapshot
   317  	err = s.applyChunks(chunks)
   318  	if err != nil {
   319  		return sm.State{}, sm.State{}, nil, err
   320  	}
   321  
   322  	// Verify app and app version
   323  	if err := s.verifyApp(snapshot, state.Version.Consensus.App); err != nil {
   324  		return sm.State{}, sm.State{}, nil, err
   325  	}
   326  
   327  	// Done! 🎉
   328  	s.logger.Info("Snapshot restored", "height", snapshot.Height, "format", snapshot.Format,
   329  		"hash", snapshot.Hash)
   330  
   331  	return state, previousState, commit, nil
   332  }
   333  
   334  // offerSnapshot offers a snapshot to the app. It returns various errors depending on the app's
   335  // response, or nil if the snapshot was accepted.
   336  func (s *syncer) offerSnapshot(snapshot *snapshot) error {
   337  	s.logger.Info("Offering snapshot to ABCI app", "height", snapshot.Height,
   338  		"format", snapshot.Format, "hash", snapshot.Hash)
   339  	resp, err := s.conn.OfferSnapshotSync(abci.RequestOfferSnapshot{
   340  		Snapshot: &abci.Snapshot{
   341  			Height:   snapshot.Height,
   342  			Format:   snapshot.Format,
   343  			Chunks:   snapshot.Chunks,
   344  			Hash:     snapshot.Hash,
   345  			Metadata: snapshot.Metadata,
   346  		},
   347  		AppHash: snapshot.trustedAppHash,
   348  	})
   349  	if err != nil {
   350  		return fmt.Errorf("failed to offer snapshot: %w", err)
   351  	}
   352  	switch resp.Result {
   353  	case abci.ResponseOfferSnapshot_ACCEPT:
   354  		s.logger.Info("Snapshot accepted, restoring", "height", snapshot.Height,
   355  			"format", snapshot.Format, "hash", snapshot.Hash)
   356  		return nil
   357  	case abci.ResponseOfferSnapshot_ABORT:
   358  		return errAbort
   359  	case abci.ResponseOfferSnapshot_REJECT:
   360  		return errRejectSnapshot
   361  	case abci.ResponseOfferSnapshot_REJECT_FORMAT:
   362  		return errRejectFormat
   363  	case abci.ResponseOfferSnapshot_REJECT_SENDER:
   364  		return errRejectSender
   365  	default:
   366  		return fmt.Errorf("unknown ResponseOfferSnapshot result %v", resp.Result)
   367  	}
   368  }
   369  
   370  // applyChunks applies chunks to the app. It returns various errors depending on the app's
   371  // response, or nil once the snapshot is fully restored.
   372  func (s *syncer) applyChunks(chunks *chunkQueue) error {
   373  	for {
   374  		chunk, err := chunks.Next()
   375  		if err == errDone {
   376  			return nil
   377  		} else if err != nil {
   378  			return fmt.Errorf("failed to fetch chunk: %w", err)
   379  		}
   380  
   381  		resp, err := s.conn.ApplySnapshotChunkSync(abci.RequestApplySnapshotChunk{
   382  			Index:  chunk.Index,
   383  			Chunk:  chunk.Chunk,
   384  			Sender: string(chunk.Sender),
   385  		})
   386  		if err != nil {
   387  			return fmt.Errorf("failed to apply chunk %v: %w", chunk.Index, err)
   388  		}
   389  		s.logger.Info("Applied snapshot chunk to ABCI app", "height", chunk.Height,
   390  			"format", chunk.Format, "chunk", chunk.Index, "total", chunks.Size())
   391  
   392  		// Discard and refetch any chunks as requested by the app
   393  		for _, index := range resp.RefetchChunks {
   394  			err := chunks.Discard(index)
   395  			if err != nil {
   396  				return fmt.Errorf("failed to discard chunk %v: %w", index, err)
   397  			}
   398  		}
   399  
   400  		// Reject any senders as requested by the app
   401  		for _, sender := range resp.RejectSenders {
   402  			if sender != "" {
   403  				s.snapshots.RejectPeer(p2p.ID(sender))
   404  				err := chunks.DiscardSender(p2p.ID(sender))
   405  				if err != nil {
   406  					return fmt.Errorf("failed to reject sender: %w", err)
   407  				}
   408  			}
   409  		}
   410  
   411  		switch resp.Result {
   412  		case abci.ResponseApplySnapshotChunk_ACCEPT:
   413  		case abci.ResponseApplySnapshotChunk_ABORT:
   414  			return errAbort
   415  		case abci.ResponseApplySnapshotChunk_RETRY:
   416  			chunks.Retry(chunk.Index)
   417  		case abci.ResponseApplySnapshotChunk_RETRY_SNAPSHOT:
   418  			return errRetrySnapshot
   419  		case abci.ResponseApplySnapshotChunk_REJECT_SNAPSHOT:
   420  			return errRejectSnapshot
   421  		default:
   422  			return fmt.Errorf("unknown ResponseApplySnapshotChunk result %v", resp.Result)
   423  		}
   424  	}
   425  }
   426  
   427  // fetchChunks requests chunks from peers, receiving allocations from the chunk queue. Chunks
   428  // will be received from the reactor via syncer.AddChunks() to chunkQueue.Add().
   429  func (s *syncer) fetchChunks(ctx context.Context, snapshot *snapshot, chunks *chunkQueue) {
   430  	var (
   431  		next  = true
   432  		index uint32
   433  		err   error
   434  	)
   435  
   436  	for {
   437  		if next {
   438  			index, err = chunks.Allocate()
   439  			if errors.Is(err, errDone) {
   440  				// Keep checking until the context is canceled (restore is done), in case any
   441  				// chunks need to be refetched.
   442  				select {
   443  				case <-ctx.Done():
   444  					return
   445  				default:
   446  				}
   447  				time.Sleep(2 * time.Second)
   448  				continue
   449  			}
   450  			if err != nil {
   451  				s.logger.Error("Failed to allocate chunk from queue", "err", err)
   452  				return
   453  			}
   454  		}
   455  		s.logger.Info("Fetching snapshot chunk", "height", snapshot.Height,
   456  			"format", snapshot.Format, "chunk", index, "total", chunks.Size())
   457  
   458  		ticker := time.NewTicker(s.retryTimeout)
   459  		defer ticker.Stop()
   460  
   461  		s.requestChunk(snapshot, index)
   462  
   463  		select {
   464  		case <-chunks.WaitFor(index):
   465  			next = true
   466  
   467  		case <-ticker.C:
   468  			next = false
   469  
   470  		case <-ctx.Done():
   471  			return
   472  		}
   473  
   474  		ticker.Stop()
   475  	}
   476  }
   477  
   478  // requestChunk requests a chunk from a peer.
   479  func (s *syncer) requestChunk(snapshot *snapshot, chunk uint32) {
   480  	peer := s.snapshots.GetPeer(snapshot)
   481  	if peer == nil {
   482  		s.logger.Error("No valid peers found for snapshot", "height", snapshot.Height,
   483  			"format", snapshot.Format, "hash", snapshot.Hash)
   484  		return
   485  	}
   486  	s.logger.Debug("Requesting snapshot chunk", "height", snapshot.Height,
   487  		"format", snapshot.Format, "chunk", chunk, "peer", peer.ID())
   488  	p2p.SendEnvelopeShim(peer, p2p.Envelope{ //nolint: staticcheck
   489  		ChannelID: ChunkChannel,
   490  		Message: &ssproto.ChunkRequest{
   491  			Height: snapshot.Height,
   492  			Format: snapshot.Format,
   493  			Index:  chunk,
   494  		},
   495  	}, s.logger)
   496  }
   497  
   498  // verifyApp verifies the sync, checking the app hash, last block height and app version
   499  func (s *syncer) verifyApp(snapshot *snapshot, appVersion uint64) error {
   500  	// XXX Is it okay to check with "ABCI.Info" only
   501  	// since AppVersion will be updated by "ABCI.InitChain" and "ABCI.EndBlock"?
   502  	resp, err := s.connQuery.InfoSync(proxy.RequestInfo)
   503  	if err != nil {
   504  		return fmt.Errorf("failed to query ABCI app for appHash: %w", err)
   505  	}
   506  
   507  	// sanity check that the app version in the block matches the application's own record
   508  	// of its version
   509  	if resp.AppVersion != appVersion {
   510  		// An error here most likely means that the app hasn't implemented state sync
   511  		// or the Info call correctly
   512  		return fmt.Errorf("app version mismatch. Expected: %d, got: %d",
   513  			appVersion, resp.AppVersion)
   514  	}
   515  	if !bytes.Equal(snapshot.trustedAppHash, resp.LastBlockAppHash) {
   516  		s.logger.Error("appHash verification failed",
   517  			"expected", snapshot.trustedAppHash,
   518  			"actual", resp.LastBlockAppHash)
   519  		return errVerifyFailed
   520  	}
   521  	if uint64(resp.LastBlockHeight) != snapshot.Height {
   522  		s.logger.Error(
   523  			"ABCI app reported unexpected last block height",
   524  			"expected", snapshot.Height,
   525  			"actual", resp.LastBlockHeight,
   526  		)
   527  		return errVerifyFailed
   528  	}
   529  
   530  	s.logger.Info("Verified ABCI app", "height", snapshot.Height, "appHash", snapshot.trustedAppHash)
   531  	return nil
   532  }