github.com/ari-anchor/sei-tendermint@v0.0.0-20230519144642-dc826b7b56bb/internal/dbsync/syncer.go

package dbsync

import (
	"bytes"
	"context"
	"crypto/md5"
	"errors"
	"fmt"
	"io/fs"
	"os"
	"path"
	"strings"
	"sync"
	"time"

	"github.com/ari-anchor/sei-tendermint/config"
	sm "github.com/ari-anchor/sei-tendermint/internal/state"
	"github.com/ari-anchor/sei-tendermint/libs/log"
	dstypes "github.com/ari-anchor/sei-tendermint/proto/tendermint/dbsync"
	"github.com/ari-anchor/sei-tendermint/types"
)

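// ApplicationDBSubdirectory is the subdirectory (under the Tendermint DB
// directory) into which synced application database files are written.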
const ApplicationDBSubdirectory = "application.db"

// TODO: this is bad, as TM shouldn't be aware of wasm. DB sync/restore logic should
// ideally happen at the application level (i.e. the Cosmos layer) and communicate
// with TM via new ABCI methods.
const WasmDirectory = "wasm/wasm/state/wasm"

// WasmSuffix marks filenames that belong to the wasm state directory rather
// than the application database.
const WasmSuffix = "_wasm"

// LockFile is the database lock file; it is skipped during sync.
const LockFile = "LOCK"

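// Syncer drives state download over the dbsync protocol: it tracks the
// metadata (height, filenames, MD5 checksums) announced by peers, requests
// the corresponding files from those peers, verifies and writes them to
// disk, and finalizes the node state once every expected file has landed.
// All mutable fields are guarded by mtx.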
type Syncer struct {
	mtx    *sync.RWMutex
	logger log.Logger

	active                 bool
	heightToSync           uint64
	peersToSync            []types.NodeID
	expectedChecksums      map[string][]byte
	pendingFiles           map[string]struct{}
	syncedFiles            map[string]struct{}
	completionSignals      map[string]chan struct{}
	metadataSetAt          time.Time
	timeoutInSeconds       time.Duration
	fileQueue              []*dstypes.FileResponse
	applicationDBDirectory string
	wasmStateDirectory     string
	sleepInSeconds         time.Duration
	fileWorkerCount        int
	fileWorkerTimeout      time.Duration
	fileWorkerCancelFn     context.CancelFunc

	// injected callbacks, so transport and application concerns stay pluggable
	metadataRequestFn func(context.Context) error
	fileRequestFn     func(context.Context, types.NodeID, uint64, string) error
	commitStateFn     func(context.Context, uint64) (sm.State, *types.Commit, error)
	postSyncFn        func(context.Context, sm.State, *types.Commit) error
	resetDirFn        func(*Syncer)

	state  sm.State
	commit *types.Commit
}

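// defaultResetDirFn wipes and recreates the application DB and wasm state
// directories so that a fresh sync never mixes old and new files.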
func defaultResetDirFn(s *Syncer) {
	for _, dir := range []string{s.applicationDBDirectory, s.wasmStateDirectory} {
		if err := os.RemoveAll(dir); err != nil {
			s.logger.Error(fmt.Sprintf("failed to remove %s: %s", dir, err))
		}
		if err := os.MkdirAll(dir, fs.ModePerm); err != nil {
			s.logger.Error(fmt.Sprintf("failed to recreate %s: %s", dir, err))
		}
	}
}

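// NewSyncer wires a Syncer from the dbsync and base configs plus injected
// callbacks for metadata requests, file requests, commit-state lookup,
// post-sync finalization, and directory reset. A minimal wiring sketch
// (callback and config field names below are illustrative, not part of this
// package):
//
//	syncer := NewSyncer(logger, cfg.DBSync, cfg.BaseConfig, true,
//		requestMetadata, requestFile, lookupCommitState, finalizeSync,
//		defaultResetDirFn)
//	go syncer.Process(ctx)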
func NewSyncer(
	logger log.Logger,
	dbsyncConfig config.DBSyncConfig,
	baseConfig config.BaseConfig,
	enable bool,
	metadataRequestFn func(context.Context) error,
	fileRequestFn func(context.Context, types.NodeID, uint64, string) error,
	commitStateFn func(context.Context, uint64) (sm.State, *types.Commit, error),
	postSyncFn func(context.Context, sm.State, *types.Commit) error,
	resetDirFn func(*Syncer),
) *Syncer {
	return &Syncer{
		logger:                 logger,
		active:                 enable,
		timeoutInSeconds:       time.Duration(dbsyncConfig.TimeoutInSeconds) * time.Second,
		fileQueue:              []*dstypes.FileResponse{},
		applicationDBDirectory: path.Join(baseConfig.DBDir(), ApplicationDBSubdirectory),
		wasmStateDirectory:     path.Join(baseConfig.RootDir, WasmDirectory),
		sleepInSeconds:         time.Duration(dbsyncConfig.NoFileSleepInSeconds) * time.Second,
		fileWorkerCount:        dbsyncConfig.FileWorkerCount,
		fileWorkerTimeout:      time.Duration(dbsyncConfig.FileWorkerTimeout) * time.Second,
		metadataRequestFn:      metadataRequestFn,
		fileRequestFn:          fileRequestFn,
		commitStateFn:          commitStateFn,
		postSyncFn:             postSyncFn,
		resetDirFn:             resetDirFn,
		mtx:                    &sync.RWMutex{},
	}
}

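// SetMetadata records a metadata announcement from a peer. If the previous
// metadata has timed out (or none was ever set), it resets all sync state for
// the announced height and kicks off the file-request workers; if the
// metadata matches the height already being synced, the sender is simply
// added to the peer list used for file requests.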
func (s *Syncer) SetMetadata(ctx context.Context, sender types.NodeID, metadata *dstypes.MetadataResponse) {
	s.mtx.RLock()
	if !s.active {
		s.mtx.RUnlock()
		return
	}
	s.mtx.RUnlock()

	if len(metadata.Filenames) != len(metadata.Md5Checksum) {
		s.logger.Error("received bad metadata with inconsistent files and checksums count")
		return
	}

	timedOut, now := s.isCurrentMetadataTimedOut()
	s.mtx.Lock()
	defer s.mtx.Unlock()
	if timedOut {
		if s.fileWorkerCancelFn != nil {
			s.fileWorkerCancelFn()
		}

		state, commit, err := s.commitStateFn(ctx, metadata.Height)
		if err != nil {
			s.logger.Error(fmt.Sprintf("failed to get commit state for height %d: %s", metadata.Height, err))
			return
		}
		s.state = state
		s.commit = commit
		s.metadataSetAt = now
		s.heightToSync = metadata.Height
		s.expectedChecksums = map[string][]byte{}
		s.syncedFiles = map[string]struct{}{}
		s.pendingFiles = map[string]struct{}{}
		s.completionSignals = map[string]chan struct{}{}
		for i, filename := range metadata.Filenames {
			if filename == LockFile {
				// ignore the database lock file
				continue
			}
			s.expectedChecksums[filename] = metadata.Md5Checksum[i]
		}
		s.fileQueue = []*dstypes.FileResponse{}
		s.peersToSync = []types.NodeID{sender}
		s.resetDirFn(s)

		cancellableCtx, cancel := context.WithCancel(ctx)
		s.fileWorkerCancelFn = cancel
		s.requestFiles(cancellableCtx, s.metadataSetAt)
	} else if metadata.Height == s.heightToSync {
		s.peersToSync = append(s.peersToSync, sender)
	}
}

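// Process is the main sync loop, intended to run in its own goroutine. It
// drains verified files off the queue and writes them out until the sync is
// complete (or stopped); while no usable metadata or file is available, it
// re-requests metadata and sleeps.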
func (s *Syncer) Process(ctx context.Context) {
	for {
		s.mtx.RLock()
		if !s.active {
			s.logger.Info(fmt.Sprintf("sync for height %d with %d files finished!", s.heightToSync, len(s.expectedChecksums)))
			s.mtx.RUnlock()
			break
		}
		s.mtx.RUnlock()
		timedOut, _ := s.isCurrentMetadataTimedOut()
		if timedOut {
			s.logger.Info(fmt.Sprintf("last metadata has timed out; requesting new metadata and sleeping for %f seconds", s.sleepInSeconds.Seconds()))
			if err := s.metadataRequestFn(ctx); err != nil {
				s.logger.Error(fmt.Sprintf("failed to request metadata: %s", err))
			}
			time.Sleep(s.sleepInSeconds)
			continue
		}
		file := s.popFile()
		if file == nil {
			s.mtx.RLock()
			numSynced := len(s.syncedFiles)
			numTotal := len(s.expectedChecksums)
			s.mtx.RUnlock()
			s.logger.Info(fmt.Sprintf("no file to sync; synced %d out of %d so far; sleeping for %f seconds", numSynced, numTotal, s.sleepInSeconds.Seconds()))
			time.Sleep(s.sleepInSeconds)
			continue
		}
		if err := s.processFile(ctx, file); err != nil {
			s.logger.Error(err.Error())
		}
	}
}

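// Stop deactivates the syncer and clears any partially written directories.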
func (s *Syncer) Stop() {
	s.mtx.Lock()
	defer s.mtx.Unlock()
	if s.active {
		s.resetDirFn(s)
		s.active = false
	}
}

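// processFile validates a received file against the current metadata (right
// height, expected checksum, not already synced, actually requested), writes
// it into the appropriate directory, and finalizes the sync via postSyncFn
// once the last expected file has been written.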
func (s *Syncer) processFile(ctx context.Context, file *dstypes.FileResponse) error {
	s.mtx.Lock()
	defer s.mtx.Unlock()
	defer func() {
		delete(s.pendingFiles, file.Filename)
	}()

	if file.Height != s.heightToSync {
		return fmt.Errorf("current height is %d but received file for height %d", s.heightToSync, file.Height)
	}

	if expectedChecksum, ok := s.expectedChecksums[file.Filename]; !ok {
		return fmt.Errorf("received unexpected file %s", file.Filename)
	} else if _, ok := s.syncedFiles[file.Filename]; ok {
		return fmt.Errorf("received duplicate file %s", file.Filename)
	} else if _, ok := s.pendingFiles[file.Filename]; !ok {
		return fmt.Errorf("received unrequested file %s", file.Filename)
	} else {
		checksum := md5.Sum(file.Data)
		if !bytes.Equal(checksum[:], expectedChecksum) {
			return errors.New("received unexpected checksum")
		}
	}

	// wasm files go to the wasm state directory (with the suffix stripped);
	// everything else belongs to the application database
	var dbFile *os.File
	var err error
	if strings.HasSuffix(file.Filename, WasmSuffix) {
		dbFile, err = os.Create(path.Join(s.wasmStateDirectory, strings.TrimSuffix(file.Filename, WasmSuffix)))
	} else {
		dbFile, err = os.Create(path.Join(s.applicationDBDirectory, file.Filename))
	}
	if err != nil {
		return err
	}
	defer dbFile.Close()
	if _, err = dbFile.Write(file.Data); err != nil {
		return err
	}

	s.syncedFiles[file.Filename] = struct{}{}
	if len(s.syncedFiles) == len(s.expectedChecksums) {
		// we have finished syncing
		if err := s.postSyncFn(ctx, s.state, s.commit); err != nil {
			// no graceful way to handle a post-sync error since we might be in a partially updated state
			panic(err)
		}
		s.active = false
	}
	// unblock the worker goroutine waiting on this file
	s.completionSignals[file.Filename] <- struct{}{}
	return nil
}

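// isCurrentMetadataTimedOut reports whether the current metadata is stale
// (older than the configured timeout, or never set) and returns the current
// time so callers can reuse a single clock reading.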
func (s *Syncer) isCurrentMetadataTimedOut() (bool, time.Time) {
	s.mtx.RLock()
	defer s.mtx.RUnlock()
	now := time.Now()
	if s.metadataSetAt.IsZero() {
		return true, now
	}
	return now.After(s.metadataSetAt.Add(s.timeoutInSeconds)), now
}

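// requestFiles spawns fileWorkerCount goroutines that each repeatedly pick an
// unrequested file from the expected set, request it from the first peer, and
// wait for either completion, a per-file timeout (after which the file
// becomes requestable again), or context cancellation. Workers exit once
// every expected file is pending or synced, or when the metadata they were
// started for has been superseded.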
func (s *Syncer) requestFiles(ctx context.Context, metadataSetAt time.Time) {
	worker := func() {
		for {
			s.mtx.Lock()
			if metadataSetAt != s.metadataSetAt {
				// newer metadata has superseded the one this worker was started for
				s.mtx.Unlock()
				break
			}
			if len(s.expectedChecksums) == len(s.pendingFiles)+len(s.syncedFiles) {
				// even if there are still pending items, there should be enough
				// workers to handle them given one worker can have at most one
				// pending item at a time
				s.mtx.Unlock()
				break
			}
			var picked string
			for filename := range s.expectedChecksums {
				_, pending := s.pendingFiles[filename]
				_, synced := s.syncedFiles[filename]
				if pending || synced {
					continue
				}
				picked = filename
				break
			}
			s.pendingFiles[picked] = struct{}{}
			completionSignal := make(chan struct{}, 1)
			s.completionSignals[picked] = completionSignal
			if err := s.fileRequestFn(ctx, s.peersToSync[0], s.heightToSync, picked); err != nil {
				s.logger.Error(fmt.Sprintf("failed to request file %s: %s", picked, err))
			}
			s.mtx.Unlock()

			// a one-shot Timer for the per-file timeout; deferring a Ticker's
			// Stop inside the loop would pile up defers for the worker's lifetime
			timer := time.NewTimer(s.fileWorkerTimeout)
			select {
			case <-completionSignal:

			case <-timer.C:
				// timed out: make the file requestable again by another worker
				s.mtx.Lock()
				delete(s.pendingFiles, picked)
				s.mtx.Unlock()

			case <-ctx.Done():
				timer.Stop()
				return
			}

			timer.Stop()
		}
	}
	for i := 0; i < s.fileWorkerCount; i++ {
		go worker()
	}
}

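// popFile removes and returns the oldest queued file response, or nil if the
// queue is empty.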
func (s *Syncer) popFile() *dstypes.FileResponse {
	s.mtx.Lock()
	defer s.mtx.Unlock()

	if len(s.fileQueue) == 0 {
		return nil
	}

	file := s.fileQueue[0]
	s.fileQueue = s.fileQueue[1:]
	return file
}

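// PushFile enqueues a file response received from a peer; Process picks it up
// on its next iteration.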
func (s *Syncer) PushFile(file *dstypes.FileResponse) {
	s.mtx.Lock()
	defer s.mtx.Unlock()

	s.fileQueue = append(s.fileQueue, file)
}