github.com/ari-anchor/sei-tendermint@v0.0.0-20230519144642-dc826b7b56bb/internal/dbsync/syncer.go (about) 1 package dbsync 2 3 import ( 4 "bytes" 5 "context" 6 "crypto/md5" 7 "errors" 8 "fmt" 9 "io/fs" 10 "os" 11 "path" 12 "strings" 13 "sync" 14 "time" 15 16 "github.com/ari-anchor/sei-tendermint/config" 17 sm "github.com/ari-anchor/sei-tendermint/internal/state" 18 "github.com/ari-anchor/sei-tendermint/libs/log" 19 dstypes "github.com/ari-anchor/sei-tendermint/proto/tendermint/dbsync" 20 "github.com/ari-anchor/sei-tendermint/types" 21 ) 22 23 const ApplicationDBSubdirectory = "application.db" 24 25 // TODO: this is bad as TM shouldn't be aware of wasm. DB sync/restore logic should ideally happen 26 // on application-level (i.e. Cosmos layer) and communicate to TM via new ABCI methods 27 const WasmDirectory = "wasm/wasm/state/wasm" 28 const WasmSuffix = "_wasm" 29 const LockFile = "LOCK" 30 31 type Syncer struct { 32 mtx *sync.RWMutex 33 logger log.Logger 34 35 active bool 36 heightToSync uint64 37 peersToSync []types.NodeID 38 expectedChecksums map[string][]byte 39 pendingFiles map[string]struct{} 40 syncedFiles map[string]struct{} 41 completionSignals map[string]chan struct{} 42 metadataSetAt time.Time 43 timeoutInSeconds time.Duration 44 fileQueue []*dstypes.FileResponse 45 applicationDBDirectory string 46 wasmStateDirectory string 47 sleepInSeconds time.Duration 48 fileWorkerCount int 49 fileWorkerTimeout time.Duration 50 fileWorkerCancelFn context.CancelFunc 51 52 metadataRequestFn func(context.Context) error 53 fileRequestFn func(context.Context, types.NodeID, uint64, string) error 54 commitStateFn func(context.Context, uint64) (sm.State, *types.Commit, error) 55 postSyncFn func(context.Context, sm.State, *types.Commit) error 56 resetDirFn func(*Syncer) 57 58 state sm.State 59 commit *types.Commit 60 } 61 62 func defaultResetDirFn(s *Syncer) { 63 os.RemoveAll(s.applicationDBDirectory) 64 os.MkdirAll(s.applicationDBDirectory, fs.ModePerm) 65 os.RemoveAll(s.wasmStateDirectory) 66 os.MkdirAll(s.wasmStateDirectory, fs.ModePerm) 67 } 68 69 func NewSyncer( 70 logger log.Logger, 71 dbsyncConfig config.DBSyncConfig, 72 baseConfig config.BaseConfig, 73 enable bool, 74 metadataRequestFn func(context.Context) error, 75 fileRequestFn func(context.Context, types.NodeID, uint64, string) error, 76 commitStateFn func(context.Context, uint64) (sm.State, *types.Commit, error), 77 postSyncFn func(context.Context, sm.State, *types.Commit) error, 78 resetDirFn func(*Syncer), 79 ) *Syncer { 80 return &Syncer{ 81 logger: logger, 82 active: enable, 83 timeoutInSeconds: time.Duration(dbsyncConfig.TimeoutInSeconds) * time.Second, 84 fileQueue: []*dstypes.FileResponse{}, 85 applicationDBDirectory: path.Join(baseConfig.DBDir(), ApplicationDBSubdirectory), 86 wasmStateDirectory: path.Join(baseConfig.RootDir, WasmDirectory), 87 sleepInSeconds: time.Duration(dbsyncConfig.NoFileSleepInSeconds) * time.Second, 88 fileWorkerCount: dbsyncConfig.FileWorkerCount, 89 fileWorkerTimeout: time.Duration(dbsyncConfig.FileWorkerTimeout) * time.Second, 90 metadataRequestFn: metadataRequestFn, 91 fileRequestFn: fileRequestFn, 92 commitStateFn: commitStateFn, 93 postSyncFn: postSyncFn, 94 resetDirFn: resetDirFn, 95 mtx: &sync.RWMutex{}, 96 } 97 } 98 99 func (s *Syncer) SetMetadata(ctx context.Context, sender types.NodeID, metadata *dstypes.MetadataResponse) { 100 s.mtx.RLock() 101 102 if !s.active { 103 s.mtx.RUnlock() 104 return 105 } 106 s.mtx.RUnlock() 107 108 if len(metadata.Filenames) != len(metadata.Md5Checksum) { 109 s.logger.Error("received bad metadata with inconsistent files and checksums count") 110 return 111 } 112 113 timedOut, now := s.isCurrentMetadataTimedOut() 114 s.mtx.Lock() 115 defer s.mtx.Unlock() 116 if timedOut { 117 if s.fileWorkerCancelFn != nil { 118 s.fileWorkerCancelFn() 119 } 120 121 state, commit, err := s.commitStateFn(ctx, metadata.Height) 122 if err != nil { 123 return 124 } 125 s.state = state 126 s.commit = commit 127 s.metadataSetAt = now 128 s.heightToSync = metadata.Height 129 s.expectedChecksums = map[string][]byte{} 130 s.syncedFiles = map[string]struct{}{} 131 s.pendingFiles = map[string]struct{}{} 132 s.completionSignals = map[string]chan struct{}{} 133 for i, filename := range metadata.Filenames { 134 if filename == LockFile { 135 // ignore lockfile 136 continue 137 } 138 s.expectedChecksums[filename] = metadata.Md5Checksum[i] 139 } 140 s.fileQueue = []*dstypes.FileResponse{} 141 s.peersToSync = []types.NodeID{sender} 142 s.resetDirFn(s) 143 144 cancellableCtx, cancel := context.WithCancel(ctx) 145 s.fileWorkerCancelFn = cancel 146 s.requestFiles(cancellableCtx, s.metadataSetAt) 147 } else if metadata.Height == s.heightToSync { 148 s.peersToSync = append(s.peersToSync, sender) 149 } 150 } 151 152 func (s *Syncer) Process(ctx context.Context) { 153 for { 154 s.mtx.RLock() 155 if !s.active { 156 s.logger.Info(fmt.Sprintf("sync for height %d with %d files finished!", s.heightToSync, len(s.expectedChecksums))) 157 s.mtx.RUnlock() 158 break 159 } 160 s.mtx.RUnlock() 161 timedOut, _ := s.isCurrentMetadataTimedOut() 162 if timedOut { 163 s.logger.Info(fmt.Sprintf("last metadata has timed out; sleeping for %f seconds", s.sleepInSeconds.Seconds())) 164 s.metadataRequestFn(ctx) 165 time.Sleep(s.sleepInSeconds) 166 continue 167 } 168 file := s.popFile() 169 if file == nil { 170 s.mtx.RLock() 171 numSynced := len(s.syncedFiles) 172 numTotal := len(s.expectedChecksums) 173 s.mtx.RUnlock() 174 s.logger.Info(fmt.Sprintf("no file to sync; sync'ed %d out of %d so far; sleeping for %f seconds", numSynced, numTotal, s.sleepInSeconds.Seconds())) 175 time.Sleep(s.sleepInSeconds) 176 continue 177 } 178 if err := s.processFile(ctx, file); err != nil { 179 s.logger.Error(err.Error()) 180 } 181 } 182 } 183 184 func (s *Syncer) Stop() { 185 s.mtx.Lock() 186 defer s.mtx.Unlock() 187 if s.active { 188 s.resetDirFn(s) 189 s.active = false 190 } 191 } 192 193 func (s *Syncer) processFile(ctx context.Context, file *dstypes.FileResponse) error { 194 s.mtx.Lock() 195 defer s.mtx.Unlock() 196 defer func() { 197 delete(s.pendingFiles, file.Filename) 198 }() 199 200 if file.Height != s.heightToSync { 201 return fmt.Errorf("current height is %d but received file for height %d", s.heightToSync, file.Height) 202 } 203 204 if expectedChecksum, ok := s.expectedChecksums[file.Filename]; !ok { 205 return fmt.Errorf("received unexpected file %s", file.Filename) 206 } else if _, ok := s.syncedFiles[file.Filename]; ok { 207 return fmt.Errorf("received duplicate file %s", file.Filename) 208 } else if _, ok := s.pendingFiles[file.Filename]; !ok { 209 return fmt.Errorf("received unrequested file %s", file.Filename) 210 } else { 211 checkSum := md5.Sum(file.Data) 212 if !bytes.Equal(checkSum[:], expectedChecksum) { 213 return errors.New("received unexpected checksum") 214 } 215 } 216 217 var dbFile *os.File 218 var err error 219 if strings.HasSuffix(file.Filename, WasmSuffix) { 220 dbFile, err = os.Create(path.Join(s.wasmStateDirectory, strings.TrimSuffix(file.Filename, WasmSuffix))) 221 } else { 222 dbFile, err = os.Create(path.Join(s.applicationDBDirectory, file.Filename)) 223 } 224 if err != nil { 225 return err 226 } 227 defer dbFile.Close() 228 _, err = dbFile.Write(file.Data) 229 if err != nil { 230 return err 231 } 232 233 s.syncedFiles[file.Filename] = struct{}{} 234 if len(s.syncedFiles) == len(s.expectedChecksums) { 235 // we have finished syncing 236 if err := s.postSyncFn(ctx, s.state, s.commit); err != nil { 237 // no graceful way to handle postsync error since we might be in a partially updated state 238 panic(err) 239 } 240 s.active = false 241 } 242 s.completionSignals[file.Filename] <- struct{}{} 243 return nil 244 } 245 246 func (s *Syncer) isCurrentMetadataTimedOut() (bool, time.Time) { 247 s.mtx.RLock() 248 defer s.mtx.RUnlock() 249 now := time.Now() 250 if s.metadataSetAt.IsZero() { 251 return true, now 252 } 253 return now.After(s.metadataSetAt.Add(s.timeoutInSeconds)), now 254 } 255 256 func (s *Syncer) requestFiles(ctx context.Context, metadataSetAt time.Time) { 257 worker := func() { 258 for { 259 s.mtx.Lock() 260 if metadataSetAt != s.metadataSetAt { 261 s.mtx.Unlock() 262 break 263 } 264 if len(s.expectedChecksums) == len(s.pendingFiles)+len(s.syncedFiles) { 265 // even if there are still pending items, there should be enough 266 // workers to handle them given one worker can have at most one 267 // pending item at a time 268 s.mtx.Unlock() 269 break 270 } 271 var picked string 272 for filename := range s.expectedChecksums { 273 _, pending := s.pendingFiles[filename] 274 _, synced := s.syncedFiles[filename] 275 if pending || synced { 276 continue 277 } 278 picked = filename 279 break 280 } 281 s.pendingFiles[picked] = struct{}{} 282 completionSignal := make(chan struct{}, 1) 283 s.completionSignals[picked] = completionSignal 284 s.fileRequestFn(ctx, s.peersToSync[0], s.heightToSync, picked) 285 s.mtx.Unlock() 286 287 ticker := time.NewTicker(s.fileWorkerTimeout) 288 defer ticker.Stop() 289 290 select { 291 case <-completionSignal: 292 293 case <-ticker.C: 294 s.mtx.Lock() 295 delete(s.pendingFiles, picked) 296 s.mtx.Unlock() 297 298 case <-ctx.Done(): 299 return 300 } 301 302 ticker.Stop() 303 } 304 } 305 for i := 0; i < s.fileWorkerCount; i++ { 306 go worker() 307 } 308 } 309 310 func (s *Syncer) popFile() *dstypes.FileResponse { 311 s.mtx.Lock() 312 defer s.mtx.Unlock() 313 314 if len(s.fileQueue) == 0 { 315 return nil 316 } 317 318 file := s.fileQueue[0] 319 s.fileQueue = s.fileQueue[1:] 320 return file 321 } 322 323 func (s *Syncer) PushFile(file *dstypes.FileResponse) { 324 s.mtx.Lock() 325 defer s.mtx.Unlock() 326 327 s.fileQueue = append(s.fileQueue, file) 328 }