github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/ledger/complete/compactor.go

package complete

import (
    "context"
    "errors"
    "fmt"
    "time"

    "github.com/rs/zerolog"
    "go.uber.org/atomic"
    "golang.org/x/sync/semaphore"

    "github.com/onflow/flow-go/ledger"
    "github.com/onflow/flow-go/ledger/complete/mtrie/trie"
    realWAL "github.com/onflow/flow-go/ledger/complete/wal"
    "github.com/onflow/flow-go/module"
    "github.com/onflow/flow-go/module/lifecycle"
    "github.com/onflow/flow-go/module/observable"
)

// WALTrieUpdate is a message communicated through a channel between Ledger and Compactor.
type WALTrieUpdate struct {
    Update   *ledger.TrieUpdate // Update data needs to be encoded and saved in WAL.
    ResultCh chan<- error       // ResultCh channel is used to send WAL update result from Compactor to Ledger.
    TrieCh   <-chan *trie.MTrie // TrieCh channel is used to send new trie from Ledger to Compactor.
}

// checkpointResult is a message to communicate the checkpoint number and error, if any.
type checkpointResult struct {
    num int
    err error
}

// Compactor is a long-running goroutine responsible for:
//   - writing WAL records from trie updates,
//   - starting checkpointing asynchronously when enough segments are finalized.
//
// Compactor communicates with Ledger through channels
// to ensure that by the end of any trie update processing,
// the update is written to the WAL and the new trie is pushed to the trie queue.
//
// Compactor stores pointers to tries of the ledger state in a fixed-size
// checkpointing queue (FIFO). The checkpointing queue is decoupled from the
// main ledger state to allow separate optimization and looser coupling.
// CAUTION: If the forest LRU Cache is used for the main state,
// then the ledger state and the checkpointing queue may contain different tries.
// This will be resolved automatically after the forest LRU Cache
// (code outside checkpointing) is replaced by something like a FIFO queue.
type Compactor struct {
    checkpointer                         *realWAL.Checkpointer
    wal                                  realWAL.LedgerWAL
    trieQueue                            *realWAL.TrieQueue
    logger                               zerolog.Logger
    lm                                   *lifecycle.LifecycleManager
    observers                            map[observable.Observer]struct{}
    checkpointDistance                   uint
    checkpointsToKeep                    uint
    stopCh                               chan chan struct{}
    trieUpdateCh                         <-chan *WALTrieUpdate
    triggerCheckpointOnNextSegmentFinish *atomic.Bool // to trigger checkpoint manually
    metrics                              module.WALMetrics
}
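
// The following is a minimal wiring sketch, not part of the original source, showing how a
// caller might construct and run a Compactor using the exported API in this file. The ledger
// `l`, the WAL `w`, and the metrics collector `walMetrics` are assumed to be constructed
// elsewhere; the capacity/distance/keep values are placeholders.
//
//    trigger := atomic.NewBool(false)
//    compactor, err := NewCompactor(l, w, zerolog.Nop(), 500, 10, 5, trigger, walMetrics)
//    if err != nil {
//        return err // failure to create the Compactor blocks ledger updates; treat as unrecoverable
//    }
//    <-compactor.Ready()                   // starts the run() goroutine
//    defer func() { <-compactor.Done() }() // stops the goroutine and shuts down the WAL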

// NewCompactor creates a new Compactor which writes WAL records and triggers
// checkpointing asynchronously when enough segments are finalized.
// The checkpointDistance is a flag that specifies how many segments need to
// be finalized to trigger checkpointing. However, if a prior checkpointing
// is already running and not finished, then more segments than specified
// could be accumulated for the new checkpointing (to reduce memory).
// All returned errors indicate that the Compactor can't be created.
// Since failure to create the Compactor will end up blocking ledger updates,
// the caller should handle all returned errors as unrecoverable.
func NewCompactor(
    l *Ledger,
    w realWAL.LedgerWAL,
    logger zerolog.Logger,
    checkpointCapacity uint,
    checkpointDistance uint,
    checkpointsToKeep uint,
    triggerCheckpointOnNextSegmentFinish *atomic.Bool,
    metrics module.WALMetrics,
) (*Compactor, error) {
    if checkpointDistance < 1 {
        checkpointDistance = 1
    }

    checkpointer, err := w.NewCheckpointer()
    if err != nil {
        return nil, err
    }

    // Get the trieUpdateCh channel to communicate the trie update, WAL result, and new trie
    // created from the update.
    trieUpdateCh := l.TrieUpdateChan()
    if trieUpdateCh == nil {
        return nil, errors.New("failed to get valid trie update channel from ledger")
    }

    // Get all tries from ledger state.
    tries, err := l.Tries()
    if err != nil {
        return nil, err
    }

    // Create trieQueue with initial values from ledger state.
    trieQueue := realWAL.NewTrieQueueWithValues(checkpointCapacity, tries)

    return &Compactor{
        checkpointer:                         checkpointer,
        wal:                                  w,
        trieQueue:                            trieQueue,
        logger:                               logger.With().Str("ledger_mod", "compactor").Logger(),
        stopCh:                               make(chan chan struct{}),
        trieUpdateCh:                         trieUpdateCh,
        observers:                            make(map[observable.Observer]struct{}),
        lm:                                   lifecycle.NewLifecycleManager(),
        checkpointDistance:                   checkpointDistance,
        checkpointsToKeep:                    checkpointsToKeep,
        triggerCheckpointOnNextSegmentFinish: triggerCheckpointOnNextSegmentFinish,
        metrics:                              metrics,
    }, nil
}

// Subscribe subscribes observer to Compactor.
func (c *Compactor) Subscribe(observer observable.Observer) {
    var void struct{}
    c.observers[observer] = void
}

// Unsubscribe unsubscribes observer from Compactor.
func (c *Compactor) Unsubscribe(observer observable.Observer) {
    delete(c.observers, observer)
}

// Ready returns a channel which is closed when the Compactor goroutine starts.
func (c *Compactor) Ready() <-chan struct{} {
    c.lm.OnStart(func() {
        go c.run()
    })
    return c.lm.Started()
}

// Done returns a channel which is closed when the Compactor goroutine exits.
func (c *Compactor) Done() <-chan struct{} {
    c.lm.OnStop(func() {
        // Signal Compactor goroutine to stop
        doneCh := make(chan struct{})
        c.stopCh <- doneCh

        // Wait for Compactor goroutine to stop
        <-doneCh

        // Shut down WAL component.
        // Only shut down the WAL after the Compactor has been shut down, in case it
        // is still writing to WAL files.
        <-c.wal.Done()

        // Notify observers
        for observer := range c.observers {
            observer.OnComplete()
        }
    })
    return c.lm.Stopped()
}
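
// Illustrative numbers (assumed, not from the original code) for the checkpoint scheduling
// performed at the top of run() below: with lastCheckpointNum = 10 and checkpointDistance = 5,
// the next checkpoint is planned for segment 15. If the WAL is already writing to segment 17
// at startup, the plan is clamped up to 17, so a checkpoint is never scheduled behind the
// active segment:
//
//    nextCheckpointNum := lastCheckpointNum + int(c.checkpointDistance) // 10 + 5 = 15
//    if activeSegmentNum > nextCheckpointNum {                          // 17 > 15
//        nextCheckpointNum = activeSegmentNum                           // 17
//    }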

// run writes WAL records from trie updates and starts checkpointing
// asynchronously when enough segments are finalized.
func (c *Compactor) run() {

    // checkpointSem is used to limit checkpointing to one run at a time.
    // If the previous checkpointing isn't finished when enough segments
    // are finalized for the next checkpointing, retry checkpointing
    // again when the next segment is finalized.
    // This avoids having more tries in memory than needed.
    checkpointSem := semaphore.NewWeighted(1)

    checkpointResultCh := make(chan checkpointResult, 1)

    // Get active segment number (opened segment that new records write to).
    // activeSegmentNum is updated when a record is written to a new segment.
    _, activeSegmentNum, err := c.wal.Segments()
    if err != nil {
        c.logger.Error().Err(err).Msg("compactor failed to get active segment number")
        activeSegmentNum = -1
    }

    lastCheckpointNum, err := c.checkpointer.LatestCheckpoint()
    if err != nil {
        c.logger.Error().Err(err).Msg("compactor failed to get last checkpoint number")
        lastCheckpointNum = -1
    }

    // Compute next checkpoint number.
    // nextCheckpointNum is updated when checkpointing starts, fails to start, or fails.
    // NOTE: the next checkpoint number must be >= the active segment number.
    nextCheckpointNum := lastCheckpointNum + int(c.checkpointDistance)
    if activeSegmentNum > nextCheckpointNum {
        nextCheckpointNum = activeSegmentNum
    }

    ctx, cancel := context.WithCancel(context.Background())

Loop:
    for {
        select {

        case doneCh := <-c.stopCh:
            defer close(doneCh)
            cancel()
            break Loop

        case checkpointResult := <-checkpointResultCh:
            if checkpointResult.err != nil {
                c.logger.Error().Err(checkpointResult.err).Msg(
                    "compactor failed to create or remove checkpoint",
                )
                var createError *createCheckpointError
                if errors.As(checkpointResult.err, &createError) {
                    // Retry checkpointing when the active segment is finalized.
                    nextCheckpointNum = activeSegmentNum
                }
            }

        case update, ok := <-c.trieUpdateCh:
            if !ok {
                // trieUpdateCh channel is closed.
                // Wait for stop signal from c.stopCh
                continue
            }

            // Listen to signals from the admin tool in order to trigger a checkpoint
            // when the current segment file is finished.
            if c.triggerCheckpointOnNextSegmentFinish.CompareAndSwap(true, false) {
                // Sanity check: usually nextCheckpointNum is a segment number in the future;
                // when activeSegmentNum finishes and reaches nextCheckpointNum, the checkpoint
                // is triggered. Pulling nextCheckpointNum down to activeSegmentNum forces the
                // checkpoint to trigger as soon as the active segment is finished.
                if nextCheckpointNum >= activeSegmentNum {
                    originalNextCheckpointNum := nextCheckpointNum
                    nextCheckpointNum = activeSegmentNum
                    c.logger.Info().Msgf("compactor will trigger checkpoint once it finishes writing segment %v, originalNextCheckpointNum: %v", nextCheckpointNum, originalNextCheckpointNum)
                } else {
                    c.logger.Warn().Msgf("could not force triggering checkpoint, nextCheckpointNum %v is smaller than activeSegmentNum %v", nextCheckpointNum, activeSegmentNum)
                }
            }

            var checkpointNum int
            var checkpointTries []*trie.MTrie
            activeSegmentNum, checkpointNum, checkpointTries =
                c.processTrieUpdate(update, c.trieQueue, activeSegmentNum, nextCheckpointNum)

            if checkpointTries == nil {
                // Not enough segments for checkpointing (nextCheckpointNum >= activeSegmentNum)
                continue
            }

            // Try to checkpoint
            if checkpointSem.TryAcquire(1) {

                // Compute next checkpoint number
                nextCheckpointNum = checkpointNum + int(c.checkpointDistance)

                go func() {
                    defer checkpointSem.Release(1)
                    err := c.checkpoint(ctx, checkpointTries, checkpointNum)
                    checkpointResultCh <- checkpointResult{checkpointNum, err}
                }()
            } else {
                // Failed to get the semaphore because checkpointing is running.
                // Try again when the active segment is finalized.
                c.logger.Info().Msgf("compactor delayed checkpoint %d because prior checkpointing is ongoing", nextCheckpointNum)
                nextCheckpointNum = activeSegmentNum
            }
        }
    }

    // Drain and process remaining trie updates in the channel.
    c.logger.Info().Msg("Starting draining trie update channel in compactor on shutdown")
    for update := range c.trieUpdateCh {
        _, _, err := c.wal.RecordUpdate(update.Update)
        select {
        case update.ResultCh <- err:
        default:
        }
    }
    c.logger.Info().Msg("Finished draining trie update channel in compactor on shutdown")

    // Don't wait for checkpointing to finish because it might take too long.
}
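
// Illustrative sketch (an assumption, not part of the original code): an operator or admin tool
// can force a checkpoint by flipping the shared flag that run() polls with CompareAndSwap. The
// checkpoint then triggers as soon as the currently active segment file is finished, instead of
// waiting for checkpointDistance segments to accumulate:
//
//    // trigger is the *atomic.Bool that was passed to NewCompactor as
//    // triggerCheckpointOnNextSegmentFinish.
//    trigger.Store(true) // checkpoint once the active segment is finalized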

// checkpoint creates a checkpoint of the tries snapshot,
// deletes prior checkpoint files (if needed), and notifies observers.
// Errors indicate that the checkpoint file can't be created or prior checkpoints can't be removed.
// The caller should handle returned errors by retrying checkpointing when appropriate.
// Since this function is only for checkpointing, the Compactor isn't affected by returned errors.
func (c *Compactor) checkpoint(ctx context.Context, tries []*trie.MTrie, checkpointNum int) error {

    err := createCheckpoint(c.checkpointer, c.logger, tries, checkpointNum, c.metrics)
    if err != nil {
        return &createCheckpointError{num: checkpointNum, err: err}
    }

    // Return if context is canceled.
    select {
    case <-ctx.Done():
        return nil
    default:
    }

    err = cleanupCheckpoints(c.checkpointer, int(c.checkpointsToKeep))
    if err != nil {
        return &removeCheckpointError{err: err}
    }

    if checkpointNum > 0 {
        for observer := range c.observers {
            // Don't notify observers if the context is canceled.
            // observer.OnComplete() is called when Compactor starts shutting down,
            // which may close the channel that observer.OnNext() uses to send data.
            select {
            case <-ctx.Done():
                return nil
            default:
                observer.OnNext(checkpointNum)
            }
        }
    }

    return nil
}

// createCheckpoint creates a checkpoint with the given checkpointNum and tries.
// Errors indicate that the checkpoint file can't be created.
// The caller should handle returned errors by retrying checkpointing when appropriate.
func createCheckpoint(checkpointer *realWAL.Checkpointer, logger zerolog.Logger, tries []*trie.MTrie, checkpointNum int, metrics module.WALMetrics) error {

    logger.Info().Msgf("serializing checkpoint %d with %v tries", checkpointNum, len(tries))

    startTime := time.Now()

    fileName := realWAL.NumberToFilename(checkpointNum)
    err := realWAL.StoreCheckpointV6SingleThread(tries, checkpointer.Dir(), fileName, logger)
    if err != nil {
        return fmt.Errorf("error serializing checkpoint (%d): %w", checkpointNum, err)
    }

    size, err := realWAL.ReadCheckpointFileSize(checkpointer.Dir(), fileName)
    if err != nil {
        return fmt.Errorf("error reading checkpoint file size (%d): %w", checkpointNum, err)
    }

    metrics.ExecutionCheckpointSize(size)

    duration := time.Since(startTime)
    logger.Info().Float64("total_time_s", duration.Seconds()).Msgf("created checkpoint %d", checkpointNum)

    return nil
}
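
// Illustrative numbers (assumed) for the retention policy implemented by cleanupCheckpoints
// below: with checkpointsToKeep = 5 and checkpoint files numbered [3 8 13 18 23 28 33] on disk,
// the two oldest files (3 and 8) are removed so that only the newest five remain. A value of
// checkpointsToKeep = 0 keeps every checkpoint.
//
//    checkpointsToRemove := checkpoints[:len(checkpoints)-checkpointsToKeep] // [3 8]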

// cleanupCheckpoints deletes prior checkpoint files if needed.
// Since the function is side-effect free, all failures are simply a no-op.
func cleanupCheckpoints(checkpointer *realWAL.Checkpointer, checkpointsToKeep int) error {
    // Don't list checkpoints if we keep them all
    if checkpointsToKeep == 0 {
        return nil
    }
    checkpoints, err := checkpointer.Checkpoints()
    if err != nil {
        return fmt.Errorf("cannot list checkpoints: %w", err)
    }
    if len(checkpoints) > int(checkpointsToKeep) {
        // if condition guarantees this never fails
        checkpointsToRemove := checkpoints[:len(checkpoints)-int(checkpointsToKeep)]

        for _, checkpoint := range checkpointsToRemove {
            err := checkpointer.RemoveCheckpoint(checkpoint)
            if err != nil {
                return fmt.Errorf("cannot remove checkpoint %d: %w", checkpoint, err)
            }
        }
    }
    return nil
}

// processTrieUpdate writes the trie update to the WAL, updates activeSegmentNum,
// and returns tries for checkpointing if needed.
// It sends the WAL update result, receives the updated trie, and pushes the updated trie to trieQueue.
// When this function returns, the WAL update is in sync with the trieQueue update.
func (c *Compactor) processTrieUpdate(
    update *WALTrieUpdate,
    trieQueue *realWAL.TrieQueue,
    activeSegmentNum int,
    nextCheckpointNum int,
) (
    _activeSegmentNum int,
    checkpointNum int,
    checkpointTries []*trie.MTrie,
) {

    // RecordUpdate returns the segment number the record was written to.
    // The returned segment number (>= 0) can be
    // - the same as the previous segment number (same segment), or
    // - incremented by 1 from the previous segment number (new segment)
    segmentNum, skipped, updateErr := c.wal.RecordUpdate(update.Update)

    // Send result of WAL update
    update.ResultCh <- updateErr

    // This ensures that the updated trie matches the WAL update.
    defer func() {
        // Wait for updated trie
        trie := <-update.TrieCh
        if trie == nil {
            c.logger.Error().Msg("compactor failed to get updated trie")
            return
        }

        trieQueue.Push(trie)
    }()

    if activeSegmentNum == -1 {
        // Recover from failure to get active segment number at initialization.
        return segmentNum, -1, nil
    }

    if updateErr != nil || skipped || segmentNum == activeSegmentNum {
        return activeSegmentNum, -1, nil
    }

    // In the remaining code: segmentNum > activeSegmentNum

    // The active segment is finalized.

    // Check that the new segment number is incremented by 1
    if segmentNum != activeSegmentNum+1 {
        c.logger.Error().Msg(fmt.Sprintf("compactor got unexpected new segment number %d, want %d", segmentNum, activeSegmentNum+1))
    }

    // Update activeSegmentNum
    prevSegmentNum := activeSegmentNum
    activeSegmentNum = segmentNum

    c.logger.Info().Msgf("finished writing segment file %v, trie update is writing to segment file %v, checkpoint will trigger when segment %v is finished",
        prevSegmentNum, activeSegmentNum, nextCheckpointNum)

    if nextCheckpointNum > prevSegmentNum {
        // Not enough segments for checkpointing
        return activeSegmentNum, -1, nil
    }

    // In the remaining code: nextCheckpointNum == prevSegmentNum

    // Enough segments have been created for checkpointing.

    // Get tries from the checkpoint queue.
    // At this point, the checkpoint queue contains tries up to the
    // last update (last record in the finalized segment).
    // It doesn't include the trie for this update
    // until the updated trie is received and added to trieQueue.
    tries := trieQueue.Tries()

    checkpointNum = nextCheckpointNum

    return activeSegmentNum, checkpointNum, tries
}
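
// Illustrative sketch (an assumption about the Ledger side, which lives outside this file) of
// the per-update handshake that processTrieUpdate participates in: the Ledger sends a
// WALTrieUpdate, waits for the WAL write result on ResultCh, and then hands the freshly updated
// trie to the Compactor over TrieCh so the checkpointing queue stays in sync with the WAL.
// Here trieUpdateCh stands for the Ledger's send side of the channel returned by
// TrieUpdateChan(), and updatedTrie is the trie produced by applying the update:
//
//    resultCh := make(chan error, 1)
//    trieCh := make(chan *trie.MTrie, 1)
//    trieUpdateCh <- &WALTrieUpdate{Update: update, ResultCh: resultCh, TrieCh: trieCh}
//    walErr := <-resultCh  // WAL write outcome from the Compactor
//    trieCh <- updatedTrie // new trie for the Compactor's checkpoint queue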

// createCheckpointError indicates that creating a checkpoint failed.
type createCheckpointError struct {
    num int
    err error
}

func (e *createCheckpointError) Error() string {
    return fmt.Sprintf("cannot create checkpoint %d: %s", e.num, e.err)
}

func (e *createCheckpointError) Unwrap() error { return e.err }

// removeCheckpointError indicates that removing prior checkpoints failed.
type removeCheckpointError struct {
    err error
}

func (e *removeCheckpointError) Error() string {
    return fmt.Sprintf("cannot cleanup checkpoints: %s", e.err)
}

func (e *removeCheckpointError) Unwrap() error { return e.err }
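
// Illustrative sketch (an assumption, not part of the original code) of how a caller of
// checkpoint() could distinguish the two failure modes, mirroring the errors.As check in run():
// creation failures are retried on the next finalized segment, while removal failures are only
// logged.
//
//    var createErr *createCheckpointError
//    var removeErr *removeCheckpointError
//    switch {
//    case errors.As(err, &createErr):
//        // retry checkpointing when the active segment is finalized
//    case errors.As(err, &removeErr):
//        // old checkpoints were not cleaned up; log and continue
//    }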