github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/ledger/complete/compactor.go

package complete

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/rs/zerolog"
	"go.uber.org/atomic"
	"golang.org/x/sync/semaphore"

	"github.com/onflow/flow-go/ledger"
	"github.com/onflow/flow-go/ledger/complete/mtrie/trie"
	realWAL "github.com/onflow/flow-go/ledger/complete/wal"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/lifecycle"
	"github.com/onflow/flow-go/module/observable"
)

// WALTrieUpdate is a message communicated through a channel between the Ledger and the Compactor.
type WALTrieUpdate struct {
	Update   *ledger.TrieUpdate // Update data needs to be encoded and saved in WAL.
	ResultCh chan<- error       // ResultCh is used to send the WAL update result from Compactor to Ledger.
	TrieCh   <-chan *trie.MTrie // TrieCh is used to send the new trie from Ledger to Compactor.
}
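
// The snippet below is a minimal, illustrative sketch of the sender (Ledger) side
// of this exchange; the real Ledger implementation lives elsewhere in this package
// and may differ in details. `update`, `newTrie`, and `trieUpdateCh` are
// hypothetical local names.
//
//	resultCh := make(chan error, 1)
//	trieCh := make(chan *trie.MTrie, 1)
//	trieUpdateCh <- &WALTrieUpdate{Update: update, ResultCh: resultCh, TrieCh: trieCh}
//	walErr := <-resultCh // wait for the Compactor to write the update to the WAL
//	trieCh <- newTrie    // hand the freshly updated trie to the Compactor's checkpointing queue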

// checkpointResult is a message communicating the checkpoint number and the error, if any.
type checkpointResult struct {
	num int
	err error
}

// Compactor is a long-running goroutine responsible for:
// - writing WAL records from trie updates,
// - starting checkpointing asynchronously when enough segments are finalized.
//
// Compactor communicates with Ledger through channels
// to ensure that by the end of any trie update processing,
// the update is written to the WAL and the new trie is pushed to the trie queue.
//
// Compactor stores pointers to tries in ledger state in a fixed-size
// checkpointing queue (FIFO).  The checkpointing queue is decoupled from
// the main ledger state to allow separate optimization and looser coupling.
// CAUTION: If the forest LRU Cache is used for the main state,
// then the ledger state and the checkpointing queue may contain different tries.
// This will be resolved automatically after the forest LRU Cache
// (code outside checkpointing) is replaced by something like a FIFO queue.
type Compactor struct {
	checkpointer                         *realWAL.Checkpointer
	wal                                  realWAL.LedgerWAL
	trieQueue                            *realWAL.TrieQueue
	logger                               zerolog.Logger
	lm                                   *lifecycle.LifecycleManager
	observers                            map[observable.Observer]struct{}
	checkpointDistance                   uint
	checkpointsToKeep                    uint
	stopCh                               chan chan struct{}
	trieUpdateCh                         <-chan *WALTrieUpdate
	triggerCheckpointOnNextSegmentFinish *atomic.Bool // to trigger checkpoint manually
	metrics                              module.WALMetrics
}

// NewCompactor creates a new Compactor which writes WAL records and triggers
// checkpointing asynchronously when enough segments are finalized.
// checkpointDistance specifies how many segments need to be finalized to
// trigger checkpointing.  However, if a prior checkpointing run is still in
// progress, more segments than specified can accumulate before the next
// checkpointing starts (running one checkpoint at a time reduces memory usage).
// All returned errors indicate that Compactor can't be created.
// Since failure to create Compactor will end up blocking ledger updates,
// the caller should handle all returned errors as unrecoverable.
func NewCompactor(
	l *Ledger,
	w realWAL.LedgerWAL,
	logger zerolog.Logger,
	checkpointCapacity uint,
	checkpointDistance uint,
	checkpointsToKeep uint,
	triggerCheckpointOnNextSegmentFinish *atomic.Bool,
	metrics module.WALMetrics,
) (*Compactor, error) {
	if checkpointDistance < 1 {
		checkpointDistance = 1
	}

	checkpointer, err := w.NewCheckpointer()
	if err != nil {
		return nil, err
	}

	// Get trieUpdateCh channel to communicate trieUpdate, WAL result, and new trie
	// created from the update.
	trieUpdateCh := l.TrieUpdateChan()
	if trieUpdateCh == nil {
		return nil, errors.New("failed to get valid trie update channel from ledger")
	}

	// Get all tries from ledger state.
	tries, err := l.Tries()
	if err != nil {
		return nil, err
	}

	// Create trieQueue with initial values from ledger state.
	trieQueue := realWAL.NewTrieQueueWithValues(checkpointCapacity, tries)

	return &Compactor{
		checkpointer:                         checkpointer,
		wal:                                  w,
		trieQueue:                            trieQueue,
		logger:                               logger.With().Str("ledger_mod", "compactor").Logger(),
		stopCh:                               make(chan chan struct{}),
		trieUpdateCh:                         trieUpdateCh,
		observers:                            make(map[observable.Observer]struct{}),
		lm:                                   lifecycle.NewLifecycleManager(),
		checkpointDistance:                   checkpointDistance,
		checkpointsToKeep:                    checkpointsToKeep,
		triggerCheckpointOnNextSegmentFinish: triggerCheckpointOnNextSegmentFinish,
		metrics:                              metrics,
	}, nil
}
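
// A minimal construction/start-up sketch (not taken from this repository's call
// sites): it assumes a *Ledger `l`, a realWAL.LedgerWAL `w`, a zerolog.Logger
// `logger`, and a module.WALMetrics `metrics` already exist, and the
// capacity/distance/keep values are illustrative only.
//
//	compactor, err := NewCompactor(l, w, logger, 500, 100, 1, atomic.NewBool(false), metrics)
//	if err != nil {
//		return err // unrecoverable: without a Compactor, ledger updates would block
//	}
//	<-compactor.Ready()                   // starts the run() goroutine
//	defer func() { <-compactor.Done() }() // stops it and shuts down the WAL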

// Subscribe subscribes an observer to the Compactor.
func (c *Compactor) Subscribe(observer observable.Observer) {
	var void struct{}
	c.observers[observer] = void
}

// Unsubscribe unsubscribes an observer from the Compactor.
func (c *Compactor) Unsubscribe(observer observable.Observer) {
	delete(c.observers, observer)
}

// Ready returns a channel that is closed when the Compactor goroutine starts.
func (c *Compactor) Ready() <-chan struct{} {
	c.lm.OnStart(func() {
		go c.run()
	})
	return c.lm.Started()
}

// Done returns a channel that is closed when the Compactor goroutine exits.
func (c *Compactor) Done() <-chan struct{} {
	c.lm.OnStop(func() {
		// Signal Compactor goroutine to stop
		doneCh := make(chan struct{})
		c.stopCh <- doneCh

		// Wait for Compactor goroutine to stop
		<-doneCh

		// Shut down the WAL component only after the Compactor goroutine has
		// stopped, in case it is still writing to WAL files.
		<-c.wal.Done()

		// Notify observers
		for observer := range c.observers {
			observer.OnComplete()
		}
	})
	return c.lm.Stopped()
}

// run writes WAL records from trie updates and starts checkpointing
// asynchronously when enough segments are finalized.
func (c *Compactor) run() {

	// checkpointSem limits checkpointing to one run at a time.
	// If the previous checkpointing isn't finished when enough segments
	// are finalized for the next checkpointing, checkpointing is retried
	// when the next segment is finalized.
	// This avoids having more tries in memory than needed.
	checkpointSem := semaphore.NewWeighted(1)

	checkpointResultCh := make(chan checkpointResult, 1)

	// Get the active segment number (the open segment that new records are written to).
	// activeSegmentNum is updated when a record is written to a new segment.
	_, activeSegmentNum, err := c.wal.Segments()
	if err != nil {
		c.logger.Error().Err(err).Msg("compactor failed to get active segment number")
		activeSegmentNum = -1
	}

	lastCheckpointNum, err := c.checkpointer.LatestCheckpoint()
	if err != nil {
		c.logger.Error().Err(err).Msg("compactor failed to get last checkpoint number")
		lastCheckpointNum = -1
	}

	// Compute the next checkpoint number.
	// nextCheckpointNum is updated when checkpointing starts, fails to start, or fails.
	// NOTE: the next checkpoint number must be >= the active segment number.
	nextCheckpointNum := lastCheckpointNum + int(c.checkpointDistance)
	if activeSegmentNum > nextCheckpointNum {
		nextCheckpointNum = activeSegmentNum
	}
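	// Worked example with illustrative numbers: if lastCheckpointNum is 7 and
	// checkpointDistance is 3, the next checkpoint is planned for segment 10;
	// if the active segment were already 12, the plan is bumped to 12 so that
	// nextCheckpointNum never trails the active segment.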

	ctx, cancel := context.WithCancel(context.Background())

Loop:
	for {
		select {

		case doneCh := <-c.stopCh:
			defer close(doneCh)
			cancel()
			break Loop

		case checkpointResult := <-checkpointResultCh:
			if checkpointResult.err != nil {
				c.logger.Error().Err(checkpointResult.err).Msg(
					"compactor failed to create or remove checkpoint",
				)
				var createError *createCheckpointError
				if errors.As(checkpointResult.err, &createError) {
					// Retry checkpointing when the active segment is finalized.
					nextCheckpointNum = activeSegmentNum
				}
			}

		case update, ok := <-c.trieUpdateCh:
			if !ok {
				// trieUpdateCh channel is closed.
				// Wait for stop signal from c.stopCh
				continue
			}

			// Listen for signals from the admin tool to trigger a checkpoint when the current segment file is finished.
			if c.triggerCheckpointOnNextSegmentFinish.CompareAndSwap(true, false) {
				// Sanity check: normally nextCheckpointNum is a future segment number, and a checkpoint
				// is triggered once the active segment catches up to it. Forcing the trigger lowers
				// nextCheckpointNum to the active segment so the checkpoint starts at the next segment switch.
				if nextCheckpointNum >= activeSegmentNum {
					originalNextCheckpointNum := nextCheckpointNum
					nextCheckpointNum = activeSegmentNum
					c.logger.Info().Msgf("compactor will trigger checkpoint once segment %v is finished, originalNextCheckpointNum: %v", nextCheckpointNum, originalNextCheckpointNum)
				} else {
					c.logger.Warn().Msgf("could not force checkpoint trigger: nextCheckpointNum %v is smaller than activeSegmentNum %v", nextCheckpointNum, activeSegmentNum)
				}
			}
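
			// Illustrative example: with activeSegmentNum = 20 and nextCheckpointNum = 25,
			// a forced trigger lowers nextCheckpointNum to 20, so a checkpoint starts as soon
			// as segment 20 is finalized instead of waiting for segment 25.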

			var checkpointNum int
			var checkpointTries []*trie.MTrie
			activeSegmentNum, checkpointNum, checkpointTries =
				c.processTrieUpdate(update, c.trieQueue, activeSegmentNum, nextCheckpointNum)

			if checkpointTries == nil {
				// Not enough segments for checkpointing (nextCheckpointNum >= activeSegmentNum)
				continue
			}

			// Try to checkpoint
			if checkpointSem.TryAcquire(1) {

				// Compute next checkpoint number
				nextCheckpointNum = checkpointNum + int(c.checkpointDistance)

				go func() {
					defer checkpointSem.Release(1)
					err := c.checkpoint(ctx, checkpointTries, checkpointNum)
					checkpointResultCh <- checkpointResult{checkpointNum, err}
				}()
			} else {
				// Failed to get semaphore because checkpointing is running.
				// Try again when active segment is finalized.
				c.logger.Info().Msgf("compactor delayed checkpoint %d because prior checkpointing is ongoing", nextCheckpointNum)
				nextCheckpointNum = activeSegmentNum
			}
		}
	}

	// Drain and process remaining trie updates in the channel.
	c.logger.Info().Msg("Starting to drain trie update channel in compactor on shutdown")
	for update := range c.trieUpdateCh {
		_, _, err := c.wal.RecordUpdate(update.Update)
		select {
		case update.ResultCh <- err:
		default:
		}
	}
	c.logger.Info().Msg("Finished draining trie update channel in compactor on shutdown")

	// Don't wait for checkpointing to finish because it might take too long.
}

// checkpoint creates a checkpoint of the tries snapshot,
// deletes prior checkpoint files (if needed), and notifies observers.
// Errors indicate that the checkpoint file can't be created or prior checkpoints can't be removed.
// The caller should handle returned errors by retrying checkpointing when appropriate.
// Since this function only performs checkpointing, a returned error doesn't affect the Compactor itself.
func (c *Compactor) checkpoint(ctx context.Context, tries []*trie.MTrie, checkpointNum int) error {

	err := createCheckpoint(c.checkpointer, c.logger, tries, checkpointNum, c.metrics)
	if err != nil {
		return &createCheckpointError{num: checkpointNum, err: err}
	}

	// Return if context is canceled.
	select {
	case <-ctx.Done():
		return nil
	default:
	}

	err = cleanupCheckpoints(c.checkpointer, int(c.checkpointsToKeep))
	if err != nil {
		return &removeCheckpointError{err: err}
	}

	if checkpointNum > 0 {
		for observer := range c.observers {
			// Don't notify observer if context is canceled.
			// observer.OnComplete() is called when Compactor starts shutting down,
			// which may close the channel that observer.OnNext() uses to send data.
			select {
			case <-ctx.Done():
				return nil
			default:
				observer.OnNext(checkpointNum)
			}
		}
	}

	return nil
}

// createCheckpoint creates a checkpoint with the given checkpointNum and tries.
// Errors indicate that the checkpoint file can't be created.
// The caller should handle returned errors by retrying checkpointing when appropriate.
func createCheckpoint(checkpointer *realWAL.Checkpointer, logger zerolog.Logger, tries []*trie.MTrie, checkpointNum int, metrics module.WALMetrics) error {

	logger.Info().Msgf("serializing checkpoint %d with %v tries", checkpointNum, len(tries))

	startTime := time.Now()

	fileName := realWAL.NumberToFilename(checkpointNum)
	err := realWAL.StoreCheckpointV6SingleThread(tries, checkpointer.Dir(), fileName, logger)
	if err != nil {
		return fmt.Errorf("error serializing checkpoint (%d): %w", checkpointNum, err)
	}

	size, err := realWAL.ReadCheckpointFileSize(checkpointer.Dir(), fileName)
	if err != nil {
		return fmt.Errorf("error reading checkpoint file size (%d): %w", checkpointNum, err)
	}

	metrics.ExecutionCheckpointSize(size)

	duration := time.Since(startTime)
	logger.Info().Float64("total_time_s", duration.Seconds()).Msgf("created checkpoint %d", checkpointNum)

	return nil
}

// cleanupCheckpoints deletes prior checkpoint files if needed.
// Failures here have no harmful side effects (at worst, extra checkpoint files
// remain on disk), so the caller can treat them as a no-op.
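// Worked example with illustrative numbers: if checkpointer.Checkpoints()
// returns [2 5 8 11] and checkpointsToKeep is 2, the two oldest checkpoints
// (2 and 5) are removed and the two most recent (8 and 11) are kept.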
func cleanupCheckpoints(checkpointer *realWAL.Checkpointer, checkpointsToKeep int) error {
	// Don't list checkpoints if we keep them all
	if checkpointsToKeep == 0 {
		return nil
	}
	checkpoints, err := checkpointer.Checkpoints()
	if err != nil {
		return fmt.Errorf("cannot list checkpoints: %w", err)
	}
	if len(checkpoints) > int(checkpointsToKeep) {
		// The length check above guarantees the slice expression below is in bounds.
		checkpointsToRemove := checkpoints[:len(checkpoints)-int(checkpointsToKeep)]

		for _, checkpoint := range checkpointsToRemove {
			err := checkpointer.RemoveCheckpoint(checkpoint)
			if err != nil {
				return fmt.Errorf("cannot remove checkpoint %d: %w", checkpoint, err)
			}
		}
	}
	return nil
}

// processTrieUpdate writes the trie update to the WAL, updates activeSegmentNum,
// and returns tries for checkpointing if needed.
// It sends the WAL update result, receives the updated trie, and pushes the updated trie to trieQueue.
// When this function returns, the WAL update is in sync with the trieQueue update.
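//
// Illustrative example (hypothetical numbers): with activeSegmentNum = 9 and
// nextCheckpointNum = 9, an update whose record lands in segment 10 finalizes
// segment 9; the function then returns activeSegmentNum = 10, checkpointNum = 9,
// and the tries currently queued for checkpointing. On all other paths it
// returns checkpointNum = -1 and nil tries.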
func (c *Compactor) processTrieUpdate(
	update *WALTrieUpdate,
	trieQueue *realWAL.TrieQueue,
	activeSegmentNum int,
	nextCheckpointNum int,
) (
	_activeSegmentNum int,
	checkpointNum int,
	checkpointTries []*trie.MTrie,
) {

	// RecordUpdate returns the segment number the record was written to.
	// Returned segment number (>= 0) can be
	// - the same as previous segment number (same segment), or
	// - incremented by 1 from previous segment number (new segment)
	segmentNum, skipped, updateErr := c.wal.RecordUpdate(update.Update)

	// Send result of WAL update
	update.ResultCh <- updateErr

	// This ensures that the updated trie matches the WAL update.
	defer func() {
		// Wait for updated trie
		trie := <-update.TrieCh
		if trie == nil {
			c.logger.Error().Msg("compactor failed to get updated trie")
			return
		}

		trieQueue.Push(trie)
	}()

	if activeSegmentNum == -1 {
		// Recover from failure to get active segment number at initialization.
		return segmentNum, -1, nil
	}

	if updateErr != nil || skipped || segmentNum == activeSegmentNum {
		return activeSegmentNum, -1, nil
	}

	// In the remaining code: segmentNum > activeSegmentNum

	// The active segment has been finalized.

	// Check that the new segment number is incremented by 1
	if segmentNum != activeSegmentNum+1 {
		c.logger.Error().Msg(fmt.Sprintf("compactor got unexpected new segment number %d, want %d", segmentNum, activeSegmentNum+1))
	}

	// Update activeSegmentNum
	prevSegmentNum := activeSegmentNum
	activeSegmentNum = segmentNum

	c.logger.Info().Msgf("finished writing segment file %v, trie updates are now writing to segment file %v, checkpoint will trigger when segment %v is finished",
		prevSegmentNum, activeSegmentNum, nextCheckpointNum)

	if nextCheckpointNum > prevSegmentNum {
		// Not enough segments for checkpointing
		return activeSegmentNum, -1, nil
	}

	// In the remaining code: nextCheckpointNum == prevSegmentNum

	// Enough segments are created for checkpointing

	// Get tries from the checkpointing queue.
	// At this point, the checkpointing queue contains tries up to
	// the last update (the last record in the finalized segment).
	// It doesn't include the trie for this update
	// until the updated trie is received and added to trieQueue.
	tries := trieQueue.Tries()

	checkpointNum = nextCheckpointNum

	return activeSegmentNum, checkpointNum, tries
}

// createCheckpointError indicates that a checkpoint could not be created.
type createCheckpointError struct {
	num int
	err error
}

func (e *createCheckpointError) Error() string {
	return fmt.Sprintf("cannot create checkpoint %d: %s", e.num, e.err)
}

func (e *createCheckpointError) Unwrap() error { return e.err }

// removeCheckpointError indicates that prior checkpoints could not be removed.
type removeCheckpointError struct {
	err error
}

func (e *removeCheckpointError) Error() string {
	return fmt.Sprintf("cannot cleanup checkpoints: %s", e.err)
}

func (e *removeCheckpointError) Unwrap() error { return e.err }
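
// A hedged sketch of how a caller can distinguish the two failure modes of
// checkpoint() (this mirrors the handling in run(); `err` stands for a
// hypothetical error returned by checkpoint()):
//
//	var createErr *createCheckpointError
//	var removeErr *removeCheckpointError
//	switch {
//	case errors.As(err, &createErr):
//		// The checkpoint file was not written; safe to retry when the next segment is finalized.
//	case errors.As(err, &removeErr):
//		// The checkpoint was written; only cleanup of old checkpoint files failed.
//	}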