// github.com/koko1123/flow-go-1@v0.29.6/ledger/complete/compactor.go

package complete

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/rs/zerolog"
	"go.uber.org/atomic"
	"golang.org/x/sync/semaphore"

	"github.com/koko1123/flow-go-1/ledger"
	"github.com/koko1123/flow-go-1/ledger/complete/mtrie/trie"
	realWAL "github.com/koko1123/flow-go-1/ledger/complete/wal"
	"github.com/koko1123/flow-go-1/module/lifecycle"
	"github.com/koko1123/flow-go-1/module/observable"
)
// WALTrieUpdate is a message communicated through a channel between Ledger and Compactor.
type WALTrieUpdate struct {
	Update   *ledger.TrieUpdate // Update data needs to be encoded and saved in WAL.
	ResultCh chan<- error       // ResultCh channel is used to send WAL update result from Compactor to Ledger.
	TrieCh   <-chan *trie.MTrie // TrieCh channel is used to send new trie from Ledger to Compactor.
}
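
// The sketch below is illustrative and not part of the original source: it shows,
// from the Ledger side, how the ResultCh/TrieCh handshake around a WALTrieUpdate
// is expected to flow. The function exampleSendTrieUpdate and its parameters are
// hypothetical; the real sender lives in Ledger (see TrieUpdateChan) and the
// receiver in Compactor.processTrieUpdate.
func exampleSendTrieUpdate(
	trieUpdateCh chan<- *WALTrieUpdate,
	update *ledger.TrieUpdate,
	newTrie *trie.MTrie,
) error {
	resultCh := make(chan error, 1)
	trieCh := make(chan *trie.MTrie, 1)

	// 1. Hand the update to the Compactor so it can write the WAL record.
	trieUpdateCh <- &WALTrieUpdate{Update: update, ResultCh: resultCh, TrieCh: trieCh}

	// 2. Wait for the WAL write result from the Compactor.
	walErr := <-resultCh

	// 3. Send the trie produced by applying the update; the Compactor pushes it
	// onto its checkpointing queue (see processTrieUpdate).
	trieCh <- newTrie

	return walErr
}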

// checkpointResult is a message to communicate the checkpoint number and error, if any.
type checkpointResult struct {
	num int
	err error
}

// Compactor is a long-running goroutine responsible for:
// - writing WAL records from trie updates,
// - starting checkpointing asynchronously when enough segments are finalized.
//
// Compactor communicates with Ledger through channels
// to ensure that by the end of any trie update processing,
// the update is written to the WAL and the new trie is pushed to the trie queue.
//
// Compactor stores pointers to tries in ledger state in a fixed-size
// checkpointing queue (FIFO).  Checkpointing queue is decoupled from
// main ledger state to allow separate optimization and looser coupling, etc.
// CAUTION: If the forest LRU Cache is used for main state,
// then ledger state and checkpointing queue may contain different tries.
// This will be resolved automatically after the forest LRU Cache
// (code outside checkpointing) is replaced by something like a FIFO queue.
type Compactor struct {
	checkpointer                         *realWAL.Checkpointer
	wal                                  realWAL.LedgerWAL
	trieQueue                            *realWAL.TrieQueue
	logger                               zerolog.Logger
	lm                                   *lifecycle.LifecycleManager
	observers                            map[observable.Observer]struct{}
	checkpointDistance                   uint
	checkpointsToKeep                    uint
	stopCh                               chan chan struct{}
	trieUpdateCh                         <-chan *WALTrieUpdate
	triggerCheckpointOnNextSegmentFinish *atomic.Bool // to trigger checkpoint manually
}

// NewCompactor creates a new Compactor which writes WAL records and triggers
// checkpointing asynchronously when enough segments are finalized.
// The checkpointDistance is a flag that specifies how many segments need to
// be finalized to trigger checkpointing.  However, if a prior checkpointing
// is already running and not finished, then more segments than specified
// could be accumulated for the new checkpointing (to reduce memory).
// All returned errors indicate that Compactor can't be created.
// Since failure to create Compactor will end up blocking ledger updates,
// the caller should handle all returned errors as unrecoverable.
func NewCompactor(
	l *Ledger,
	w realWAL.LedgerWAL,
	logger zerolog.Logger,
	checkpointCapacity uint,
	checkpointDistance uint,
	checkpointsToKeep uint,
	triggerCheckpointOnNextSegmentFinish *atomic.Bool,
) (*Compactor, error) {
	if checkpointDistance < 1 {
		checkpointDistance = 1
	}

	checkpointer, err := w.NewCheckpointer()
	if err != nil {
		return nil, err
	}

	// Get trieUpdateCh channel to communicate trieUpdate, WAL result, and new trie
	// created from the update.
	trieUpdateCh := l.TrieUpdateChan()
	if trieUpdateCh == nil {
		return nil, errors.New("failed to get valid trie update channel from ledger")
	}

	// Get all tries from ledger state.
	tries, err := l.Tries()
	if err != nil {
		return nil, err
	}

	// Create trieQueue with initial values from ledger state.
	trieQueue := realWAL.NewTrieQueueWithValues(checkpointCapacity, tries)

	return &Compactor{
		checkpointer:                         checkpointer,
		wal:                                  w,
		trieQueue:                            trieQueue,
		logger:                               logger.With().Str("ledger_mod", "compactor").Logger(),
		stopCh:                               make(chan chan struct{}),
		trieUpdateCh:                         trieUpdateCh,
		observers:                            make(map[observable.Observer]struct{}),
		lm:                                   lifecycle.NewLifecycleManager(),
		checkpointDistance:                   checkpointDistance,
		checkpointsToKeep:                    checkpointsToKeep,
		triggerCheckpointOnNextSegmentFinish: triggerCheckpointOnNextSegmentFinish,
	}, nil
}
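
// The following sketch is illustrative and not part of the original source: it shows
// how a caller might wire a Compactor to an existing Ledger and WAL. The ledger and
// WAL are assumed to be constructed elsewhere, and the numeric parameters are
// placeholder values, not recommended settings.
func exampleNewCompactor(l *Ledger, w realWAL.LedgerWAL) (*Compactor, error) {
	return NewCompactor(
		l,
		w,
		zerolog.Nop(),
		500,                   // checkpointCapacity: tries kept in the checkpointing queue
		100,                   // checkpointDistance: finalized segments per checkpoint
		3,                     // checkpointsToKeep: older checkpoint files are deleted
		atomic.NewBool(false), // no manual checkpoint trigger pending
	)
}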

// Subscribe subscribes observer to Compactor.
func (c *Compactor) Subscribe(observer observable.Observer) {
	var void struct{}
	c.observers[observer] = void
}

// Unsubscribe unsubscribes observer from Compactor.
func (c *Compactor) Unsubscribe(observer observable.Observer) {
	delete(c.observers, observer)
}

// Ready returns a channel which is closed when the Compactor goroutine has started.
func (c *Compactor) Ready() <-chan struct{} {
	c.lm.OnStart(func() {
		go c.run()
	})
	return c.lm.Started()
}

// Done returns a channel which is closed when the Compactor goroutine has exited.
func (c *Compactor) Done() <-chan struct{} {
	c.lm.OnStop(func() {
		// Signal Compactor goroutine to stop
		doneCh := make(chan struct{})
		c.stopCh <- doneCh

		// Wait for Compactor goroutine to stop
		<-doneCh

		// Shut down the WAL component only after the Compactor goroutine has
		// stopped, in case it is still writing to WAL files.
		<-c.wal.Done()

		// Notify observers
		for observer := range c.observers {
			observer.OnComplete()
		}
	})
	return c.lm.Stopped()
}
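
// Illustrative lifecycle sketch, not part of the original source. The observer
// parameter is a hypothetical observable.Observer implementation supplied by the
// caller; it receives OnNext(checkpointNum) after each checkpoint and OnComplete
// on shutdown.
func exampleCompactorLifecycle(c *Compactor, o observable.Observer) {
	c.Subscribe(o)

	// Start the compactor goroutine and wait until it is running.
	<-c.Ready()

	// ... the Ledger sends trie updates while the Compactor writes WAL records
	// and checkpoints in the background ...

	// Signal shutdown and wait for the compactor goroutine (and WAL) to finish.
	<-c.Done()

	c.Unsubscribe(o)
}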

// run writes WAL records from trie updates and starts checkpointing
// asynchronously when enough segments are finalized.
func (c *Compactor) run() {

	// checkpointSem is used to limit checkpointing to one run at a time.
	// If the previous checkpointing isn't finished when enough segments
	// are finalized for the next checkpointing, checkpointing is retried
	// when the next segment is finalized.
	// This avoids having more tries in memory than needed.
	checkpointSem := semaphore.NewWeighted(1)

	checkpointResultCh := make(chan checkpointResult, 1)

	// Get the active segment number (the open segment that new records are written to).
	// activeSegmentNum is updated when a record is written to a new segment.
	_, activeSegmentNum, err := c.wal.Segments()
	if err != nil {
		c.logger.Error().Err(err).Msg("compactor failed to get active segment number")
		activeSegmentNum = -1
	}

	lastCheckpointNum, err := c.checkpointer.LatestCheckpoint()
	if err != nil {
		c.logger.Error().Err(err).Msg("compactor failed to get last checkpoint number")
		lastCheckpointNum = -1
	}

	// Compute next checkpoint number.
	// nextCheckpointNum is updated when checkpointing starts, fails to start, or fails.
	// NOTE: the next checkpoint number must be >= the active segment number.
	nextCheckpointNum := lastCheckpointNum + int(c.checkpointDistance)
	if activeSegmentNum > nextCheckpointNum {
		nextCheckpointNum = activeSegmentNum
	}
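
	// Worked example (illustrative numbers): with lastCheckpointNum = 5 and
	// checkpointDistance = 3, nextCheckpointNum starts at 8. If the WAL is already
	// writing to segment 10, nextCheckpointNum is raised to 10 so that a checkpoint
	// is never scheduled for a segment that has already been finalized.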

	ctx, cancel := context.WithCancel(context.Background())

Loop:
	for {
		select {

		case doneCh := <-c.stopCh:
			defer close(doneCh)
			cancel()
			break Loop

		case checkpointResult := <-checkpointResultCh:
			if checkpointResult.err != nil {
				c.logger.Error().Err(checkpointResult.err).Msg(
					"compactor failed to create or remove checkpoint",
				)
				var createError *createCheckpointError
				if errors.As(checkpointResult.err, &createError) {
					// Retry checkpointing when the active segment is finalized.
					nextCheckpointNum = activeSegmentNum
				}
			}

		case update, ok := <-c.trieUpdateCh:
			if !ok {
				// trieUpdateCh channel is closed.
				// Wait for stop signal from c.stopCh
				continue
			}

			// Listen to signals from the admin tool in order to trigger a checkpoint when the current segment file is finished.
			if c.triggerCheckpointOnNextSegmentFinish.CompareAndSwap(true, false) {
				// Sanity check: nextCheckpointNum is usually a future segment number; a checkpoint
				// is triggered once the active segment reaches nextCheckpointNum and is finalized.
				if nextCheckpointNum >= activeSegmentNum {
					originalNextCheckpointNum := nextCheckpointNum
					nextCheckpointNum = activeSegmentNum
					c.logger.Info().Msgf("compactor will trigger a checkpoint once it finishes writing segment %v, originalNextCheckpointNum: %v", nextCheckpointNum, originalNextCheckpointNum)
				} else {
					c.logger.Warn().Msgf("could not force triggering checkpoint, nextCheckpointNum %v is smaller than activeSegmentNum %v", nextCheckpointNum, activeSegmentNum)
				}
			}

			var checkpointNum int
			var checkpointTries []*trie.MTrie
			activeSegmentNum, checkpointNum, checkpointTries =
				c.processTrieUpdate(update, c.trieQueue, activeSegmentNum, nextCheckpointNum)

			if checkpointTries == nil {
				// Not enough segments for checkpointing (nextCheckpointNum >= activeSegmentNum)
				continue
			}

			// Try to checkpoint
			if checkpointSem.TryAcquire(1) {

				// Compute next checkpoint number
				nextCheckpointNum = checkpointNum + int(c.checkpointDistance)

				go func() {
					defer checkpointSem.Release(1)
					err := c.checkpoint(ctx, checkpointTries, checkpointNum)
					checkpointResultCh <- checkpointResult{checkpointNum, err}
				}()
			} else {
				// Failed to get semaphore because checkpointing is running.
				// Try again when the active segment is finalized.
				c.logger.Info().Msgf("compactor delayed checkpoint %d because prior checkpointing is ongoing", nextCheckpointNum)
				nextCheckpointNum = activeSegmentNum
			}
		}
	}

	// Drain and process remaining trie updates in channel.
	c.logger.Info().Msg("Starting draining trie update channel in compactor on shutdown")
	for update := range c.trieUpdateCh {
		_, _, err := c.wal.RecordUpdate(update.Update)
		select {
		case update.ResultCh <- err:
		default:
		}
	}
	c.logger.Info().Msg("Finished draining trie update channel in compactor on shutdown")

	// Don't wait for checkpointing to finish because it might take too long.
}

// checkpoint creates a checkpoint of the tries snapshot,
// deletes prior checkpoint files (if needed), and notifies observers.
// Errors indicate that the checkpoint file can't be created or prior checkpoints can't be removed.
// Caller should handle returned errors by retrying checkpointing when appropriate.
// Since this function is only for checkpointing, Compactor isn't affected by the returned error.
func (c *Compactor) checkpoint(ctx context.Context, tries []*trie.MTrie, checkpointNum int) error {

	err := createCheckpoint(c.checkpointer, c.logger, tries, checkpointNum)
	if err != nil {
		return &createCheckpointError{num: checkpointNum, err: err}
	}

	// Return if context is canceled.
	select {
	case <-ctx.Done():
		return nil
	default:
	}

	err = cleanupCheckpoints(c.checkpointer, int(c.checkpointsToKeep))
	if err != nil {
		return &removeCheckpointError{err: err}
	}

	if checkpointNum > 0 {
		for observer := range c.observers {
			// Don't notify observer if context is canceled.
			// observer.OnComplete() is called when Compactor starts shutting down,
			// which may close the channel that observer.OnNext() uses to send data.
			select {
			case <-ctx.Done():
				return nil
			default:
				observer.OnNext(checkpointNum)
			}
		}
	}

	return nil
}

// createCheckpoint creates a checkpoint with the given checkpointNum and tries.
// Errors indicate that the checkpoint file can't be created.
// Caller should handle returned errors by retrying checkpointing when appropriate.
func createCheckpoint(checkpointer *realWAL.Checkpointer, logger zerolog.Logger, tries []*trie.MTrie, checkpointNum int) error {

	logger.Info().Msgf("serializing checkpoint %d with %v tries", checkpointNum, len(tries))

	startTime := time.Now()

	fileName := realWAL.NumberToFilename(checkpointNum)
	err := realWAL.StoreCheckpointV6SingleThread(tries, checkpointer.Dir(), fileName, &logger)
	if err != nil {
		return fmt.Errorf("error serializing checkpoint (%d): %w", checkpointNum, err)
	}

	duration := time.Since(startTime)
	logger.Info().Float64("total_time_s", duration.Seconds()).Msgf("created checkpoint %d", checkpointNum)

	return nil
}

// cleanupCheckpoints deletes prior checkpoint files if needed.
// Failures are non-fatal: any checkpoints not removed now will be removed after a later checkpoint.
func cleanupCheckpoints(checkpointer *realWAL.Checkpointer, checkpointsToKeep int) error {
	// Don't list checkpoints if we keep them all
	if checkpointsToKeep == 0 {
		return nil
	}
	checkpoints, err := checkpointer.Checkpoints()
	if err != nil {
		return fmt.Errorf("cannot list checkpoints: %w", err)
	}
	if len(checkpoints) > checkpointsToKeep {
		// The if-condition above guarantees the slice expression below is in bounds.
		checkpointsToRemove := checkpoints[:len(checkpoints)-checkpointsToKeep]

		for _, checkpoint := range checkpointsToRemove {
			err := checkpointer.RemoveCheckpoint(checkpoint)
			if err != nil {
				return fmt.Errorf("cannot remove checkpoint %d: %w", checkpoint, err)
			}
		}
	}
	return nil
}
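
// For example (illustrative numbers): with checkpointsToKeep = 3 and existing
// checkpoints [8, 11, 14, 17, 20], cleanupCheckpoints removes checkpoints 8 and 11
// and keeps the three most recent ones (14, 17, 20).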

// processTrieUpdate writes the trie update to the WAL, updates activeSegmentNum,
// and returns tries for checkpointing if needed.
// It sends the WAL update result, receives the updated trie, and pushes the updated trie to trieQueue.
// When this function returns, the WAL update is in sync with the trieQueue update.
func (c *Compactor) processTrieUpdate(
	update *WALTrieUpdate,
	trieQueue *realWAL.TrieQueue,
	activeSegmentNum int,
	nextCheckpointNum int,
) (
	_activeSegmentNum int,
	checkpointNum int,
	checkpointTries []*trie.MTrie,
) {

	// RecordUpdate returns the segment number the record was written to.
	// The returned segment number (>= 0) can be
	// - the same as the previous segment number (same segment), or
	// - incremented by 1 from the previous segment number (new segment).
	segmentNum, skipped, updateErr := c.wal.RecordUpdate(update.Update)

	// Send result of WAL update
	update.ResultCh <- updateErr

	// This ensures that the updated trie matches the WAL update.
	defer func() {
		// Wait for updated trie
		trie := <-update.TrieCh
		if trie == nil {
			c.logger.Error().Msg("compactor failed to get updated trie")
			return
		}

		trieQueue.Push(trie)
	}()

	if activeSegmentNum == -1 {
		// Recover from failure to get active segment number at initialization.
		return segmentNum, -1, nil
	}

	if updateErr != nil || skipped || segmentNum == activeSegmentNum {
		return activeSegmentNum, -1, nil
	}

	// In the remaining code: segmentNum > activeSegmentNum,
	// i.e. the active segment has been finalized.

	// Check that the new segment number is incremented by 1.
	if segmentNum != activeSegmentNum+1 {
		c.logger.Error().Msg(fmt.Sprintf("compactor got unexpected new segment number %d, want %d", segmentNum, activeSegmentNum+1))
	}

	// Update activeSegmentNum
	prevSegmentNum := activeSegmentNum
	activeSegmentNum = segmentNum

	c.logger.Info().Msgf("finished writing segment file %v, trie update is now writing to segment file %v, checkpoint will trigger when segment %v is finished",
		prevSegmentNum, activeSegmentNum, nextCheckpointNum)

	if nextCheckpointNum > prevSegmentNum {
		// Not enough segments for checkpointing
		return activeSegmentNum, -1, nil
	}

	// In the remaining code: nextCheckpointNum == prevSegmentNum

	// Enough segments are created for checkpointing

	// Get tries from the checkpoint queue.
	// At this point, the checkpoint queue contains tries up to
	// the last update (the last record in the finalized segment).
	// It doesn't include the trie for this update
	// until the updated trie is received and added to trieQueue.
	tries := trieQueue.Tries()

	checkpointNum = nextCheckpointNum

	return activeSegmentNum, checkpointNum, tries
}
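
// Worked example (illustrative numbers): suppose activeSegmentNum = 9 and
// nextCheckpointNum = 9. While RecordUpdate keeps writing to segment 9, every call
// returns (9, -1, nil) and no checkpoint is started. When an update lands in
// segment 10, segment 9 is finalized; since nextCheckpointNum == 9, the call
// returns (10, 9, tries) and the Compactor checkpoints the queued tries as
// checkpoint 9.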

// createCheckpointError indicates that a checkpoint couldn't be created.
type createCheckpointError struct {
	num int
	err error
}

func (e *createCheckpointError) Error() string {
	return fmt.Sprintf("cannot create checkpoint %d: %s", e.num, e.err)
}

func (e *createCheckpointError) Unwrap() error { return e.err }

// removeCheckpointError indicates that prior checkpoints couldn't be removed.
type removeCheckpointError struct {
	err error
}

func (e *removeCheckpointError) Error() string {
	return fmt.Sprintf("cannot cleanup checkpoints: %s", e.err)
}

func (e *removeCheckpointError) Unwrap() error { return e.err }
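
// The sketch below is illustrative and not part of the original source: it shows how
// a caller of checkpoint (such as run above) can tell the two failure kinds apart
// with errors.As. The function name exampleClassifyCheckpointError is hypothetical.
func exampleClassifyCheckpointError(err error) string {
	var createErr *createCheckpointError
	var removeErr *removeCheckpointError
	switch {
	case errors.As(err, &createErr):
		// Checkpoint file creation failed; run retries once the active segment is finalized.
		return "create"
	case errors.As(err, &removeErr):
		// Removing old checkpoint files failed; run only logs this.
		return "remove"
	default:
		return "other"
	}
}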