github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/storage/badger/cleaner.go (about)

     1  package badger
     2  
     3  import (
     4  	"time"
     5  
     6  	"github.com/dgraph-io/badger/v2"
     7  	"github.com/rs/zerolog"
     8  
     9  	"github.com/onflow/flow-go/module"
    10  	"github.com/onflow/flow-go/module/component"
    11  	"github.com/onflow/flow-go/module/irrecoverable"
    12  	"github.com/onflow/flow-go/utils/rand"
    13  )
    14  
// Cleaner uses component.ComponentManager to implement module.Startable and module.ReadyDoneAware
// to run an internal goroutine which runs badger value log garbage collection at a semi-regular interval.
// The Cleaner exists for 2 reasons:
//   - Run GC frequently enough that each GC is relatively inexpensive
//   - Avoid GC being synchronized across all nodes. Since in the happy path, all nodes have very similar
//     database load patterns, without intervention they are likely to schedule GC at the same time, which
//     can cause temporary consensus halts.
type Cleaner struct {
	// Component supplies the lifecycle methods. NewCleaner sets it either to a
	// no-op component (GC disabled) or to a component manager running gcWorkerRoutine.
	component.Component
	log      zerolog.Logger        // logger tagged with component=cleaner
	db       *badger.DB            // database whose value log is garbage collected
	metrics  module.CleanerMetrics // receives the duration of each successful GC run
	ratio    float64               // ratio argument passed to db.RunValueLogGC
	interval time.Duration         // base wait between GC runs, before jitter is added
}

// Compile-time check that *Cleaner satisfies component.Component.
var _ component.Component = (*Cleaner)(nil)
    32  
    33  // NewCleaner returns a cleaner that runs the badger value log garbage collection once every `interval` duration
    34  // if an interval of zero is passed in, we will not run the GC at all.
    35  func NewCleaner(log zerolog.Logger, db *badger.DB, metrics module.CleanerMetrics, interval time.Duration) *Cleaner {
    36  	// NOTE: we run garbage collection frequently at points in our business
    37  	// logic where we are likely to have a small breather in activity; it thus
    38  	// makes sense to run garbage collection often, with a smaller ratio, rather
    39  	// than running it rarely and having big rewrites at once
    40  	c := &Cleaner{
    41  		log:      log.With().Str("component", "cleaner").Logger(),
    42  		db:       db,
    43  		metrics:  metrics,
    44  		ratio:    0.2,
    45  		interval: interval,
    46  	}
    47  
    48  	// Disable if passed in 0 as interval
    49  	if c.interval == 0 {
    50  		c.Component = &module.NoopComponent{}
    51  		return c
    52  	}
    53  
    54  	c.Component = component.NewComponentManagerBuilder().
    55  		AddWorker(c.gcWorkerRoutine).
    56  		Build()
    57  
    58  	return c
    59  }
    60  
    61  // gcWorkerRoutine runs badger GC on timely basis.
    62  func (c *Cleaner) gcWorkerRoutine(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
    63  	ready()
    64  	ticker := time.NewTicker(c.nextWaitDuration())
    65  	defer ticker.Stop()
    66  	for {
    67  		select {
    68  		case <-ctx.Done():
    69  			return
    70  		case <-ticker.C:
    71  			c.runGC()
    72  
    73  			// reset the ticker with a new interval and random jitter
    74  			ticker.Reset(c.nextWaitDuration())
    75  		}
    76  	}
    77  }
    78  
    79  // nextWaitDuration calculates next duration for Cleaner to wait before attempting to run GC.
    80  // We add 20% jitter into the interval, so that we don't risk nodes syncing their GC calls over time.
    81  // Therefore GC is run every X seconds, where X is uniformly sampled from [interval, interval*1.2]
    82  func (c *Cleaner) nextWaitDuration() time.Duration {
    83  	jitter, err := rand.Uint64n(uint64(c.interval.Nanoseconds() / 5))
    84  	if err != nil {
    85  		// if randomness fails, do not use a jitter for this instance.
    86  		// TODO: address the error properly and not swallow it.
    87  		// In this specific case, `utils/rand` only errors if the system randomness fails
    88  		// which is a symptom of a wider failure. Many other node components would catch such
    89  		// a failure.
    90  		c.log.Warn().Msg("jitter is zero beacuse system randomness has failed")
    91  		jitter = 0
    92  	}
    93  	return time.Duration(c.interval.Nanoseconds() + int64(jitter))
    94  }
    95  
    96  // runGC runs garbage collection for badger DB, handles sentinel errors and reports metrics.
    97  func (c *Cleaner) runGC() {
    98  	started := time.Now()
    99  	err := c.db.RunValueLogGC(c.ratio)
   100  	if err == badger.ErrRejected {
   101  		// NOTE: this happens when a GC call is already running
   102  		c.log.Warn().Msg("garbage collection on value log already running")
   103  		return
   104  	}
   105  	if err == badger.ErrNoRewrite {
   106  		// NOTE: this happens when no files have any garbage to drop
   107  		c.log.Debug().Msg("garbage collection on value log unnecessary")
   108  		return
   109  	}
   110  	if err != nil {
   111  		c.log.Error().Err(err).Msg("garbage collection on value log failed")
   112  		return
   113  	}
   114  
   115  	runtime := time.Since(started)
   116  	c.log.Debug().
   117  		Dur("gc_duration", runtime).
   118  		Msg("garbage collection on value log executed")
   119  	c.metrics.RanGC(runtime)
   120  }