github.com/onflow/flow-go@v0.33.17/storage/badger/cleaner.go (about)

     1  // (c) 2019 Dapper Labs - ALL RIGHTS RESERVED
     2  
     3  package badger
     4  
     5  import (
     6  	"time"
     7  
     8  	"github.com/dgraph-io/badger/v2"
     9  	"github.com/rs/zerolog"
    10  
    11  	"github.com/onflow/flow-go/module"
    12  	"github.com/onflow/flow-go/module/component"
    13  	"github.com/onflow/flow-go/module/irrecoverable"
    14  	"github.com/onflow/flow-go/utils/rand"
    15  )
    16  
    17  // Cleaner uses component.ComponentManager to implement module.Startable and module.ReadyDoneAware
    18  // to run an internal goroutine which run badger value log garbage collection at a semi-regular interval.
    19  // The Cleaner exists for 2 reasons:
    20  //   - Run GC frequently enough that each GC is relatively inexpensive
    21  //   - Avoid GC being synchronized across all nodes. Since in the happy path, all nodes have very similar
    22  //     database load patterns, without intervention they are likely to schedule GC at the same time, which
    23  //     can cause temporary consensus halts.
    24  type Cleaner struct {
    25  	component.Component
    26  	log      zerolog.Logger
    27  	db       *badger.DB
    28  	metrics  module.CleanerMetrics
    29  	ratio    float64
    30  	interval time.Duration
    31  }
    32  
    33  var _ component.Component = (*Cleaner)(nil)
    34  
    35  // NewCleaner returns a cleaner that runs the badger value log garbage collection once every `interval` duration
    36  // if an interval of zero is passed in, we will not run the GC at all.
    37  func NewCleaner(log zerolog.Logger, db *badger.DB, metrics module.CleanerMetrics, interval time.Duration) *Cleaner {
    38  	// NOTE: we run garbage collection frequently at points in our business
    39  	// logic where we are likely to have a small breather in activity; it thus
    40  	// makes sense to run garbage collection often, with a smaller ratio, rather
    41  	// than running it rarely and having big rewrites at once
    42  	c := &Cleaner{
    43  		log:      log.With().Str("component", "cleaner").Logger(),
    44  		db:       db,
    45  		metrics:  metrics,
    46  		ratio:    0.2,
    47  		interval: interval,
    48  	}
    49  
    50  	// Disable if passed in 0 as interval
    51  	if c.interval == 0 {
    52  		c.Component = &module.NoopComponent{}
    53  		return c
    54  	}
    55  
    56  	c.Component = component.NewComponentManagerBuilder().
    57  		AddWorker(c.gcWorkerRoutine).
    58  		Build()
    59  
    60  	return c
    61  }
    62  
    63  // gcWorkerRoutine runs badger GC on timely basis.
    64  func (c *Cleaner) gcWorkerRoutine(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
    65  	ready()
    66  	ticker := time.NewTicker(c.nextWaitDuration())
    67  	defer ticker.Stop()
    68  	for {
    69  		select {
    70  		case <-ctx.Done():
    71  			return
    72  		case <-ticker.C:
    73  			c.runGC()
    74  
    75  			// reset the ticker with a new interval and random jitter
    76  			ticker.Reset(c.nextWaitDuration())
    77  		}
    78  	}
    79  }
    80  
    81  // nextWaitDuration calculates next duration for Cleaner to wait before attempting to run GC.
    82  // We add 20% jitter into the interval, so that we don't risk nodes syncing their GC calls over time.
    83  // Therefore GC is run every X seconds, where X is uniformly sampled from [interval, interval*1.2]
    84  func (c *Cleaner) nextWaitDuration() time.Duration {
    85  	jitter, err := rand.Uint64n(uint64(c.interval.Nanoseconds() / 5))
    86  	if err != nil {
    87  		// if randomness fails, do not use a jitter for this instance.
    88  		// TODO: address the error properly and not swallow it.
    89  		// In this specific case, `utils/rand` only errors if the system randomness fails
    90  		// which is a symptom of a wider failure. Many other node components would catch such
    91  		// a failure.
    92  		c.log.Warn().Msg("jitter is zero beacuse system randomness has failed")
    93  		jitter = 0
    94  	}
    95  	return time.Duration(c.interval.Nanoseconds() + int64(jitter))
    96  }
    97  
    98  // runGC runs garbage collection for badger DB, handles sentinel errors and reports metrics.
    99  func (c *Cleaner) runGC() {
   100  	started := time.Now()
   101  	err := c.db.RunValueLogGC(c.ratio)
   102  	if err == badger.ErrRejected {
   103  		// NOTE: this happens when a GC call is already running
   104  		c.log.Warn().Msg("garbage collection on value log already running")
   105  		return
   106  	}
   107  	if err == badger.ErrNoRewrite {
   108  		// NOTE: this happens when no files have any garbage to drop
   109  		c.log.Debug().Msg("garbage collection on value log unnecessary")
   110  		return
   111  	}
   112  	if err != nil {
   113  		c.log.Error().Err(err).Msg("garbage collection on value log failed")
   114  		return
   115  	}
   116  
   117  	runtime := time.Since(started)
   118  	c.log.Debug().
   119  		Dur("gc_duration", runtime).
   120  		Msg("garbage collection on value log executed")
   121  	c.metrics.RanGC(runtime)
   122  }