github.com/authzed/spicedb@v1.32.1-0.20240520085336-ebda56537386/internal/datastore/common/gc.go

package common

import (
	"context"
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/rs/zerolog"

	log "github.com/authzed/spicedb/internal/logging"
	"github.com/authzed/spicedb/pkg/datastore"
)

var (
	gcDurationHistogram = prometheus.NewHistogram(prometheus.HistogramOpts{
		Namespace: "spicedb",
		Subsystem: "datastore",
		Name:      "gc_duration_seconds",
		Help:      "The duration of datastore garbage collection.",
		Buckets:   []float64{0.01, 0.1, 0.5, 1, 5, 10, 25, 60, 120},
	})

	gcRelationshipsCounter = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "spicedb",
		Subsystem: "datastore",
		Name:      "gc_relationships_total",
		Help:      "The number of stale relationships deleted by the datastore garbage collection.",
	})

	gcTransactionsCounter = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "spicedb",
		Subsystem: "datastore",
		Name:      "gc_transactions_total",
		Help:      "The number of stale transactions deleted by the datastore garbage collection.",
	})

	gcNamespacesCounter = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "spicedb",
		Subsystem: "datastore",
		Name:      "gc_namespaces_total",
		Help:      "The number of stale namespaces deleted by the datastore garbage collection.",
	})

	gcFailureCounterConfig = prometheus.CounterOpts{
		Namespace: "spicedb",
		Subsystem: "datastore",
		Name:      "gc_failure_total",
		Help:      "The number of failed runs of the datastore garbage collection.",
	}
	gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig)
)

// RegisterGCMetrics registers garbage collection metrics to the default
// registry.
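//
// A minimal call site might look like the following (a sketch; how the error
// is handled is up to the caller):
//
//	if err := common.RegisterGCMetrics(); err != nil {
//		return fmt.Errorf("unable to register gc metrics: %w", err)
//	}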
func RegisterGCMetrics() error {
	for _, metric := range []prometheus.Collector{
		gcDurationHistogram,
		gcRelationshipsCounter,
		gcTransactionsCounter,
		gcNamespacesCounter,
		gcFailureCounter,
	} {
		if err := prometheus.Register(metric); err != nil {
			return err
		}
	}

	return nil
}

// GarbageCollector represents any datastore that supports external garbage
// collection.
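//
// RunGarbageCollection drives these methods: Now reports the datastore's
// current time, TxIDBefore resolves the revision at the edge of the GC window,
// and DeleteBeforeTx removes everything older than that revision, returning
// the counts of what was deleted.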
type GarbageCollector interface {
	HasGCRun() bool
	MarkGCCompleted()
	ResetGCCompleted()

	ReadyState(context.Context) (datastore.ReadyState, error)
	Now(context.Context) (time.Time, error)
	TxIDBefore(context.Context, time.Time) (datastore.Revision, error)
	DeleteBeforeTx(ctx context.Context, txID datastore.Revision) (DeletionCounts, error)
}

// DeletionCounts tracks the number of deletions that occurred when calling
// DeleteBeforeTx.
type DeletionCounts struct {
	Relationships int64
	Transactions  int64
	Namespaces    int64
}

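// MarshalZerologObject implements zerolog.LogObjectMarshaler so that
// DeletionCounts can be logged as a structured object.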
func (g DeletionCounts) MarshalZerologObject(e *zerolog.Event) {
	e.
		Int64("relationships", g.Relationships).
		Int64("transactions", g.Transactions).
		Int64("namespaces", g.Namespaces)
}

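// MaxGCInterval caps the exponential backoff between garbage collection runs
// after failures; if the configured interval is larger, it is used as the cap
// instead.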
var MaxGCInterval = 60 * time.Minute

// StartGarbageCollector runs garbage collection on the provided interval,
// looping until the context is canceled.
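//
// A sketch of a typical call site (ds is any value implementing
// GarbageCollector, and the durations are illustrative, not part of this
// package):
//
//	go func() {
//		err := common.StartGarbageCollector(ctx, ds, 3*time.Minute, 24*time.Hour, time.Minute)
//		if err != nil && !errors.Is(err, context.Canceled) {
//			log.Ctx(ctx).Error().Err(err).Msg("garbage collection worker stopped unexpectedly")
//		}
//	}()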
func StartGarbageCollector(ctx context.Context, gc GarbageCollector, interval, window, timeout time.Duration) error {
	return startGarbageCollectorWithMaxElapsedTime(ctx, gc, interval, window, 0, timeout, gcFailureCounter)
}

func startGarbageCollectorWithMaxElapsedTime(ctx context.Context, gc GarbageCollector, interval, window, maxElapsedTime, timeout time.Duration, failureCounter prometheus.Counter) error {
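	// Failed runs back off exponentially, starting from the configured interval
	// and capped at MaxGCInterval (or the interval itself, if larger); a
	// successful run resets the schedule back to the configured interval.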
	backoffInterval := backoff.NewExponentialBackOff()
	backoffInterval.InitialInterval = interval
	backoffInterval.MaxInterval = max(MaxGCInterval, interval)
	backoffInterval.MaxElapsedTime = maxElapsedTime
	backoffInterval.Reset()

	nextInterval := interval

	log.Ctx(ctx).Info().
		Dur("interval", nextInterval).
		Msg("datastore garbage collection worker started")

	for {
		select {
		case <-ctx.Done():
			log.Ctx(ctx).Info().
				Msg("shutting down datastore garbage collection worker")
			return ctx.Err()

		case <-time.After(nextInterval):
			log.Ctx(ctx).Info().
				Dur("interval", nextInterval).
				Dur("window", window).
				Dur("timeout", timeout).
				Msg("running garbage collection worker")

			err := RunGarbageCollection(gc, window, timeout)
			if err != nil {
				failureCounter.Inc()
				nextInterval = backoffInterval.NextBackOff()
				log.Ctx(ctx).Warn().Err(err).
					Dur("next-attempt-in", nextInterval).
					Msg("error attempting to perform garbage collection")
				continue
			}

			backoffInterval.Reset()
			nextInterval = interval

			log.Ctx(ctx).Debug().
				Dur("next-run-in", interval).
				Msg("datastore garbage collection scheduled for next run")
		}
	}
}

// RunGarbageCollection runs garbage collection for the datastore.
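// It verifies that the datastore is ready, resolves the transaction watermark
// for the given window, deletes everything before that watermark, and records
// the results in the GC metrics.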
func RunGarbageCollection(gc GarbageCollector, window, timeout time.Duration) error {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	ctx, span := tracer.Start(ctx, "RunGarbageCollection")
	defer span.End()

	// Before attempting anything, check if the datastore is ready.
	startTime := time.Now()
	ready, err := gc.ReadyState(ctx)
	if err != nil {
		return err
	}
	if !ready.IsReady {
		log.Ctx(ctx).Warn().
			Msgf("datastore wasn't ready when attempting garbage collection: %s", ready.Message)
		return nil
	}

	now, err := gc.Now(ctx)
	if err != nil {
		return fmt.Errorf("error retrieving now: %w", err)
	}

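	// Resolve the watermark: the latest transaction created before the start of
	// the GC window. Everything preceding it is eligible for collection.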
	watermark, err := gc.TxIDBefore(ctx, now.Add(-1*window))
	if err != nil {
		return fmt.Errorf("error retrieving watermark: %w", err)
	}

	collected, err := gc.DeleteBeforeTx(ctx, watermark)

	// Even if an error occurred, some garbage may already have been collected.
	// Recording the counts here ensures they are reflected in the metrics even
	// if the worker eventually fails or times out.
	gcRelationshipsCounter.Add(float64(collected.Relationships))
	gcTransactionsCounter.Add(float64(collected.Transactions))
	gcNamespacesCounter.Add(float64(collected.Namespaces))
	collectionDuration := time.Since(startTime)
	gcDurationHistogram.Observe(collectionDuration.Seconds())

	if err != nil {
		return fmt.Errorf("error deleting in gc: %w", err)
	}

	log.Ctx(ctx).Info().
		Stringer("highestTxID", watermark).
		Dur("duration", collectionDuration).
		Time("nowTime", now).
		Interface("collected", collected).
		Msg("datastore garbage collection completed successfully")

	gc.MarkGCCompleted()
	return nil
}