github.com/authzed/spicedb@v1.32.1-0.20240520085336-ebda56537386/internal/datastore/common/gc.go (about) 1 package common 2 3 import ( 4 "context" 5 "fmt" 6 "time" 7 8 "github.com/cenkalti/backoff/v4" 9 "github.com/prometheus/client_golang/prometheus" 10 "github.com/rs/zerolog" 11 12 log "github.com/authzed/spicedb/internal/logging" 13 "github.com/authzed/spicedb/pkg/datastore" 14 ) 15 16 var ( 17 gcDurationHistogram = prometheus.NewHistogram(prometheus.HistogramOpts{ 18 Namespace: "spicedb", 19 Subsystem: "datastore", 20 Name: "gc_duration_seconds", 21 Help: "The duration of datastore garbage collection.", 22 Buckets: []float64{0.01, 0.1, 0.5, 1, 5, 10, 25, 60, 120}, 23 }) 24 25 gcRelationshipsCounter = prometheus.NewCounter(prometheus.CounterOpts{ 26 Namespace: "spicedb", 27 Subsystem: "datastore", 28 Name: "gc_relationships_total", 29 Help: "The number of stale relationships deleted by the datastore garbage collection.", 30 }) 31 32 gcTransactionsCounter = prometheus.NewCounter(prometheus.CounterOpts{ 33 Namespace: "spicedb", 34 Subsystem: "datastore", 35 Name: "gc_transactions_total", 36 Help: "The number of stale transactions deleted by the datastore garbage collection.", 37 }) 38 39 gcNamespacesCounter = prometheus.NewCounter(prometheus.CounterOpts{ 40 Namespace: "spicedb", 41 Subsystem: "datastore", 42 Name: "gc_namespaces_total", 43 Help: "The number of stale namespaces deleted by the datastore garbage collection.", 44 }) 45 46 gcFailureCounterConfig = prometheus.CounterOpts{ 47 Namespace: "spicedb", 48 Subsystem: "datastore", 49 Name: "gc_failure_total", 50 Help: "The number of failed runs of the datastore garbage collection.", 51 } 52 gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig) 53 ) 54 55 // RegisterGCMetrics registers garbage collection metrics to the default 56 // registry. 57 func RegisterGCMetrics() error { 58 for _, metric := range []prometheus.Collector{ 59 gcDurationHistogram, 60 gcRelationshipsCounter, 61 gcTransactionsCounter, 62 gcNamespacesCounter, 63 gcFailureCounter, 64 } { 65 if err := prometheus.Register(metric); err != nil { 66 return err 67 } 68 } 69 70 return nil 71 } 72 73 // GarbageCollector represents any datastore that supports external garbage 74 // collection. 75 type GarbageCollector interface { 76 HasGCRun() bool 77 MarkGCCompleted() 78 ResetGCCompleted() 79 80 ReadyState(context.Context) (datastore.ReadyState, error) 81 Now(context.Context) (time.Time, error) 82 TxIDBefore(context.Context, time.Time) (datastore.Revision, error) 83 DeleteBeforeTx(ctx context.Context, txID datastore.Revision) (DeletionCounts, error) 84 } 85 86 // DeletionCounts tracks the amount of deletions that occurred when calling 87 // DeleteBeforeTx. 88 type DeletionCounts struct { 89 Relationships int64 90 Transactions int64 91 Namespaces int64 92 } 93 94 func (g DeletionCounts) MarshalZerologObject(e *zerolog.Event) { 95 e. 96 Int64("relationships", g.Relationships). 97 Int64("transactions", g.Transactions). 98 Int64("namespaces", g.Namespaces) 99 } 100 101 var MaxGCInterval = 60 * time.Minute 102 103 // StartGarbageCollector loops forever until the context is canceled and 104 // performs garbage collection on the provided interval. 105 func StartGarbageCollector(ctx context.Context, gc GarbageCollector, interval, window, timeout time.Duration) error { 106 return startGarbageCollectorWithMaxElapsedTime(ctx, gc, interval, window, 0, timeout, gcFailureCounter) 107 } 108 109 func startGarbageCollectorWithMaxElapsedTime(ctx context.Context, gc GarbageCollector, interval, window, maxElapsedTime, timeout time.Duration, failureCounter prometheus.Counter) error { 110 backoffInterval := backoff.NewExponentialBackOff() 111 backoffInterval.InitialInterval = interval 112 backoffInterval.MaxInterval = max(MaxGCInterval, interval) 113 backoffInterval.MaxElapsedTime = maxElapsedTime 114 backoffInterval.Reset() 115 116 nextInterval := interval 117 118 log.Ctx(ctx).Info(). 119 Dur("interval", nextInterval). 120 Msg("datastore garbage collection worker started") 121 122 for { 123 select { 124 case <-ctx.Done(): 125 log.Ctx(ctx).Info(). 126 Msg("shutting down datastore garbage collection worker") 127 return ctx.Err() 128 129 case <-time.After(nextInterval): 130 log.Ctx(ctx).Info(). 131 Dur("interval", nextInterval). 132 Dur("window", window). 133 Dur("timeout", timeout). 134 Msg("running garbage collection worker") 135 136 err := RunGarbageCollection(gc, window, timeout) 137 if err != nil { 138 failureCounter.Inc() 139 nextInterval = backoffInterval.NextBackOff() 140 log.Ctx(ctx).Warn().Err(err). 141 Dur("next-attempt-in", nextInterval). 142 Msg("error attempting to perform garbage collection") 143 continue 144 } 145 146 backoffInterval.Reset() 147 nextInterval = interval 148 149 log.Ctx(ctx).Debug(). 150 Dur("next-run-in", interval). 151 Msg("datastore garbage collection scheduled for next run") 152 } 153 } 154 } 155 156 // RunGarbageCollection runs garbage collection for the datastore. 157 func RunGarbageCollection(gc GarbageCollector, window, timeout time.Duration) error { 158 ctx, cancel := context.WithTimeout(context.Background(), timeout) 159 defer cancel() 160 161 ctx, span := tracer.Start(ctx, "RunGarbageCollection") 162 defer span.End() 163 164 // Before attempting anything, check if the datastore is ready. 165 startTime := time.Now() 166 ready, err := gc.ReadyState(ctx) 167 if err != nil { 168 return err 169 } 170 if !ready.IsReady { 171 log.Ctx(ctx).Warn(). 172 Msgf("datastore wasn't ready when attempting garbage collection: %s", ready.Message) 173 return nil 174 } 175 176 now, err := gc.Now(ctx) 177 if err != nil { 178 return fmt.Errorf("error retrieving now: %w", err) 179 } 180 181 watermark, err := gc.TxIDBefore(ctx, now.Add(-1*window)) 182 if err != nil { 183 return fmt.Errorf("error retrieving watermark: %w", err) 184 } 185 186 collected, err := gc.DeleteBeforeTx(ctx, watermark) 187 188 // even if an error happened, garbage would have been collected. This makes sure these are reflected even if the 189 // worker eventually fails or times out. 190 gcRelationshipsCounter.Add(float64(collected.Relationships)) 191 gcTransactionsCounter.Add(float64(collected.Transactions)) 192 gcNamespacesCounter.Add(float64(collected.Namespaces)) 193 collectionDuration := time.Since(startTime) 194 gcDurationHistogram.Observe(collectionDuration.Seconds()) 195 196 if err != nil { 197 return fmt.Errorf("error deleting in gc: %w", err) 198 } 199 200 log.Ctx(ctx).Info(). 201 Stringer("highestTxID", watermark). 202 Dur("duration", collectionDuration). 203 Time("nowTime", now). 204 Interface("collected", collected). 205 Msg("datastore garbage collection completed successfully") 206 207 gc.MarkGCCompleted() 208 return nil 209 }