github.com/pyroscope-io/pyroscope@v0.37.3-0.20230725203016-5f6947968bd0/pkg/storage/retention.go

package storage

import (
	"context"
	"errors"
	"time"

	"github.com/dgraph-io/badger/v2"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/pyroscope-io/pyroscope/pkg/storage/dimension"
	"github.com/pyroscope-io/pyroscope/pkg/storage/segment"
)

const defaultBatchSize = 1 << 10 // 1K items

func (s *Storage) enforceRetentionPolicy(ctx context.Context, rp *segment.RetentionPolicy) {
	observer := prometheus.ObserverFunc(s.metrics.retentionTaskDuration.Observe)
	timer := prometheus.NewTimer(observer)
	defer timer.ObserveDuration()

	s.logger.Debug("enforcing retention policy")
	err := s.iterateOverAllSegments(func(k *segment.Key) error {
		return s.deleteSegmentData(ctx, k, rp)
	})

	switch {
	case err == nil:
	case errors.Is(ctx.Err(), context.Canceled):
		s.logger.Warn("enforcing retention policy canceled")
	default:
		s.logger.WithError(err).Error("failed to enforce retention policy")
	}
}

func (s *Storage) deleteSegmentData(ctx context.Context, k *segment.Key, rp *segment.RetentionPolicy) error {
	sk := k.SegmentKey()
	cached, ok := s.segments.Lookup(sk)
	if !ok {
		return nil
	}

	// Instead of removing every tree in an individual transaction for each
	// of them, blocking the segment for a long time, we remember them and
	// drop them in batches after the segment is released.
	//
	// To avoid a potential inconsistency when DeleteNodesBefore fails in
	// the process, trees should be removed first. Only after a successful
	// commit can segment nodes be safely removed, which guarantees
	// idempotency.

	// TODO(kolesnikovae):
	//  There is a better way of removing these trees: we could calculate
	//  non-overlapping prefixes for depth levels, and drop the data
	//  (both in cache and on disk) using these prefixes.
	//  Remaining trees (with overlapping prefixes) would be removed
	//  in batches. That would be especially efficient in cases when data
	//  is removed for a very long period. For example, when the retention
	//  period is enabled for the first time on a server with historical data.

	type segmentNode struct {
		depth int
		time  int64
	}

	nodes := make([]segmentNode, 0)
	seg := cached.(*segment.Segment)
	deleted, err := seg.WalkNodesToDelete(rp, func(d int, t time.Time) error {
		nodes = append(nodes, segmentNode{d, t.Unix()})
		return nil
	})
	if err != nil {
		return err
	}
	if deleted {
		return s.deleteSegmentAndRelatedData(k)
	}

	var removed int
	batch := s.trees.NewWriteBatch()
	defer func() {
		batch.Cancel()
	}()

	for _, n := range nodes {
		treeKey := segment.TreeKey(sk, n.depth, n.time)
		s.trees.Discard(treeKey)
		switch err = batch.Delete(treePrefix.key(treeKey)); {
		case err == nil:
		case errors.Is(err, badger.ErrKeyNotFound):
			continue
		default:
			return err
		}
		// It is not possible to estimate the size without reading
		// the item. Therefore, the call does not report reclaimed space.
		if removed++; removed%defaultBatchSize == 0 {
			if err = batch.Flush(); err != nil {
				return err
			}
			select {
			default:
				batch = s.trees.NewWriteBatch()
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	}

	// Flush remaining items, if any: it's important to make sure
	// all trees were removed before deleting segment nodes - see
	// the note on a potential inconsistency above.
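	// (If removed is zero or an exact multiple of defaultBatchSize, the
	// batch was already flushed in the loop above and is empty here.)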
	if removed%defaultBatchSize != 0 {
		if err = batch.Flush(); err != nil {
			return err
		}
	}

	_, err = seg.DeleteNodesBefore(rp)
	return err
}

// reclaimSegmentSpace aims to reclaim the specified size by removing
// trees of the given segment. The number of trees to delete is determined
// by the estimated size of the KV items.
//
// Unfortunately, because Badger reclaims disk space only eventually,
// there is no way to match the actual occupied disk size against the
// number of items to remove based on their estimated size.
func (s *Storage) reclaimSegmentSpace(k *segment.Key, size int64) error {
	batchSize := s.trees.MaxBatchCount()
	batch := s.trees.NewWriteBatch()
	defer func() {
		batch.Cancel()
	}()

	var (
		removed   int64
		reclaimed int64
		err       error
	)

	// Keep track of the most recently removed tree time for every segment level.
	rp := &segment.RetentionPolicy{Levels: make(map[int]time.Time)}
	err = s.trees.View(func(txn *badger.Txn) error {
		// Lower-level trees come first because of the lexicographical order:
		// from the very first tree to the most recent one, from the lowest
		// level (with the highest resolution) to the highest.
		it := txn.NewIterator(badger.IteratorOptions{
			// We count all versions so that our estimation is more precise,
			// but slightly higher than the actual size in practice,
			// meaning that we delete less data (and reclaim less space);
			// otherwise there is a chance to remove more trees than needed.
			AllVersions: true,
			// The prefix matches all trees in the segment.
			Prefix: treePrefix.key(k.SegmentKey()),
		})
		defer it.Close()
		for it.Rewind(); it.Valid(); it.Next() {
			if size-reclaimed <= 0 {
				return nil
			}

			item := it.Item()
			if tk, ok := treePrefix.trim(item.Key()); ok {
				treeKey := string(tk)
				s.trees.Discard(treeKey)
				// Update the time boundary for the segment level.
				t, level, err := segment.ParseTreeKey(treeKey)
				if err == nil {
					if t.After(rp.Levels[level]) {
						rp.Levels[level] = t
					}
				}
			}

			// A key copy must be taken: the slice is reused by the
			// iterator but is also referenced by the batch.
			switch err = batch.Delete(item.KeyCopy(nil)); {
			case err == nil:
			case errors.Is(err, badger.ErrKeyNotFound):
				continue
			default:
				return err
			}

			reclaimed += item.EstimatedSize()
			if removed++; removed%batchSize == 0 {
				if batch, err = s.flushTreeBatch(batch); err != nil {
					return err
				}
			}
		}
		return nil
	})
	if err != nil {
		return err
	}

	// Flush remaining items, if any: it's important to make sure
	// all trees were removed before deleting segment nodes - see
	// the note on a potential inconsistency above.
	if removed%batchSize != 0 {
		if err = batch.Flush(); err != nil {
			return err
		}
	}

	if len(rp.Levels) > 0 {
		if cached, ok := s.segments.Lookup(k.SegmentKey()); ok {
			if ok, err = cached.(*segment.Segment).DeleteNodesBefore(rp); ok {
				err = s.deleteSegmentAndRelatedData(k)
			}
		}
	}

	return err
}

// flushTreeBatch commits the changes and returns a new batch. The call returns
// the batch unchanged in case of an error so that it can be safely cancelled.
//
// If the storage was requested to close, errClosed will be returned.
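//
// A typical call site (as in reclaimSegmentSpace above) swaps its batch
// for the returned one:
//
//	if batch, err = s.flushTreeBatch(batch); err != nil {
//		return err
//	}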
func (s *Storage) flushTreeBatch(batch *badger.WriteBatch) (*badger.WriteBatch, error) {
	if err := batch.Flush(); err != nil {
		return batch, err
	}
	select {
	case <-s.stop:
		return batch, errClosed
	default:
		return s.trees.NewWriteBatch(), nil
	}
}

// iterateOverAllSegments calls cb for every segment key registered under
// the __name__ dimension. Keys that fail to parse are logged and skipped;
// iteration stops at the first error returned by cb.
func (s *Storage) iterateOverAllSegments(cb func(*segment.Key) error) error {
	nameKey := "__name__"

	var dimensions []*dimension.Dimension
	s.labels.GetValues(nameKey, func(v string) bool {
		if d, ok := s.lookupAppDimension(v); ok {
			dimensions = append(dimensions, d)
		}
		return true
	})

	for _, r := range dimension.Union(dimensions...) {
		k, err := segment.ParseKey(string(r))
		if err != nil {
			s.logger.WithError(err).WithField("key", string(r)).Error("failed to parse segment key")
			continue
		}
		if err = cb(k); err != nil {
			return err
		}
	}

	return nil
}
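// Usage note (a minimal sketch; the actual scheduling lives outside this
// file): enforceRetentionPolicy is expected to be invoked periodically
// with a policy describing the time boundaries to enforce, e.g.
//
//	rp := &segment.RetentionPolicy{Levels: map[int]time.Time{0: before}}
//	s.enforceRetentionPolicy(ctx, rp)
//
// where `before` is a hypothetical boundary: level-0 segment nodes (and
// their trees) older than it become eligible for removal.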