github.com/pyroscope-io/pyroscope@v0.37.3-0.20230725203016-5f6947968bd0/pkg/storage/retention.go

package storage

import (
	"context"
	"errors"
	"time"

	"github.com/dgraph-io/badger/v2"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/pyroscope-io/pyroscope/pkg/storage/dimension"
	"github.com/pyroscope-io/pyroscope/pkg/storage/segment"
)

const defaultBatchSize = 1 << 10 // 1K items

func (s *Storage) enforceRetentionPolicy(ctx context.Context, rp *segment.RetentionPolicy) {
	observer := prometheus.ObserverFunc(s.metrics.retentionTaskDuration.Observe)
	timer := prometheus.NewTimer(observer)
	defer timer.ObserveDuration()

	s.logger.Debug("enforcing retention policy")
	err := s.iterateOverAllSegments(func(k *segment.Key) error {
		return s.deleteSegmentData(ctx, k, rp)
	})

	switch {
	case err == nil:
	case errors.Is(ctx.Err(), context.Canceled):
		s.logger.Warn("enforcing retention policy canceled")
	default:
		s.logger.WithError(err).Error("failed to enforce retention policy")
	}
}
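
// runRetentionLoop is a minimal, illustrative sketch of how a caller might
// drive enforceRetentionPolicy on a schedule. It is not part of the original
// retention code: the loop name, the interval, and the retention policy are
// assumptions introduced only to show the intended usage; every other
// identifier it relies on is already defined in this package.
func (s *Storage) runRetentionLoop(ctx context.Context, interval time.Duration, rp *segment.RetentionPolicy) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-s.stop: // the storage shutdown channel, also checked in flushTreeBatch below
			return
		case <-ticker.C:
			s.enforceRetentionPolicy(ctx, rp)
		}
	}
}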

func (s *Storage) deleteSegmentData(ctx context.Context, k *segment.Key, rp *segment.RetentionPolicy) error {
	sk := k.SegmentKey()
	cached, ok := s.segments.Lookup(sk)
	if !ok {
		return nil
	}

	// Instead of removing each tree in its own transaction, which would block
	// the segment for a long time, we remember the trees to delete and drop
	// them in batches after the segment is released.
	//
	// To avoid a potential inconsistency when DeleteNodesBefore fails in the
	// process, trees should be removed first. Only after a successful commit
	// can segment nodes be safely removed, to guarantee idempotency.

	// TODO(kolesnikovae):
	//  There is a better way of removing these trees: we could calculate
	//  non-overlapping prefixes for depth levels and drop the data
	//  (both in cache and on disk) using these prefixes.
	//  Remaining trees (with overlapping prefixes) would be removed
	//  in batches. That would be especially efficient when data is removed
	//  for a very long period, for example, when a retention period is
	//  enabled for the first time on a server with historical data.
	//  A rough sketch of this idea is outlined right below.
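	//
	//  A minimal sketch of the prefix-based approach (illustrative only: it
	//  assumes the underlying *badger.DB is reachable from s.trees through a
	//  hypothetical BadgerDB accessor, and that the non-overlapping level
	//  prefixes have already been computed into levelPrefixes):
	//
	//	for _, p := range levelPrefixes {
	//		// badger's DB.DropPrefix drops all keys matching the prefix,
	//		// without iterating over the individual items.
	//		if err := s.trees.BadgerDB().DropPrefix(treePrefix.key(p)); err != nil {
	//			return err
	//		}
	//	}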

	type segmentNode struct {
		depth int
		time  int64
	}

	nodes := make([]segmentNode, 0)
	seg := cached.(*segment.Segment)
	deleted, err := seg.WalkNodesToDelete(rp, func(d int, t time.Time) error {
		nodes = append(nodes, segmentNode{d, t.Unix()})
		return nil
	})
	if err != nil {
		return err
	}
	if deleted {
		return s.deleteSegmentAndRelatedData(k)
	}

	var removed int
	batch := s.trees.NewWriteBatch()
	defer func() {
		batch.Cancel()
	}()

	for _, n := range nodes {
		treeKey := segment.TreeKey(sk, n.depth, n.time)
		s.trees.Discard(treeKey)
		switch err = batch.Delete(treePrefix.key(treeKey)); {
		case err == nil:
		case errors.Is(err, badger.ErrKeyNotFound):
			continue
		default:
			return err
		}
		// The item size cannot be estimated without reading the item,
		// so the call does not report the amount of reclaimed space.
		if removed++; removed%defaultBatchSize == 0 {
			if err = batch.Flush(); err != nil {
				return err
			}
			select {
			default:
				batch = s.trees.NewWriteBatch()
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	}

	// Flush the remaining items, if any: it is important to make sure all
	// trees have been removed before deleting segment nodes; see the note
	// on the potential inconsistency above.
	if removed%defaultBatchSize != 0 {
		if err = batch.Flush(); err != nil {
			return err
		}
	}

	_, err = seg.DeleteNodesBefore(rp)
	return err
}

// reclaimSegmentSpace aims to reclaim the specified amount of space by
// removing trees of the given segment. The number of trees to delete is
// determined based on the estimated size of the KV items.
//
// Unfortunately, because badger reclaims disk space only eventually, there
// is no way to correlate the actual occupied disk size with the number of
// items to remove based on their estimated size.
func (s *Storage) reclaimSegmentSpace(k *segment.Key, size int64) error {
	batchSize := s.trees.MaxBatchCount()
	batch := s.trees.NewWriteBatch()
	defer func() {
		batch.Cancel()
	}()

	var (
		removed   int64
		reclaimed int64
		err       error
	)

	// Keep track of the most recently removed tree time for every segment level.
	rp := &segment.RetentionPolicy{Levels: make(map[int]time.Time)}
	err = s.trees.View(func(txn *badger.Txn) error {
		// Lower-level trees come first because of the lexicographical order:
		// from the very first tree to the most recent one, from the lowest
		// level (with highest resolution) to the highest.
		it := txn.NewIterator(badger.IteratorOptions{
			// We count all versions so that our estimation is more precise,
			// though slightly higher than the actual size in practice, which
			// means we delete less data (and reclaim less space); otherwise
			// there would be a chance of removing more trees than needed.
			AllVersions: true,
			// The prefix matches all trees in the segment.
			Prefix: treePrefix.key(k.SegmentKey()),
		})
		defer it.Close()
		for it.Rewind(); it.Valid(); it.Next() {
			if size-reclaimed <= 0 {
				return nil
			}

			item := it.Item()
			if tk, ok := treePrefix.trim(item.Key()); ok {
				treeKey := string(tk)
				s.trees.Discard(treeKey)
				// Update the time boundary for the segment level.
				t, level, err := segment.ParseTreeKey(treeKey)
				if err == nil {
					if t.After(rp.Levels[level]) {
						rp.Levels[level] = t
					}
				}
			}

			// A copy of the key must be taken: the slice is reused by the
			// iterator, but it is also referenced by the batch.
			switch err = batch.Delete(item.KeyCopy(nil)); {
			case err == nil:
			case errors.Is(err, badger.ErrKeyNotFound):
				continue
			default:
				return err
			}

			reclaimed += item.EstimatedSize()
			if removed++; removed%batchSize == 0 {
				if batch, err = s.flushTreeBatch(batch); err != nil {
					return err
				}
			}
		}
		return nil
	})
	if err != nil {
		return err
	}

	// Flush the remaining items, if any: it is important to make sure all
	// trees have been removed before deleting segment nodes; see the note
	// on the potential inconsistency above.
	if removed%batchSize != 0 {
		if err = batch.Flush(); err != nil {
			return err
		}
	}

	if len(rp.Levels) > 0 {
		if cached, ok := s.segments.Lookup(k.SegmentKey()); ok {
			if ok, err = cached.(*segment.Segment).DeleteNodesBefore(rp); ok {
				err = s.deleteSegmentAndRelatedData(k)
			}
		}
	}

	return err
}
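
// reclaimSpaceForAllSegments is a minimal, illustrative sketch, not part of
// the original retention code: it shows how reclaimSegmentSpace could be
// applied to every known segment, asking each one to free up to
// sizePerSegment bytes based on its estimated item sizes. The function name
// and the flat per-segment budget are assumptions; a real caller would
// derive the budget from the actual disk usage.
func (s *Storage) reclaimSpaceForAllSegments(sizePerSegment int64) error {
	return s.iterateOverAllSegments(func(k *segment.Key) error {
		return s.reclaimSegmentSpace(k, sizePerSegment)
	})
}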

// flushTreeBatch commits the changes and returns a new batch. The call
// returns the batch unchanged in case of an error so that it can be safely
// cancelled.
//
// If the storage has been requested to close, errClosed is returned.
func (s *Storage) flushTreeBatch(batch *badger.WriteBatch) (*badger.WriteBatch, error) {
	if err := batch.Flush(); err != nil {
		return batch, err
	}
	select {
	case <-s.stop:
		return batch, errClosed
	default:
		return s.trees.NewWriteBatch(), nil
	}
}

func (s *Storage) iterateOverAllSegments(cb func(*segment.Key) error) error {
	nameKey := "__name__"

	var dimensions []*dimension.Dimension
	s.labels.GetValues(nameKey, func(v string) bool {
		if d, ok := s.lookupAppDimension(v); ok {
			dimensions = append(dimensions, d)
		}
		return true
	})

	for _, r := range dimension.Union(dimensions...) {
		k, err := segment.ParseKey(string(r))
		if err != nil {
			s.logger.WithError(err).WithField("key", string(r)).Error("failed to parse segment key")
			continue
		}
		if err = cb(k); err != nil {
			return err
		}
	}

	return nil
}