github.com/grafana/pyroscope@v1.18.0/pkg/metastore/index/cleaner/retention/retention.go (about)

     1  package retention
     2  
     3  import (
     4  	"flag"
     5  	"iter"
     6  	"math"
     7  	"slices"
     8  	"time"
     9  
    10  	"github.com/go-kit/log"
    11  	"github.com/go-kit/log/level"
    12  	"github.com/prometheus/common/model"
    13  	"go.etcd.io/bbolt"
    14  
    15  	metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1"
    16  	indexstore "github.com/grafana/pyroscope/pkg/metastore/index/store"
    17  )
    18  
// Policy determines which parts of the index should be retained or deleted.
type Policy interface {
	// CreateTombstones examines the provided partitions and returns tombstones
	// for shards that should be deleted according to the policy. The bbolt
	// transaction is used to query partition contents (see
	// TimeBasedRetentionPolicy, which only reads through it).
	CreateTombstones(*bbolt.Tx, iter.Seq[indexstore.Partition]) []*metastorev1.Tombstones
}
    25  
// Config holds the retention settings, either the global defaults or a
// per-tenant override.
type Config struct {
	// RetentionPeriod is the period after which data is deleted.
	// Zero means the data is never deleted.
	RetentionPeriod model.Duration `yaml:"retention_period" doc:"hidden"`
}
    29  
// Overrides supplies the retention configuration: the global defaults and
// the per-tenant overrides, keyed by tenant ID.
type Overrides interface {
	Retention() (defaults Config, overrides iter.Seq2[string, Config])
}
    33  
    34  func (c *Config) RegisterFlags(f *flag.FlagSet) {
    35  	c.RetentionPeriod = model.Duration(time.Hour * 24 * 31)
    36  	f.Var(&c.RetentionPeriod, "retention-period", "Retention period for the data. 0 means data never deleted.")
    37  }
    38  
// TimeBasedRetentionPolicy implements a retention policy based on time.
type TimeBasedRetentionPolicy struct {
	logger log.Logger
	// overrides maps tenant IDs with an explicit retention override to
	// their deletion marker (also present in markers).
	overrides map[string]*marker
	// gracePeriod delays inspection of a partition after its end time:
	// see processPartition for the rationale.
	gracePeriod time.Duration
	// maxTombstones caps the number of tombstones collected per
	// CreateTombstones call.
	maxTombstones int

	// markers holds all deletion markers sorted by timestamp; this order
	// is required by the binary search in processPartition.
	markers []*marker
	// defaultPeriod is the marker derived from the default retention
	// period; nil when no default retention is configured.
	defaultPeriod *marker
	// tombstones is the reusable output buffer of CreateTombstones.
	tombstones []*metastorev1.Tombstones
}
    50  
// marker indicates the time before which data should be deleted for a given
// tenant. An empty tenantID denotes the default (all-tenant) marker; a zero
// timestamp means no retention period, i.e. data is kept indefinitely.
type marker struct {
	tenantID  string
	timestamp time.Time
}
    55  
// NewTimeBasedRetentionPolicy builds a retention policy from the provided
// overrides, evaluated relative to now. maxTombstones caps the number of
// tombstones produced per CreateTombstones call; gracePeriod postpones
// inspection of a partition after its end time has passed.
func NewTimeBasedRetentionPolicy(
	logger log.Logger,
	overrides Overrides,
	maxTombstones int,
	gracePeriod time.Duration,
	now time.Time,
) *TimeBasedRetentionPolicy {
	defaults, tenantOverrides := overrides.Retention()
	rp := TimeBasedRetentionPolicy{
		logger:        logger,
		overrides:     make(map[string]*marker),
		tombstones:    make([]*metastorev1.Tombstones, 0, maxTombstones),
		maxTombstones: maxTombstones,
		gracePeriod:   gracePeriod,
	}
	// Markers indicate the time before which data should be deleted
	// for a given tenant.
	for tenantID, override := range tenantOverrides {
		// Overrides identical to the default are redundant: such tenants
		// are covered by the default marker below.
		if defaults.RetentionPeriod == override.RetentionPeriod {
			continue
		}
		// An override is defined for the tenant, so we need to adjust the
		// retention period for it. By default, we assume that the retention
		// period is not defined, i.e. is infinite.
		var timestamp time.Time // zero value means no retention period.
		if period := time.Duration(override.RetentionPeriod); period > 0 {
			timestamp = now.Add(-period)
		}
		m := &marker{
			tenantID:  tenantID,
			timestamp: timestamp,
		}
		rp.markers = append(rp.markers, m)
		rp.overrides[tenantID] = m
	}
	// The default retention period is handled separately: we won't create
	// the marker if the retention period is not set. This allows us to avoid
	// checking all partition tenant shards, and instead only check specific
	// tenants that have a defined retention policy.
	if defaults.RetentionPeriod > 0 {
		rp.defaultPeriod = &marker{timestamp: now.Add(-time.Duration(defaults.RetentionPeriod))}
		rp.markers = append(rp.markers, rp.defaultPeriod)
	}
	// It is fine if there are markers pointing to the same time: for example,
	// if an override is set explicitly for a tenant, but it matches the
	// default value. The sorted order is relied upon by the binary search
	// in processPartition.
	slices.SortFunc(rp.markers, func(a, b *marker) int {
		return a.timestamp.Compare(b.timestamp)
	})

	return &rp
}
   108  
   109  func (rp *TimeBasedRetentionPolicy) CreateTombstones(tx *bbolt.Tx, partitions iter.Seq[indexstore.Partition]) []*metastorev1.Tombstones {
   110  	if len(rp.markers) == 0 {
   111  		level.Debug(rp.logger).Log("msg", "no retention policies defined, skipping")
   112  		return nil
   113  	}
   114  	for _, m := range rp.markers {
   115  		level.Debug(rp.logger).Log("msg", "found retention marker", "tenant", m.tenantID, "timestamp", m.timestamp)
   116  	}
   117  	rp.tombstones = rp.tombstones[:0]
   118  	for p := range partitions {
   119  		if len(rp.tombstones) >= rp.maxTombstones {
   120  			break
   121  		}
   122  		if !rp.processPartition(tx, p) {
   123  			break
   124  		}
   125  	}
   126  	return rp.tombstones
   127  }
   128  
// processPartition inspects a single partition and appends tombstones for
// its expired tenant shards. It returns false when iteration over further
// partitions should stop: either the tombstone budget is exhausted, or no
// later partition can have deletion markers past its end.
func (rp *TimeBasedRetentionPolicy) processPartition(tx *bbolt.Tx, p indexstore.Partition) bool {
	// We want to find the markers that are before the partition end, i.e. the
	// markers that indicate the time before which data should be deleted. For
	// tenants D and E we need to inspect the partition. Otherwise, if there
	// are no markers after the partition end, we stop.
	//
	//            | partition            |
	//            | start            end |             t
	//  ----------|----------------------x------------->
	//            *         *            *  *      *
	//  markers:  A         B            C  D      E
	//
	// Note that we also add a grace period to the partition end time, so that
	// we won't be checking it for this period. Since tombstones are only
	// created after the shard max time is before the marker timestamp, and the
	// distance between them might be large (hours), we would be wasting time
	// if we were inspecting the partition right away.
	partitionEnd := &marker{timestamp: p.EndTime().Add(rp.gracePeriod)}
	logger := log.With(rp.logger, "partition", p.String())
	level.Debug(logger).Log(
		"msg", "processing partition",
		"partition_end_marker", partitionEnd.timestamp,
		"retention_markers", len(rp.markers),
	)

	// rp.markers is sorted by timestamp (see the constructor), so a binary
	// search yields the first marker at or after the partition end.
	i, _ := slices.BinarySearchFunc(rp.markers, partitionEnd, func(a, b *marker) int {
		return a.timestamp.Compare(b.timestamp)
	})
	if i >= len(rp.markers) {
		// All markers are before the partition end: it can't be deleted.
		// We can stop here: no partitions after this one will have deletion
		// markers that are before the partition end.
		level.Debug(logger).Log("msg", "partition has not passed the retention period, skipping")
		return false
	}

	q := p.Query(tx)
	if q == nil {
		level.Warn(logger).Log("msg", "cannot find partition, skipping")
		return true
	}

	// The anonymous tenant is ignored here, we only collect tombstones for the
	// specific tenants, which have a defined retention policy.
	if rp.defaultPeriod == nil || rp.defaultPeriod.timestamp.Before(partitionEnd.timestamp) {
		// Fast path for the case when there are markers very far in the future
		// relative to the default marker, or no default marker at all.
		//
		// The default retention period has not expired yet, so we don't need
		// to inspect all the tenants. Instead, we can just examine the markers
		// that are after the partition end: these tenants have retention
		// period shorter than the default one. This is useful in case if the
		// tenant data is deleted by setting very short retention period: we
		// won't check each and every partition tenant shard.
		level.Debug(logger).Log("msg", "creating tombstones for tenant markers", "retention_markers", len(rp.markers[i:]))
		rp.createTombstonesForMarkers(q, rp.markers[i:])
	} else {
		// Otherwise, we need to inspect all the tenants in the partition.
		// There's no point in checking the markers: either most of them
		// will result in tombstones, or have already been deleted (e.g.,
		// there's one tenant with an infinite retention period).
		level.Debug(logger).Log("msg", "creating tombstones for partition tenants")
		rp.createTombstonesForTenants(q, partitionEnd)
	}

	// Finally, we need to check if the anonymous tenant has any tombstones to
	// collect. We only delete it if there are no other tenant shards in the
	// partition: this guarantees that we don't delete data that is still
	// needed, as we'd have the named tenant shards otherwise. Note that the
	// tombstones we created thus far have not resulted in the deletion of
	// shards yet, so we will only delete the anonymous tenant on a second
	// pass.
	//
	// NOTE(kolesnikovae):
	//
	// The approach may result in keeping the anonymous tenant data longer than
	// necessary, but it should not be a problem in practice, as we assume that
	// it contains no blocks: those are removed at L0 compaction. However, the
	// shard-level structures such as tenant-shard buckets and string tables
	// are not removed until the shard is deleted, which may also affect the
	// index size. Ideally, we should seal partitions (create checkpoints) at
	// some point that would protect it from modifications; then, we could
	// delete the anon tenant shards safely, if there's no uncompacted data.
	//
	// An alternative approach would be to mark anon shards as we remove blocks
	// from them, or create tombstones. We cannot do this as a side effect, to
	// avoid state drift between the replicas (although this is arguable – such
	// deletion is a side effect per se, and it only concerns the local state),
	// but we can find the marks during the cleanup job. That could be
	// implemented as a separate retention policy.
	rp.createTombstonesForAnonTenant(q)
	return len(rp.tombstones) < rp.maxTombstones
}
   222  
   223  func (rp *TimeBasedRetentionPolicy) createTombstonesForMarkers(q *indexstore.PartitionQuery, markers []*marker) {
   224  	for _, m := range markers {
   225  		if m.tenantID == "" {
   226  			continue
   227  		}
   228  		if !rp.createTombstones(q, m) {
   229  			return
   230  		}
   231  	}
   232  }
   233  
   234  func (rp *TimeBasedRetentionPolicy) createTombstonesForTenants(q *indexstore.PartitionQuery, partitionEnd *marker) {
   235  	for tenantID := range q.Tenants() {
   236  		if tenantID == "" {
   237  			continue
   238  		}
   239  		var m *marker
   240  		if o, ok := rp.overrides[tenantID]; ok {
   241  			m = o
   242  		} else if rp.defaultPeriod != nil {
   243  			// Tenant-specific marker using the default retention period.
   244  			m = &marker{tenantID: tenantID, timestamp: rp.defaultPeriod.timestamp}
   245  		} else {
   246  			// No retention policy for this tenant, and no default:
   247  			// we retain the data indefinitely.
   248  			continue
   249  		}
   250  		if m.timestamp.After(partitionEnd.timestamp) {
   251  			if !rp.createTombstones(q, m) {
   252  				return
   253  			}
   254  		}
   255  	}
   256  }
   257  
   258  func (rp *TimeBasedRetentionPolicy) createTombstonesForAnonTenant(q *indexstore.PartitionQuery) {
   259  	if rp.hasTenants(q) {
   260  		// We have at least one tenant other than the anonymous one.
   261  		// We cannot delete the anonymous tenant shard yet – continue.
   262  		return
   263  	}
   264  	// Once shard max time passes the partition end time, we can
   265  	// create tombstones for the anonymous tenant shard.
   266  	level.Debug(rp.logger).Log("msg", "creating tombstones for anonymous tenant")
   267  	// We want to bypass the timestamp check for the anonymous tenant:
   268  	// we know that if all the other tenants have been processed, it's
   269  	// safe to create tombstones for the anonymous tenant.
   270  	rp.createTombstones(q, &marker{timestamp: time.UnixMilli(math.MaxInt64)})
   271  }
   272  
   273  func (rp *TimeBasedRetentionPolicy) hasTenants(q *indexstore.PartitionQuery) bool {
   274  	var n int
   275  	for tenant := range q.Tenants() {
   276  		n++
   277  		if n > 1 || tenant != "" {
   278  			return true
   279  		}
   280  	}
   281  	return false
   282  }
   283  
   284  func (rp *TimeBasedRetentionPolicy) createTombstones(q *indexstore.PartitionQuery, m *marker) bool {
   285  	for shard := range q.Shards(m.tenantID) {
   286  		if len(rp.tombstones) >= rp.maxTombstones {
   287  			return false
   288  		}
   289  		maxTime := time.Unix(0, shard.ShardIndex.MaxTime)
   290  		if maxTime.Before(m.timestamp) {
   291  			// The shard does not contain data before the marker.
   292  			name := shard.TombstoneName()
   293  			level.Debug(rp.logger).Log("msg", "creating tombstone", "name", name)
   294  			rp.tombstones = append(rp.tombstones, &metastorev1.Tombstones{
   295  				Shard: &metastorev1.ShardTombstone{
   296  					Name:      name,
   297  					Timestamp: q.Timestamp.UnixNano(),
   298  					Duration:  q.Duration.Nanoseconds(),
   299  					Shard:     shard.Shard,
   300  					Tenant:    shard.Tenant,
   301  				},
   302  			})
   303  		}
   304  	}
   305  	return true
   306  }