github.com/grafana/pyroscope@v1.18.0/pkg/compactionworker/worker.go (about)

     1  package compactionworker
     2  
     3  import (
     4  	"context"
     5  	"encoding/binary"
     6  	"flag"
     7  	"fmt"
     8  	"os"
     9  	"path/filepath"
    10  	"runtime"
    11  	"strconv"
    12  	"strings"
    13  	"sync"
    14  	"sync/atomic"
    15  	"time"
    16  
    17  	"github.com/cespare/xxhash/v2"
    18  	"github.com/go-kit/log"
    19  	"github.com/go-kit/log/level"
    20  	"github.com/grafana/dskit/services"
    21  	"github.com/oklog/ulid/v2"
    22  	"github.com/opentracing/opentracing-go"
    23  	"github.com/pkg/errors"
    24  	"github.com/prometheus/client_golang/prometheus"
    25  	"github.com/prometheus/prometheus/model/labels"
    26  	thanosstore "github.com/thanos-io/objstore"
    27  	_ "go.uber.org/automaxprocs"
    28  
    29  	metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1"
    30  	"github.com/grafana/pyroscope/pkg/block"
    31  	"github.com/grafana/pyroscope/pkg/block/metadata"
    32  	"github.com/grafana/pyroscope/pkg/metrics"
    33  	"github.com/grafana/pyroscope/pkg/objstore"
    34  	"github.com/grafana/pyroscope/pkg/util"
    35  )
    36  
// Config holds the compaction worker settings, populated from YAML or via
// RegisterFlags.
type Config struct {
	// NOTE(review): the YAML key ("job_capacity") differs from the flag name
	// ("job-concurrency"); kept as-is for config compatibility.
	JobConcurrency     int            `yaml:"job_capacity"`
	JobPollInterval    time.Duration  `yaml:"job_poll_interval"`
	SmallObjectSize    int            `yaml:"small_object_size_bytes"`
	TempDir            string         `yaml:"temp_dir"`
	RequestTimeout     time.Duration  `yaml:"request_timeout"`
	CleanupMaxDuration time.Duration  `yaml:"cleanup_max_duration"`
	MetricsExporter    metrics.Config `yaml:"metrics_exporter"`
}
    46  
    47  func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
    48  	const prefix = "compaction-worker."
    49  	f.IntVar(&cfg.JobConcurrency, prefix+"job-concurrency", 0, "Number of concurrent jobs compaction worker will run. Defaults to the number of CPU cores.")
    50  	f.DurationVar(&cfg.JobPollInterval, prefix+"job-poll-interval", 5*time.Second, "Interval between job requests")
    51  	f.DurationVar(&cfg.RequestTimeout, prefix+"request-timeout", 5*time.Second, "Job request timeout.")
    52  	f.DurationVar(&cfg.CleanupMaxDuration, prefix+"cleanup-max-duration", 15*time.Second, "Maximum duration of the cleanup operations.")
    53  	f.IntVar(&cfg.SmallObjectSize, prefix+"small-object-size-bytes", 8<<20, "Size of the object that can be loaded in memory.")
    54  	f.StringVar(&cfg.TempDir, prefix+"temp-dir", os.TempDir(), "Temporary directory for compaction jobs.")
    55  	cfg.MetricsExporter.RegisterFlags(f)
    56  }
    57  
// Worker polls the metastore for compaction jobs, runs them on a fixed set
// of worker goroutines, and reports job status updates on every poll.
type Worker struct {
	service services.Service

	logger    log.Logger
	config    Config
	client    MetastoreClient
	storage   objstore.Bucket
	compactFn compactFunc // block.Compact by default; injectable for tests
	metrics   *workerMetrics

	// jobs holds all jobs known to the worker (queued and running).
	// queue delivers accepted jobs to the worker threads; threads is the
	// number of worker goroutines, and capacity counts how many of them
	// are currently idle (advertised to the metastore on poll).
	jobs     map[string]*compactionJob
	queue    chan *compactionJob
	threads  int
	capacity atomic.Int32

	deleterPool *deleterPool

	// stopped signals that no new jobs should be requested; closeOnce
	// guards closing of the queue; wg tracks the worker goroutines.
	stopped   atomic.Bool
	closeOnce sync.Once
	wg        sync.WaitGroup

	exporter metrics.Exporter
	ruler    metrics.Ruler
}
    82  
// compactionJob carries the state of a single assigned compaction job for
// its whole lifetime on this worker.
type compactionJob struct {
	*metastorev1.CompactionJob

	// ctx/cancel bound the job execution; done is set by the worker
	// thread once runCompaction has returned (successfully or not).
	ctx    context.Context
	cancel context.CancelFunc
	done   atomic.Bool

	// blocks is the resolved metadata of the source blocks; assignment is
	// the current lease from the metastore; compacted holds the result to
	// report back — nil after done means the job was abandoned.
	blocks     []*metastorev1.BlockMeta
	assignment *metastorev1.CompactionJobAssignment
	compacted  *metastorev1.CompactedBlocks
}
    94  
// compactFunc compacts the given source blocks into new blocks written to
// the bucket. It matches the signature of block.Compact.
type compactFunc func(context.Context, []*metastorev1.BlockMeta, objstore.Bucket, ...block.CompactionOption) ([]*metastorev1.BlockMeta, error)

// MetastoreClient is the union of the metastore RPC interfaces the worker
// depends on: compaction job polling and block metadata lookup.
type MetastoreClient interface {
	metastorev1.CompactionServiceClient
	metastorev1.IndexServiceClient
}
   101  
   102  func New(
   103  	logger log.Logger,
   104  	config Config,
   105  	client MetastoreClient,
   106  	storage objstore.Bucket,
   107  	reg prometheus.Registerer,
   108  	ruler metrics.Ruler,
   109  	exporter metrics.Exporter,
   110  ) (*Worker, error) {
   111  	config.TempDir = filepath.Join(filepath.Clean(config.TempDir), "pyroscope-compactor")
   112  	_ = os.RemoveAll(config.TempDir)
   113  	if err := os.MkdirAll(config.TempDir, 0o777); err != nil {
   114  		return nil, fmt.Errorf("failed to create compactor directory: %w", err)
   115  	}
   116  	w := &Worker{
   117  		config:    config,
   118  		logger:    logger,
   119  		client:    client,
   120  		storage:   storage,
   121  		compactFn: block.Compact,
   122  		metrics:   newMetrics(reg),
   123  		ruler:     ruler,
   124  		exporter:  exporter,
   125  	}
   126  	w.threads = config.JobConcurrency
   127  	if w.threads < 1 {
   128  		w.threads = runtime.GOMAXPROCS(-1)
   129  	}
   130  	w.queue = make(chan *compactionJob, 2*w.threads)
   131  	w.jobs = make(map[string]*compactionJob, 2*w.threads)
   132  	w.capacity.Store(int32(w.threads))
   133  	w.deleterPool = newDeleterPool(16 * w.threads)
   134  	w.service = services.NewBasicService(w.starting, w.running, w.stopping)
   135  	return w, nil
   136  }
   137  
// Service returns the dskit service that drives the worker lifecycle.
func (w *Worker) Service() services.Service { return w.service }

// starting and stopping are no-ops: all setup happens in New, and all
// teardown happens in running after the context is cancelled.
func (w *Worker) starting(context.Context) (err error) { return nil }

func (w *Worker) stopping(error) error { return nil }
   143  
// running is the main service loop. It starts a background poller that
// exchanges status updates with the metastore every JobPollInterval, and
// w.threads worker goroutines that consume jobs from the queue. On context
// cancellation it waits for in-flight jobs to drain (the poller keeps
// reporting their progress meanwhile), performs a final poll, flushes the
// metrics exporter, and shuts down the deleter pool.
func (w *Worker) running(ctx context.Context) error {
	ticker := time.NewTicker(w.config.JobPollInterval)
	stopPolling := make(chan struct{})
	pollingDone := make(chan struct{})
	go func() {
		defer close(pollingDone)
		for {
			select {
			case <-stopPolling:
				// Now that all the threads are done, we need to
				// send the final status updates.
				w.poll()
				return

			case <-ticker.C:
				w.poll()
			}
		}
	}()

	w.wg.Add(w.threads)
	for i := 0; i < w.threads; i++ {
		go func() {
			defer w.wg.Done()
			level.Info(w.logger).Log("msg", "compaction worker thread started")
			// The loop exits when poll() closes the queue after the
			// worker has been stopped.
			for job := range w.queue {
				w.capacity.Add(-1)
				// Recover from panics so a single faulty job does not
				// take down the worker thread.
				util.Recover(func() { w.runCompaction(job) })
				job.done.Store(true)
				w.capacity.Add(1)
			}
		}()
	}

	<-ctx.Done()
	// Wait for all threads to finish their work, continuing to report status
	// updates about the in-progress jobs. First, signal to the poll loop that
	// we're done with new jobs.
	w.stopped.Store(true)
	level.Info(w.logger).Log("msg", "waiting for all jobs to finish")
	w.wg.Wait()

	// Now that all the threads are done, we stop the polling loop.
	ticker.Stop()
	close(stopPolling)
	<-pollingDone
	// Force exporter to send all staged samples (depends on the implementation)
	// Must be a blocking call.
	if w.exporter != nil {
		w.exporter.Flush()
	}
	w.deleterPool.close()
	return nil
}
   198  
// poll performs a single exchange with the metastore: it reports status
// updates for all known jobs, advertises free worker capacity, and enqueues
// newly assigned jobs. After the worker is stopped, it keeps reporting
// updates but advertises zero capacity and closes the job queue (once) so
// the worker threads can drain and exit.
func (w *Worker) poll() {
	// If the worker has been stopped, stop requesting new jobs and close
	// the queue exactly once; otherwise advertise the free capacity.
	var capacity uint32
	if w.stopped.Load() {
		w.closeOnce.Do(func() {
			level.Info(w.logger).Log("msg", "closing job queue")
			close(w.queue)
		})
	} else {
		// We report the number of free workers in a hope to get more jobs.
		// Note that cap(w.queue) - len(w.queue) will only report 0 when all
		// the workers are busy and the queue is full (in fact, doubling the
		// reported capacity).
		if c := w.capacity.Load(); c > 0 {
			capacity = uint32(c)
		}
	}

	updates := w.collectUpdates()
	if len(updates) == 0 && capacity == 0 {
		level.Info(w.logger).Log("msg", "skipping polling", "updates", len(updates), "capacity", capacity)
		return
	}

	level.Info(w.logger).Log("msg", "polling compaction jobs", "updates", len(updates), "capacity", capacity)
	ctx, cancel := context.WithTimeout(context.Background(), w.config.RequestTimeout)
	defer cancel()
	resp, err := w.client.PollCompactionJobs(ctx, &metastorev1.PollCompactionJobsRequest{
		StatusUpdates: updates,
		JobCapacity:   capacity,
	})
	if err != nil {
		level.Error(w.logger).Log("msg", "failed to poll compaction jobs", "err", err)
		return
	}

	// Remove jobs whose final status was just delivered, then reconcile
	// assignments and register any newly assigned jobs.
	w.cleanup(updates)
	newJobs := w.handleResponse(resp)
	for _, job := range newJobs {
		select {
		case w.queue <- job:
		default:
			// Queue full: drop the job so its assignment lapses and
			// another worker can pick it up.
			level.Warn(w.logger).Log("msg", "dropping job", "job_name", job.Name)
			w.remove(job)
		}
	}
}
   247  
   248  func (w *Worker) collectUpdates() []*metastorev1.CompactionJobStatusUpdate {
   249  	updates := make([]*metastorev1.CompactionJobStatusUpdate, 0, len(w.jobs))
   250  	for _, job := range w.jobs {
   251  		update := &metastorev1.CompactionJobStatusUpdate{
   252  			Name:  job.Name,
   253  			Token: job.assignment.Token,
   254  		}
   255  
   256  		switch done := job.done.Load(); {
   257  		case done && job.compacted != nil:
   258  			level.Info(w.logger).Log("msg", "sending update for completed job", "job", job.Name)
   259  			update.Status = metastorev1.CompactionJobStatus_COMPACTION_STATUS_SUCCESS
   260  			update.CompactedBlocks = job.compacted
   261  			updates = append(updates, update)
   262  
   263  		case done && job.compacted == nil:
   264  			// We're not sending the status update for the job and expect that the
   265  			// assigment is to be revoked. The job is to be removed at the next
   266  			// poll response handling: all jobs without assignments are canceled
   267  			// and removed.
   268  			level.Warn(w.logger).Log("msg", "skipping update for abandoned job", "job", job.Name)
   269  
   270  		default:
   271  			level.Info(w.logger).Log("msg", "sending update for in-progress job", "job", job.Name)
   272  			update.Status = metastorev1.CompactionJobStatus_COMPACTION_STATUS_IN_PROGRESS
   273  			updates = append(updates, update)
   274  		}
   275  	}
   276  
   277  	return updates
   278  }
   279  
   280  func (w *Worker) cleanup(updates []*metastorev1.CompactionJobStatusUpdate) {
   281  	for _, update := range updates {
   282  		if job := w.jobs[update.Name]; job != nil && job.done.Load() {
   283  			switch update.Status {
   284  			case metastorev1.CompactionJobStatus_COMPACTION_STATUS_SUCCESS:
   285  				// In the vast majority of cases, we end up here.
   286  				w.remove(job)
   287  
   288  			case metastorev1.CompactionJobStatus_COMPACTION_STATUS_IN_PROGRESS:
   289  				// It is possible that the job has been completed after we
   290  				// prepared the status update: keep the job for the next
   291  				// poll iteration.
   292  
   293  			default:
   294  				// Workers never send other statuses. It's unexpected to get here.
   295  				level.Warn(w.logger).Log("msg", "unexpected job status transition; removing the job", "job", job.Name)
   296  				w.remove(job)
   297  			}
   298  		}
   299  	}
   300  }
   301  
// remove forgets the job and cancels its context. Cancelling the context of
// a job that has already finished is a harmless no-op.
func (w *Worker) remove(job *compactionJob) {
	delete(w.jobs, job.Name)
	job.cancel()
}
   306  
// handleResponse reconciles local job state with the poll response: jobs
// whose assignment was not renewed are cancelled and removed, renewed
// assignments are refreshed in place, and brand-new jobs are registered
// and returned for the caller to enqueue.
func (w *Worker) handleResponse(resp *metastorev1.PollCompactionJobsResponse) (newJobs []*compactionJob) {
	// Assignments by job name.
	assignments := make(map[string]*metastorev1.CompactionJobAssignment, len(resp.Assignments))
	for _, assignment := range resp.Assignments {
		assignments[assignment.Name] = assignment
	}

	for _, job := range w.jobs {
		if assignment, ok := assignments[job.assignment.Name]; ok {
			// In theory, we should respect the lease expiration time.
			// In practice, we have a static polling interval.
			job.assignment = assignment
		} else {
			// The job is running without an assignment.
			// We don't care how and when it ends.
			level.Warn(w.logger).Log("msg", "job re-assigned to another worker; cancelling", "job", job.Name)
			w.remove(job)
		}
	}

	for _, newJob := range resp.CompactionJobs {
		if running, found := w.jobs[newJob.Name]; found {
			level.Warn(w.logger).Log("msg", "job re-assigned to the same worker", "job", running.Name)
			// We're free to choose what to do. For now, we update the
			// assignment (in case the token has changed) and let the
			// running job finish.
			if running.assignment = assignments[running.Name]; running.assignment != nil {
				continue
			}
		}
		job := &compactionJob{CompactionJob: newJob}
		if job.assignment = assignments[newJob.Name]; job.assignment == nil {
			// That should not be possible, logging it here just in case.
			level.Warn(w.logger).Log("msg", "found a job without assigment", "job", job.Name)
			continue
		}
		job.ctx, job.cancel = context.WithCancel(context.Background())
		newJobs = append(newJobs, job)
		w.jobs[job.Name] = job
	}

	return newJobs
}
   350  
// Status is only used in metrics and logging. The *NotFound statuses mark
// jobs whose source blocks are missing from the metastore (metadata) or
// from the object storage (block objects).
type status string

const (
	statusSuccess          status = "success"
	statusFailure          status = "failure"
	statusCanceled         status = "canceled"
	statusMetadataNotFound status = "metadata_not_found"
	statusBlockNotFound    status = "block_not_found"
)
   361  
   362  func (w *Worker) runCompaction(job *compactionJob) {
   363  	start := time.Now()
   364  	metricLabels := []string{job.Tenant, strconv.Itoa(int(job.CompactionLevel))}
   365  	statusName := statusFailure
   366  	defer func() {
   367  		labelsWithStatus := append(metricLabels, string(statusName))
   368  		w.metrics.jobDuration.WithLabelValues(labelsWithStatus...).Observe(time.Since(start).Seconds())
   369  		w.metrics.jobsCompleted.WithLabelValues(labelsWithStatus...).Inc()
   370  		w.metrics.jobsInProgress.WithLabelValues(metricLabels...).Dec()
   371  	}()
   372  
   373  	w.metrics.jobsInProgress.WithLabelValues(metricLabels...).Inc()
   374  	sp, ctx := opentracing.StartSpanFromContext(job.ctx, "runCompaction",
   375  		opentracing.Tag{Key: "Job", Value: job.String()},
   376  		opentracing.Tag{Key: "Tenant", Value: job.Tenant},
   377  		opentracing.Tag{Key: "Shard", Value: job.Shard},
   378  		opentracing.Tag{Key: "CompactionLevel", Value: job.CompactionLevel},
   379  		opentracing.Tag{Key: "SourceBlocks", Value: len(job.SourceBlocks)},
   380  		opentracing.Tag{Key: "Tombstones", Value: len(job.Tombstones)},
   381  	)
   382  	defer sp.Finish()
   383  
   384  	logger := log.With(w.logger, "job", job.Name)
   385  	level.Info(logger).Log("msg", "starting compaction job", "source_blocks", strings.Join(job.SourceBlocks, " "))
   386  
   387  	// FIXME(kolesnikovae): Read metadata from blocks: it's located in the
   388  	//   blocks footer. The start offest and CRC are the last 8 bytes (BE).
   389  	//   See metadata.Encode and metadata.Decode.
   390  	//   We use metadata to download objects: in fact we need to know only
   391  	//   tenant, shard, level, and ID: the information which we already have
   392  	//   in the job. We definitely don't need the full metadata entry with
   393  	//   datasets: this part can be set once we download the block and read
   394  	//   meta locally. Or, we can just fetch the metadata from the objects
   395  	//   directly, before downloading them.
   396  	if err := w.getBlockMetadata(logger, job); err != nil {
   397  		// The error is likely to be transient, therefore the job is not failed,
   398  		// but just abandoned – another worker will pick it up and try again.
   399  		return
   400  	}
   401  
   402  	if len(job.Tombstones) > 0 {
   403  		// Handle tombstones asynchronously on the best effort basis:
   404  		// if deletion fails, leftovers will be cleaned up eventually.
   405  		//
   406  		// There are following reasons why we may not be able to delete:
   407  		//  1. General storage unavailability: compaction jobs will be
   408  		//     retried either way, and the tombstones will be handled again.
   409  		//  2. Permission issues. In this case, retry will not help.
   410  		//  3. Worker crash: jobs will be retried.
   411  		//
   412  		// A worker is given a limited time to finish the cleanup. If worker
   413  		// didn't finish the cleanup before shutdown and after the compaction
   414  		// job was finished (so no retry is expected), the data will be deleted
   415  		// eventually due to time-based retention policy. However, if no more
   416  		// tombstones are created for the shard, the data will remain in the
   417  		// storage. This should be handled by the index cleaner: some garbage
   418  		// collection should happen in the background.
   419  		w.handleTombstones(logger, job.Tombstones...)
   420  	}
   421  
   422  	if len(job.blocks) == 0 {
   423  		// This is a very bad situation that we do not expect, unless the
   424  		// metastore is restored from a snapshot: no metadata found for the
   425  		// job source blocks. There's no point in retrying or failing the
   426  		// job (which is likely to be retried by another worker), so we just
   427  		// skip it. The same for the situation when no block objects can be
   428  		// found in storage, which may happen if the blocks are deleted manually.
   429  		level.Error(logger).Log("msg", "no block metadata found; skipping")
   430  		job.compacted = &metastorev1.CompactedBlocks{SourceBlocks: new(metastorev1.BlockList)}
   431  		statusName = statusMetadataNotFound
   432  		return
   433  	}
   434  
   435  	tempdir := filepath.Join(w.config.TempDir, job.Name)
   436  	sourcedir := filepath.Join(tempdir, "source")
   437  	options := []block.CompactionOption{
   438  		block.WithCompactionTempDir(tempdir),
   439  		block.WithCompactionObjectOptions(
   440  			block.WithObjectMaxSizeLoadInMemory(w.config.SmallObjectSize),
   441  			block.WithObjectDownload(sourcedir),
   442  		),
   443  	}
   444  
   445  	if observer := w.buildSampleObserver(job.blocks[0]); observer != nil {
   446  		defer observer.Close()
   447  		options = append(options, block.WithSampleObserver(observer))
   448  	}
   449  
   450  	compacted, err := w.compactFn(ctx, job.blocks, w.storage, options...)
   451  	defer func() {
   452  		if err = os.RemoveAll(tempdir); err != nil {
   453  			level.Warn(logger).Log("msg", "failed to remove compaction directory", "path", tempdir, "err", err)
   454  		}
   455  	}()
   456  
   457  	switch {
   458  	case err == nil:
   459  		level.Info(logger).Log(
   460  			"msg", "compaction finished successfully",
   461  			"input_blocks", len(job.SourceBlocks),
   462  			"output_blocks", len(compacted),
   463  		)
   464  		for _, c := range compacted {
   465  			level.Debug(logger).Log(
   466  				"msg", "new compacted block",
   467  				"block_id", c.Id,
   468  				"block_tenant", metadata.Tenant(c),
   469  				"block_shard", c.Shard,
   470  				"block_compaction_level", c.CompactionLevel,
   471  				"block_min_time", c.MinTime,
   472  				"block_max_time", c.MaxTime,
   473  				"block_size", c.Size,
   474  				"datasets", len(c.Datasets),
   475  			)
   476  		}
   477  		job.compacted = &metastorev1.CompactedBlocks{
   478  			NewBlocks: compacted,
   479  			SourceBlocks: &metastorev1.BlockList{
   480  				Tenant: job.Tenant,
   481  				Shard:  job.Shard,
   482  				Blocks: job.SourceBlocks,
   483  			},
   484  		}
   485  
   486  		firstBlock := metadata.Timestamp(job.blocks[0])
   487  		w.metrics.timeToCompaction.WithLabelValues(metricLabels...).Observe(time.Since(firstBlock).Seconds())
   488  		statusName = statusSuccess
   489  
   490  	case errors.Is(err, context.Canceled):
   491  		level.Warn(logger).Log("msg", "compaction cancelled")
   492  		statusName = statusCanceled
   493  
   494  	case objstore.IsNotExist(w.storage, err):
   495  		level.Error(logger).Log("msg", "failed to find blocks", "err", err)
   496  		job.compacted = &metastorev1.CompactedBlocks{SourceBlocks: new(metastorev1.BlockList)}
   497  		statusName = statusBlockNotFound
   498  
   499  	default:
   500  		level.Error(logger).Log("msg", "failed to compact blocks", "err", err)
   501  		statusName = statusFailure
   502  	}
   503  }
   504  
// buildSampleObserver returns a SampleObserver for exporting recording-rule
// metrics from the compacted profiles, or nil when the exporter is disabled.
// Only level-0 blocks are observed — presumably to avoid re-counting samples
// on higher-level compactions.
func (w *Worker) buildSampleObserver(md *metastorev1.BlockMeta) *metrics.SampleObserver {
	if !w.config.MetricsExporter.Enabled || md.CompactionLevel > 0 {
		return nil
	}
	// Samples are attributed to the block creation time (ULID timestamp).
	recordingTime := int64(ulid.MustParse(md.Id).Time())
	pyroscopeInstanceLabel := labels.New(labels.Label{
		Name:  "pyroscope_instance",
		Value: pyroscopeInstanceHash(md.Shard, uint32(md.CreatedBy)),
	})
	return metrics.NewSampleObserver(recordingTime, w.exporter, w.ruler, pyroscopeInstanceLabel)
}
   516  
   517  func pyroscopeInstanceHash(shard uint32, createdBy uint32) string {
   518  	buf := make([]byte, 8)
   519  	binary.BigEndian.PutUint32(buf[0:4], shard)
   520  	binary.BigEndian.PutUint32(buf[4:8], createdBy)
   521  	return fmt.Sprintf("%x", xxhash.Sum64(buf))
   522  }
   523  
   524  func (w *Worker) getBlockMetadata(logger log.Logger, job *compactionJob) error {
   525  	ctx, cancel := context.WithTimeout(job.ctx, w.config.RequestTimeout)
   526  	defer cancel()
   527  
   528  	resp, err := w.client.GetBlockMetadata(ctx, &metastorev1.GetBlockMetadataRequest{
   529  		Blocks: &metastorev1.BlockList{
   530  			Tenant: job.Tenant,
   531  			Shard:  job.Shard,
   532  			Blocks: job.SourceBlocks,
   533  		},
   534  	})
   535  	if err != nil {
   536  		level.Error(logger).Log("msg", "failed to get block metadata", "err", err)
   537  		return err
   538  	}
   539  
   540  	job.blocks = resp.GetBlocks()
   541  	// Update the plan to reflect the actual compaction job state.
   542  	job.SourceBlocks = job.SourceBlocks[:0]
   543  	for _, b := range job.blocks {
   544  		job.SourceBlocks = append(job.SourceBlocks, b.Id)
   545  	}
   546  
   547  	return nil
   548  }
   549  
   550  func (w *Worker) handleTombstones(logger log.Logger, tombstones ...*metastorev1.Tombstones) {
   551  	for _, t := range tombstones {
   552  		w.deleterPool.add(w.newDeleter(logger, t), w.config.CleanupMaxDuration)
   553  	}
   554  }
   555  
   556  func (w *Worker) newDeleter(logger log.Logger, tombstone *metastorev1.Tombstones) *deleter {
   557  	return &deleter{
   558  		logger:    logger,
   559  		bucket:    w.storage,
   560  		metrics:   w.metrics,
   561  		tombstone: tombstone,
   562  	}
   563  }
   564  
// deleter processes a single tombstone: it schedules block deletions on the
// shared pool and lets the caller wait for all of them via wait.
type deleter struct {
	logger    log.Logger
	bucket    objstore.Bucket
	metrics   *workerMetrics
	tombstone *metastorev1.Tombstones
	wg        sync.WaitGroup // counts in-flight delete tasks scheduled on the pool
}
   572  
   573  func (d *deleter) run(ctx context.Context, p *deleterPool) {
   574  	if t := d.tombstone.GetBlocks(); t != nil {
   575  		d.handleBlockTombstones(ctx, p, t)
   576  	}
   577  	if t := d.tombstone.GetShard(); t != nil {
   578  		d.handleShardTombstone(ctx, p, t)
   579  	}
   580  }
   581  
// wait blocks until all delete tasks scheduled by this deleter have finished.
func (d *deleter) wait() { d.wg.Wait() }
   583  
// handleBlockTombstones schedules deletion of the explicitly listed block
// objects on the pool; d.wg tracks the scheduled tasks.
func (d *deleter) handleBlockTombstones(ctx context.Context, pool *deleterPool, t *metastorev1.BlockTombstones) {
	logger := log.With(d.logger, "tombstone_name", t.Name)
	level.Info(logger).Log("msg", "deleting blocks", "blocks", strings.Join(t.Blocks, " "))
	for _, b := range t.Blocks {
		d.wg.Add(1)
		// NOTE(review): the closure captures the loop variable b; this is
		// correct with per-iteration loop variables (Go 1.22+) — confirm
		// the module's Go version.
		pool.run(func() {
			defer d.wg.Done()
			d.delete(ctx, block.BuildObjectPath(t.Tenant, t.Shard, t.CompactionLevel, b))
		})
	}
}
   595  
// handleShardTombstone deletes all block objects in the tenant shard that
// are older than the tombstone's time range end (timestamp + duration).
// The iteration stops at the first block whose ULID timestamp is at or past
// maxTime — presumably relying on the lexicographic listing order of
// ULID-named objects coinciding with time order; confirm against the
// object naming scheme.
func (d *deleter) handleShardTombstone(ctx context.Context, pool *deleterPool, t *metastorev1.ShardTombstone) {
	// It's safe to delete blocks in the shard that are older than the
	// maximum time specified in the tombstone.
	minTime := time.Unix(0, t.Timestamp)
	maxTime := minTime.Add(time.Duration(t.Duration))
	dir := block.BuildObjectDir(t.Tenant, t.Shard)

	logger := log.With(d.logger, "tombstone_name", t.Name)
	level.Info(logger).Log("msg", "cleaning up shard", "max_time", maxTime, "dir", dir)

	// Workaround for MinIO/S3 ListObjects: if we stop consuming before cancelling,
	// the producer goroutine can block on a final send. Cancel first and keep
	// draining so the producer exits cleanly. Thanos Iter does not drain on early
	// return, so we do it here.
	// See: https://github.com/minio/minio-go/blame/f64cdbde257f48f1a44b0f5aeee0475bad7e0e8d/api-list.go#L784
	iterCtx, iterCancel := context.WithCancel(ctx)
	defer iterCancel()

	deleteBlock := func(path string) error {
		// After we cancel iterCtx, the provider (e.g., MinIO ListObjects) may do
		// one final blocking send on its results channel. Returning nil here keeps
		// draining without scheduling new work so the producer isn't left blocked.
		if iterCtx.Err() != nil {
			return nil
		}
		blockID, err := block.ParseBlockIDFromPath(path)
		if err != nil {
			// Unparseable paths are skipped, not fatal: keep iterating.
			level.Warn(logger).Log("msg", "failed to parse block ID from path", "path", path, "err", err)
			return nil
		}
		// Note that although we could skip blocks that are older than the
		// minimum time, we do not do it here: we want to make sure we deleted
		// everything before the maximum time, as previous jobs could fail
		// to do so. In the worst case, this may result in a competition between
		// workers that try to clean up the same shard. This is not an issue
		// in practice, because there are not so many cleanup jobs for the
		// same shard are running concurrently, and the cleanup is fast.
		blockTs := time.UnixMilli(int64(blockID.Time()))
		if !blockTs.Before(maxTime) {
			level.Debug(logger).Log("msg", "reached range end, exiting", "path", path)
			// Cancel the iterator so the underlying producer exits promptly.
			// Keep consuming to drain any buffered items and allow the producer's
			// final send on ctx.Done() to be received.
			iterCancel()
			return nil
		}
		// Schedule the actual deletion on the pool; note it uses the outer
		// ctx, not iterCtx, so deletes survive the iterator cancellation.
		d.wg.Add(1)
		pool.run(func() {
			defer d.wg.Done()
			d.delete(ctx, path)
		})
		return nil
	}

	if err := d.bucket.Iter(iterCtx, dir, deleteBlock, thanosstore.WithRecursiveIter()); err != nil {
		if errors.Is(err, context.Canceled) {
			// Expected when the iteration context is cancelled.
			return
		}
		// It's only possible if the error is returned by the iterator itself.
		level.Error(logger).Log("msg", "failed to cleanup shard", "err", err)
	}
}
   659  
   660  func (d *deleter) delete(ctx context.Context, path string) {
   661  	var statusName status
   662  	switch err := d.bucket.Delete(ctx, path); {
   663  	case err == nil:
   664  		statusName = statusSuccess
   665  
   666  	case objstore.IsNotExist(d.bucket, err):
   667  		level.Info(d.logger).Log("msg", "block not found while attempting to delete it", "path", path, "err", err)
   668  		statusName = statusBlockNotFound
   669  
   670  	case errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded):
   671  		level.Warn(d.logger).Log("msg", "block delete attempt canceled", "path", path, "err", err)
   672  		statusName = statusCanceled
   673  
   674  	default:
   675  		level.Error(d.logger).Log("msg", "failed to delete block", "path", path, "err", err)
   676  		statusName = statusFailure
   677  	}
   678  
   679  	d.metrics.blocksDeleted.WithLabelValues(string(statusName)).Inc()
   680  }
   681  
// deleterPool bounds the concurrency of tombstone deletions: deleters are
// tracked by deletersWg, while the individual delete calls are executed by
// a fixed set of threads consuming the queue.
type deleterPool struct {
	deletersWg sync.WaitGroup
	stop       chan struct{}

	threadsWg sync.WaitGroup
	queue     chan func()
}
   689  
   690  func newDeleterPool(threads int) *deleterPool {
   691  	p := &deleterPool{
   692  		queue: make(chan func(), threads),
   693  		stop:  make(chan struct{}),
   694  	}
   695  	p.threadsWg.Add(threads)
   696  	for i := 0; i < threads; i++ {
   697  		go func() {
   698  			defer p.threadsWg.Done()
   699  			for fn := range p.queue {
   700  				fn()
   701  			}
   702  		}()
   703  	}
   704  	return p
   705  }
   706  
   707  // If too many tombstones are created for the same tenant-shard, of if there
   708  // are too many blocks to delete so a single worker does not cope up, multiple
   709  // workers may end up deleting same blocks as they process the shard from the
   710  // very beginning. The timeout aims to reduce the competition factor: at any
   711  // time, the number of workers that cleanup the same shard is limited. This is
   712  // difficult to achieve in practice, and may happen if the retention is enabled
   713  // for the first time, and large number of blocks are deleted at once.
   714  func (p *deleterPool) deleterContext(timeout time.Duration) (context.Context, context.CancelFunc) {
   715  	ctx := context.Background()
   716  	if timeout > 0 {
   717  		return context.WithTimeout(ctx, timeout)
   718  	}
   719  	return context.WithCancel(ctx)
   720  }
   721  
// add starts the deleter asynchronously. One goroutine runs the deleter and
// waits for all its scheduled deletions; a second one ensures the context is
// always cancelled — when the deleter finishes, when the timeout expires,
// or (after the stop signal) whichever of the two happens first — so the
// context is never leaked.
func (p *deleterPool) add(deleter *deleter, timeout time.Duration) {
	ctx, cancel := p.deleterContext(timeout)
	done := make(chan struct{})
	p.deletersWg.Add(1)
	go func() {
		deleter.run(ctx, p)
		deleter.wait()
		p.deletersWg.Done()
		// Notify the other goroutine that the deleter is done
		// and there's no need to wait for it anymore.
		close(done)
	}()
	go func() {
		// Wait for the deleter to finish or for the stop signal,
		// or for the timeout to expire, whichever comes first.
		defer cancel()
		select {
		case <-done:
		case <-ctx.Done():
		case <-p.stop:
			// We don't want to halt the deletion abruptly when
			// the worker is stopped. In most cases, the deletion
			// will be finished by the time the worker is stopped.
			// Otherwise, we may wait up to CleanupMaxDuration.
			select {
			case <-done:
			case <-ctx.Done():
			}
		}
	}()
}
   753  
// run schedules fn on one of the pool threads; it blocks while the queue is full.
func (p *deleterPool) run(fn func()) { p.queue <- fn }
   755  
// close shuts the pool down. It is guaranteed that no [add] calls will be
// made at this point: all compaction jobs are done, and no new jobs can
// be queued.
func (p *deleterPool) close() {
	// Wait for all the deleters to finish.
	close(p.stop)
	p.deletersWg.Wait()
	// No new deletions can be queued.
	// We can close the queue now.
	close(p.queue)
	p.threadsWg.Wait()
}