package compactionworker

import (
	"context"
	"encoding/binary"
	"flag"
	"fmt"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/cespare/xxhash/v2"
	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/services"
	"github.com/oklog/ulid/v2"
	"github.com/opentracing/opentracing-go"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/prometheus/model/labels"
	thanosstore "github.com/thanos-io/objstore"
	_ "go.uber.org/automaxprocs"

	metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1"
	"github.com/grafana/pyroscope/pkg/block"
	"github.com/grafana/pyroscope/pkg/block/metadata"
	"github.com/grafana/pyroscope/pkg/metrics"
	"github.com/grafana/pyroscope/pkg/objstore"
	"github.com/grafana/pyroscope/pkg/util"
)

// Config configures the compaction worker.
type Config struct {
	// Number of concurrent compaction jobs; <= 0 means GOMAXPROCS (see New).
	// NOTE(review): the YAML key is "job_capacity" while the flag is
	// "job-concurrency" — presumably kept for config compatibility; confirm
	// before renaming either.
	JobConcurrency int `yaml:"job_capacity"`
	// Interval between PollCompactionJobs requests to the metastore.
	JobPollInterval time.Duration `yaml:"job_poll_interval"`
	// Objects up to this size are loaded fully in memory instead of
	// being downloaded to the temp directory.
	SmallObjectSize int `yaml:"small_object_size_bytes"`
	// Scratch space for compaction; wiped and recreated at startup.
	TempDir string `yaml:"temp_dir"`
	// Timeout for individual metastore RPCs.
	RequestTimeout time.Duration `yaml:"request_timeout"`
	// Upper bound on how long tombstone cleanup may run after the
	// worker is asked to stop.
	CleanupMaxDuration time.Duration `yaml:"cleanup_max_duration"`
	MetricsExporter    metrics.Config `yaml:"metrics_exporter"`
}

// RegisterFlags registers the worker CLI flags under the
// "compaction-worker." prefix.
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	const prefix = "compaction-worker."
	f.IntVar(&cfg.JobConcurrency, prefix+"job-concurrency", 0, "Number of concurrent jobs compaction worker will run. Defaults to the number of CPU cores.")
	f.DurationVar(&cfg.JobPollInterval, prefix+"job-poll-interval", 5*time.Second, "Interval between job requests")
	f.DurationVar(&cfg.RequestTimeout, prefix+"request-timeout", 5*time.Second, "Job request timeout.")
	f.DurationVar(&cfg.CleanupMaxDuration, prefix+"cleanup-max-duration", 15*time.Second, "Maximum duration of the cleanup operations.")
	f.IntVar(&cfg.SmallObjectSize, prefix+"small-object-size-bytes", 8<<20, "Size of the object that can be loaded in memory.")
	f.StringVar(&cfg.TempDir, prefix+"temp-dir", os.TempDir(), "Temporary directory for compaction jobs.")
	cfg.MetricsExporter.RegisterFlags(f)
}

// Worker polls the metastore for compaction jobs, runs them on a fixed pool
// of worker goroutines, and reports job status back on every poll.
//
// Concurrency model: jobs and queue are accessed only from the single poll
// goroutine (apart from workers receiving from queue); capacity/stopped/done
// are atomics shared between the poll loop and worker threads.
type Worker struct {
	service services.Service

	logger    log.Logger
	config    Config
	client    MetastoreClient
	storage   objstore.Bucket
	compactFn compactFunc // block.Compact by default; injectable for tests
	metrics   *workerMetrics

	jobs     map[string]*compactionJob // all jobs known to this worker, by name
	queue    chan *compactionJob       // buffered hand-off from poll loop to workers
	threads  int                       // number of worker goroutines
	capacity atomic.Int32              // free worker slots, reported to the metastore

	deleterPool *deleterPool

	stopped   atomic.Bool // set when running() observes ctx cancellation
	closeOnce sync.Once   // guards closing of queue
	wg        sync.WaitGroup

	exporter metrics.Exporter
	ruler    metrics.Ruler
}

// compactionJob is a CompactionJob plus the worker-side execution state.
type compactionJob struct {
	*metastorev1.CompactionJob

	ctx    context.Context
	cancel context.CancelFunc
	done   atomic.Bool // set by the worker thread when the job finishes (any outcome)

	blocks     []*metastorev1.BlockMeta // resolved source block metadata
	assignment *metastorev1.CompactionJobAssignment
	compacted  *metastorev1.CompactedBlocks // non-nil only on success (possibly empty)
}

type compactFunc func(context.Context, []*metastorev1.BlockMeta, objstore.Bucket, ...block.CompactionOption) ([]*metastorev1.BlockMeta, error)

// MetastoreClient is the union of the metastore RPCs the worker needs.
type MetastoreClient interface {
	metastorev1.CompactionServiceClient
	metastorev1.IndexServiceClient
}

// New builds a Worker. It wipes and recreates the temp directory, so stale
// state from a previous run never survives a restart.
func New(
	logger log.Logger,
	config Config,
	client MetastoreClient,
	storage objstore.Bucket,
	reg prometheus.Registerer,
	ruler metrics.Ruler,
	exporter metrics.Exporter,
) (*Worker, error) {
	config.TempDir = filepath.Join(filepath.Clean(config.TempDir), "pyroscope-compactor")
	// Best-effort removal of leftovers from a previous run.
	_ = os.RemoveAll(config.TempDir)
	if err := os.MkdirAll(config.TempDir, 0o777); err != nil {
		return nil, fmt.Errorf("failed to create compactor directory: %w", err)
	}
	w := &Worker{
		config:    config,
		logger:    logger,
		client:    client,
		storage:   storage,
		compactFn: block.Compact,
		metrics:   newMetrics(reg),
		ruler:     ruler,
		exporter:  exporter,
	}
	w.threads = config.JobConcurrency
	if w.threads < 1 {
		w.threads = runtime.GOMAXPROCS(-1)
	}
	// The queue is double the thread count so a poll can stage the next
	// batch while all threads are busy.
	w.queue = make(chan *compactionJob, 2*w.threads)
	w.jobs = make(map[string]*compactionJob, 2*w.threads)
	w.capacity.Store(int32(w.threads))
	w.deleterPool = newDeleterPool(16 * w.threads)
	w.service = services.NewBasicService(w.starting, w.running, w.stopping)
	return w, nil
}

// Service exposes the dskit service wrapping the worker lifecycle.
func (w *Worker) Service() services.Service { return w.service }

func (w *Worker) starting(context.Context) (err error) { return nil }

func (w *Worker) stopping(error) error { return nil }

// running is the service main loop: one goroutine polls the metastore on a
// ticker, w.threads goroutines drain the job queue. On ctx cancellation the
// worker stops accepting jobs, lets in-flight jobs finish (reporting status
// the whole time), then performs a final poll and shuts down cleanup.
func (w *Worker) running(ctx context.Context) error {
	ticker := time.NewTicker(w.config.JobPollInterval)
	stopPolling := make(chan struct{})
	pollingDone := make(chan struct{})
	go func() {
		defer close(pollingDone)
		for {
			select {
			case <-stopPolling:
				// Now that all the threads are done, we need to
				// send the final status updates.
				w.poll()
				return

			case <-ticker.C:
				w.poll()
			}
		}
	}()

	w.wg.Add(w.threads)
	for i := 0; i < w.threads; i++ {
		go func() {
			defer w.wg.Done()
			level.Info(w.logger).Log("msg", "compaction worker thread started")
			for job := range w.queue {
				w.capacity.Add(-1)
				// Recover so a panicking job cannot kill the thread.
				util.Recover(func() { w.runCompaction(job) })
				job.done.Store(true)
				w.capacity.Add(1)
			}
		}()
	}

	<-ctx.Done()
	// Wait for all threads to finish their work, continuing to report status
	// updates about the in-progress jobs. First, signal to the poll loop that
	// we're done with new jobs.
	w.stopped.Store(true)
	level.Info(w.logger).Log("msg", "waiting for all jobs to finish")
	w.wg.Wait()

	// Now that all the threads are done, we stop the polling loop.
	ticker.Stop()
	close(stopPolling)
	<-pollingDone
	// Force exporter to send all staged samples (depends on the implementation)
	// Must be a blocking call.
	if w.exporter != nil {
		w.exporter.Flush()
	}
	w.deleterPool.close()
	return nil
}

// poll sends one PollCompactionJobs request: status updates for all known
// jobs plus our free capacity, and enqueues any jobs assigned in response.
func (w *Worker) poll() {
	// If we were told to stop, close the queue (once) so worker threads can
	// drain and exit; otherwise advertise our free capacity.
	var capacity uint32
	if w.stopped.Load() {
		w.closeOnce.Do(func() {
			level.Info(w.logger).Log("msg", "closing job queue")
			close(w.queue)
		})
	} else {
		// We report the number of free workers in a hope to get more jobs.
		// Note that cap(w.queue) - len(w.queue) will only report 0 when all
		// the workers are busy and the queue is full (in fact, doubling the
		// reported capacity).
		if c := w.capacity.Load(); c > 0 {
			capacity = uint32(c)
		}
	}

	updates := w.collectUpdates()
	if len(updates) == 0 && capacity == 0 {
		// Nothing to report and nothing to ask for.
		level.Info(w.logger).Log("msg", "skipping polling", "updates", len(updates), "capacity", capacity)
		return
	}

	level.Info(w.logger).Log("msg", "polling compaction jobs", "updates", len(updates), "capacity", capacity)
	ctx, cancel := context.WithTimeout(context.Background(), w.config.RequestTimeout)
	defer cancel()
	resp, err := w.client.PollCompactionJobs(ctx, &metastorev1.PollCompactionJobsRequest{
		StatusUpdates: updates,
		JobCapacity:   capacity,
	})
	if err != nil {
		level.Error(w.logger).Log("msg", "failed to poll compaction jobs", "err", err)
		return
	}

	w.cleanup(updates)
	newJobs := w.handleResponse(resp)
	for _, job := range newJobs {
		select {
		case w.queue <- job:
		default:
			// Should not happen while we report capacity honestly, but we
			// must never block the poll loop on a full queue.
			level.Warn(w.logger).Log("msg", "dropping job", "job_name", job.Name)
			w.remove(job)
		}
	}
}

// collectUpdates builds the status-update list for the next poll: SUCCESS for
// finished jobs with results, IN_PROGRESS for running jobs, and nothing for
// finished jobs without results (abandoned — the assignment will be revoked).
func (w *Worker) collectUpdates() []*metastorev1.CompactionJobStatusUpdate {
	updates := make([]*metastorev1.CompactionJobStatusUpdate, 0, len(w.jobs))
	for _, job := range w.jobs {
		update := &metastorev1.CompactionJobStatusUpdate{
			Name:  job.Name,
			Token: job.assignment.Token,
		}

		switch done := job.done.Load(); {
		case done && job.compacted != nil:
			level.Info(w.logger).Log("msg", "sending update for completed job", "job", job.Name)
			update.Status = metastorev1.CompactionJobStatus_COMPACTION_STATUS_SUCCESS
			update.CompactedBlocks = job.compacted
			updates = append(updates, update)

		case done && job.compacted == nil:
			// We're not sending the status update for the job and expect that the
			// assignment is to be revoked. The job is to be removed at the next
			// poll response handling: all jobs without assignments are canceled
			// and removed.
			level.Warn(w.logger).Log("msg", "skipping update for abandoned job", "job", job.Name)

		default:
			level.Info(w.logger).Log("msg", "sending update for in-progress job", "job", job.Name)
			update.Status = metastorev1.CompactionJobStatus_COMPACTION_STATUS_IN_PROGRESS
			updates = append(updates, update)
		}
	}

	return updates
}

// cleanup removes jobs whose final status was just reported to the metastore.
// It runs only after a successful poll, so a SUCCESS update is known to have
// been delivered before the job is forgotten.
func (w *Worker) cleanup(updates []*metastorev1.CompactionJobStatusUpdate) {
	for _, update := range updates {
		if job := w.jobs[update.Name]; job != nil && job.done.Load() {
			switch update.Status {
			case metastorev1.CompactionJobStatus_COMPACTION_STATUS_SUCCESS:
				// In the vast majority of cases, we end up here.
				w.remove(job)

			case metastorev1.CompactionJobStatus_COMPACTION_STATUS_IN_PROGRESS:
				// It is possible that the job has been completed after we
				// prepared the status update: keep the job for the next
				// poll iteration.

			default:
				// Workers never send other statuses. It's unexpected to get here.
				level.Warn(w.logger).Log("msg", "unexpected job status transition; removing the job", "job", job.Name)
				w.remove(job)
			}
		}
	}
}

// remove forgets the job and cancels its context, stopping it if still running.
func (w *Worker) remove(job *compactionJob) {
	delete(w.jobs, job.Name)
	job.cancel()
}

// handleResponse reconciles local job state with the metastore response:
// refreshes assignments for jobs we still own, cancels jobs whose assignment
// was revoked, and returns newly assigned jobs ready to be enqueued.
func (w *Worker) handleResponse(resp *metastorev1.PollCompactionJobsResponse) (newJobs []*compactionJob) {
	// Assignments by job name.
	assignments := make(map[string]*metastorev1.CompactionJobAssignment, len(resp.Assignments))
	for _, assignment := range resp.Assignments {
		assignments[assignment.Name] = assignment
	}

	for _, job := range w.jobs {
		if assignment, ok := assignments[job.assignment.Name]; ok {
			// In theory, we should respect the lease expiration time.
			// In practice, we have a static polling interval.
			job.assignment = assignment
		} else {
			// The job is running without an assignment.
			// We don't care how and when it ends.
			level.Warn(w.logger).Log("msg", "job re-assigned to another worker; cancelling", "job", job.Name)
			w.remove(job)
		}
	}

	for _, newJob := range resp.CompactionJobs {
		if running, found := w.jobs[newJob.Name]; found {
			level.Warn(w.logger).Log("msg", "job re-assigned to the same worker", "job", running.Name)
			// We're free to choose what to do. For now, we update the
			// assignment (in case the token has changed) and let the
			// running job finish.
			if running.assignment = assignments[running.Name]; running.assignment != nil {
				continue
			}
		}
		job := &compactionJob{CompactionJob: newJob}
		if job.assignment = assignments[newJob.Name]; job.assignment == nil {
			// That should not be possible, logging it here just in case.
			level.Warn(w.logger).Log("msg", "found a job without assigment", "job", job.Name)
			continue
		}
		job.ctx, job.cancel = context.WithCancel(context.Background())
		newJobs = append(newJobs, job)
		w.jobs[job.Name] = job
	}

	return newJobs
}

// Status is only used in metrics and logging.
352 type status string 353 354 const ( 355 statusSuccess status = "success" 356 statusFailure status = "failure" 357 statusCanceled status = "canceled" 358 statusMetadataNotFound status = "metadata_not_found" 359 statusBlockNotFound status = "block_not_found" 360 ) 361 362 func (w *Worker) runCompaction(job *compactionJob) { 363 start := time.Now() 364 metricLabels := []string{job.Tenant, strconv.Itoa(int(job.CompactionLevel))} 365 statusName := statusFailure 366 defer func() { 367 labelsWithStatus := append(metricLabels, string(statusName)) 368 w.metrics.jobDuration.WithLabelValues(labelsWithStatus...).Observe(time.Since(start).Seconds()) 369 w.metrics.jobsCompleted.WithLabelValues(labelsWithStatus...).Inc() 370 w.metrics.jobsInProgress.WithLabelValues(metricLabels...).Dec() 371 }() 372 373 w.metrics.jobsInProgress.WithLabelValues(metricLabels...).Inc() 374 sp, ctx := opentracing.StartSpanFromContext(job.ctx, "runCompaction", 375 opentracing.Tag{Key: "Job", Value: job.String()}, 376 opentracing.Tag{Key: "Tenant", Value: job.Tenant}, 377 opentracing.Tag{Key: "Shard", Value: job.Shard}, 378 opentracing.Tag{Key: "CompactionLevel", Value: job.CompactionLevel}, 379 opentracing.Tag{Key: "SourceBlocks", Value: len(job.SourceBlocks)}, 380 opentracing.Tag{Key: "Tombstones", Value: len(job.Tombstones)}, 381 ) 382 defer sp.Finish() 383 384 logger := log.With(w.logger, "job", job.Name) 385 level.Info(logger).Log("msg", "starting compaction job", "source_blocks", strings.Join(job.SourceBlocks, " ")) 386 387 // FIXME(kolesnikovae): Read metadata from blocks: it's located in the 388 // blocks footer. The start offest and CRC are the last 8 bytes (BE). 389 // See metadata.Encode and metadata.Decode. 390 // We use metadata to download objects: in fact we need to know only 391 // tenant, shard, level, and ID: the information which we already have 392 // in the job. 
We definitely don't need the full metadata entry with 393 // datasets: this part can be set once we download the block and read 394 // meta locally. Or, we can just fetch the metadata from the objects 395 // directly, before downloading them. 396 if err := w.getBlockMetadata(logger, job); err != nil { 397 // The error is likely to be transient, therefore the job is not failed, 398 // but just abandoned – another worker will pick it up and try again. 399 return 400 } 401 402 if len(job.Tombstones) > 0 { 403 // Handle tombstones asynchronously on the best effort basis: 404 // if deletion fails, leftovers will be cleaned up eventually. 405 // 406 // There are following reasons why we may not be able to delete: 407 // 1. General storage unavailability: compaction jobs will be 408 // retried either way, and the tombstones will be handled again. 409 // 2. Permission issues. In this case, retry will not help. 410 // 3. Worker crash: jobs will be retried. 411 // 412 // A worker is given a limited time to finish the cleanup. If worker 413 // didn't finish the cleanup before shutdown and after the compaction 414 // job was finished (so no retry is expected), the data will be deleted 415 // eventually due to time-based retention policy. However, if no more 416 // tombstones are created for the shard, the data will remain in the 417 // storage. This should be handled by the index cleaner: some garbage 418 // collection should happen in the background. 419 w.handleTombstones(logger, job.Tombstones...) 420 } 421 422 if len(job.blocks) == 0 { 423 // This is a very bad situation that we do not expect, unless the 424 // metastore is restored from a snapshot: no metadata found for the 425 // job source blocks. There's no point in retrying or failing the 426 // job (which is likely to be retried by another worker), so we just 427 // skip it. The same for the situation when no block objects can be 428 // found in storage, which may happen if the blocks are deleted manually. 
429 level.Error(logger).Log("msg", "no block metadata found; skipping") 430 job.compacted = &metastorev1.CompactedBlocks{SourceBlocks: new(metastorev1.BlockList)} 431 statusName = statusMetadataNotFound 432 return 433 } 434 435 tempdir := filepath.Join(w.config.TempDir, job.Name) 436 sourcedir := filepath.Join(tempdir, "source") 437 options := []block.CompactionOption{ 438 block.WithCompactionTempDir(tempdir), 439 block.WithCompactionObjectOptions( 440 block.WithObjectMaxSizeLoadInMemory(w.config.SmallObjectSize), 441 block.WithObjectDownload(sourcedir), 442 ), 443 } 444 445 if observer := w.buildSampleObserver(job.blocks[0]); observer != nil { 446 defer observer.Close() 447 options = append(options, block.WithSampleObserver(observer)) 448 } 449 450 compacted, err := w.compactFn(ctx, job.blocks, w.storage, options...) 451 defer func() { 452 if err = os.RemoveAll(tempdir); err != nil { 453 level.Warn(logger).Log("msg", "failed to remove compaction directory", "path", tempdir, "err", err) 454 } 455 }() 456 457 switch { 458 case err == nil: 459 level.Info(logger).Log( 460 "msg", "compaction finished successfully", 461 "input_blocks", len(job.SourceBlocks), 462 "output_blocks", len(compacted), 463 ) 464 for _, c := range compacted { 465 level.Debug(logger).Log( 466 "msg", "new compacted block", 467 "block_id", c.Id, 468 "block_tenant", metadata.Tenant(c), 469 "block_shard", c.Shard, 470 "block_compaction_level", c.CompactionLevel, 471 "block_min_time", c.MinTime, 472 "block_max_time", c.MaxTime, 473 "block_size", c.Size, 474 "datasets", len(c.Datasets), 475 ) 476 } 477 job.compacted = &metastorev1.CompactedBlocks{ 478 NewBlocks: compacted, 479 SourceBlocks: &metastorev1.BlockList{ 480 Tenant: job.Tenant, 481 Shard: job.Shard, 482 Blocks: job.SourceBlocks, 483 }, 484 } 485 486 firstBlock := metadata.Timestamp(job.blocks[0]) 487 w.metrics.timeToCompaction.WithLabelValues(metricLabels...).Observe(time.Since(firstBlock).Seconds()) 488 statusName = statusSuccess 489 490 
case errors.Is(err, context.Canceled): 491 level.Warn(logger).Log("msg", "compaction cancelled") 492 statusName = statusCanceled 493 494 case objstore.IsNotExist(w.storage, err): 495 level.Error(logger).Log("msg", "failed to find blocks", "err", err) 496 job.compacted = &metastorev1.CompactedBlocks{SourceBlocks: new(metastorev1.BlockList)} 497 statusName = statusBlockNotFound 498 499 default: 500 level.Error(logger).Log("msg", "failed to compact blocks", "err", err) 501 statusName = statusFailure 502 } 503 } 504 505 func (w *Worker) buildSampleObserver(md *metastorev1.BlockMeta) *metrics.SampleObserver { 506 if !w.config.MetricsExporter.Enabled || md.CompactionLevel > 0 { 507 return nil 508 } 509 recordingTime := int64(ulid.MustParse(md.Id).Time()) 510 pyroscopeInstanceLabel := labels.New(labels.Label{ 511 Name: "pyroscope_instance", 512 Value: pyroscopeInstanceHash(md.Shard, uint32(md.CreatedBy)), 513 }) 514 return metrics.NewSampleObserver(recordingTime, w.exporter, w.ruler, pyroscopeInstanceLabel) 515 } 516 517 func pyroscopeInstanceHash(shard uint32, createdBy uint32) string { 518 buf := make([]byte, 8) 519 binary.BigEndian.PutUint32(buf[0:4], shard) 520 binary.BigEndian.PutUint32(buf[4:8], createdBy) 521 return fmt.Sprintf("%x", xxhash.Sum64(buf)) 522 } 523 524 func (w *Worker) getBlockMetadata(logger log.Logger, job *compactionJob) error { 525 ctx, cancel := context.WithTimeout(job.ctx, w.config.RequestTimeout) 526 defer cancel() 527 528 resp, err := w.client.GetBlockMetadata(ctx, &metastorev1.GetBlockMetadataRequest{ 529 Blocks: &metastorev1.BlockList{ 530 Tenant: job.Tenant, 531 Shard: job.Shard, 532 Blocks: job.SourceBlocks, 533 }, 534 }) 535 if err != nil { 536 level.Error(logger).Log("msg", "failed to get block metadata", "err", err) 537 return err 538 } 539 540 job.blocks = resp.GetBlocks() 541 // Update the plan to reflect the actual compaction job state. 
542 job.SourceBlocks = job.SourceBlocks[:0] 543 for _, b := range job.blocks { 544 job.SourceBlocks = append(job.SourceBlocks, b.Id) 545 } 546 547 return nil 548 } 549 550 func (w *Worker) handleTombstones(logger log.Logger, tombstones ...*metastorev1.Tombstones) { 551 for _, t := range tombstones { 552 w.deleterPool.add(w.newDeleter(logger, t), w.config.CleanupMaxDuration) 553 } 554 } 555 556 func (w *Worker) newDeleter(logger log.Logger, tombstone *metastorev1.Tombstones) *deleter { 557 return &deleter{ 558 logger: logger, 559 bucket: w.storage, 560 metrics: w.metrics, 561 tombstone: tombstone, 562 } 563 } 564 565 type deleter struct { 566 logger log.Logger 567 bucket objstore.Bucket 568 metrics *workerMetrics 569 tombstone *metastorev1.Tombstones 570 wg sync.WaitGroup 571 } 572 573 func (d *deleter) run(ctx context.Context, p *deleterPool) { 574 if t := d.tombstone.GetBlocks(); t != nil { 575 d.handleBlockTombstones(ctx, p, t) 576 } 577 if t := d.tombstone.GetShard(); t != nil { 578 d.handleShardTombstone(ctx, p, t) 579 } 580 } 581 582 func (d *deleter) wait() { d.wg.Wait() } 583 584 func (d *deleter) handleBlockTombstones(ctx context.Context, pool *deleterPool, t *metastorev1.BlockTombstones) { 585 logger := log.With(d.logger, "tombstone_name", t.Name) 586 level.Info(logger).Log("msg", "deleting blocks", "blocks", strings.Join(t.Blocks, " ")) 587 for _, b := range t.Blocks { 588 d.wg.Add(1) 589 pool.run(func() { 590 defer d.wg.Done() 591 d.delete(ctx, block.BuildObjectPath(t.Tenant, t.Shard, t.CompactionLevel, b)) 592 }) 593 } 594 } 595 596 func (d *deleter) handleShardTombstone(ctx context.Context, pool *deleterPool, t *metastorev1.ShardTombstone) { 597 // It's safe to delete blocks in the shard that are older than the 598 // maximum time specified in the tombstone. 
599 minTime := time.Unix(0, t.Timestamp) 600 maxTime := minTime.Add(time.Duration(t.Duration)) 601 dir := block.BuildObjectDir(t.Tenant, t.Shard) 602 603 logger := log.With(d.logger, "tombstone_name", t.Name) 604 level.Info(logger).Log("msg", "cleaning up shard", "max_time", maxTime, "dir", dir) 605 606 // Workaround for MinIO/S3 ListObjects: if we stop consuming before cancelling, 607 // the producer goroutine can block on a final send. Cancel first and keep 608 // draining so the producer exits cleanly. Thanos Iter does not drain on early 609 // return, so we do it here. 610 // See: https://github.com/minio/minio-go/blame/f64cdbde257f48f1a44b0f5aeee0475bad7e0e8d/api-list.go#L784 611 iterCtx, iterCancel := context.WithCancel(ctx) 612 defer iterCancel() 613 614 deleteBlock := func(path string) error { 615 // After we cancel iterCtx, the provider (e.g., MinIO ListObjects) may do 616 // one final blocking send on its results channel. Returning nil here keeps 617 // draining without scheduling new work so the producer isn't left blocked. 618 if iterCtx.Err() != nil { 619 return nil 620 } 621 blockID, err := block.ParseBlockIDFromPath(path) 622 if err != nil { 623 level.Warn(logger).Log("msg", "failed to parse block ID from path", "path", path, "err", err) 624 return nil 625 } 626 // Note that although we could skip blocks that are older than the 627 // minimum time, we do not do it here: we want to make sure we deleted 628 // everything before the maximum time, as previous jobs could fail 629 // to do so. In the worst case, this may result in a competition between 630 // workers that try to clean up the same shard. This is not an issue 631 // in practice, because there are not so many cleanup jobs for the 632 // same shard are running concurrently, and the cleanup is fast. 
633 blockTs := time.UnixMilli(int64(blockID.Time())) 634 if !blockTs.Before(maxTime) { 635 level.Debug(logger).Log("msg", "reached range end, exiting", "path", path) 636 // Cancel the iterator so the underlying producer exits promptly. 637 // Keep consuming to drain any buffered items and allow the producer's 638 // final send on ctx.Done() to be received. 639 iterCancel() 640 return nil 641 } 642 d.wg.Add(1) 643 pool.run(func() { 644 defer d.wg.Done() 645 d.delete(ctx, path) 646 }) 647 return nil 648 } 649 650 if err := d.bucket.Iter(iterCtx, dir, deleteBlock, thanosstore.WithRecursiveIter()); err != nil { 651 if errors.Is(err, context.Canceled) { 652 // Expected when the iteration context is cancelled. 653 return 654 } 655 // It's only possible if the error is returned by the iterator itself. 656 level.Error(logger).Log("msg", "failed to cleanup shard", "err", err) 657 } 658 } 659 660 func (d *deleter) delete(ctx context.Context, path string) { 661 var statusName status 662 switch err := d.bucket.Delete(ctx, path); { 663 case err == nil: 664 statusName = statusSuccess 665 666 case objstore.IsNotExist(d.bucket, err): 667 level.Info(d.logger).Log("msg", "block not found while attempting to delete it", "path", path, "err", err) 668 statusName = statusBlockNotFound 669 670 case errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded): 671 level.Warn(d.logger).Log("msg", "block delete attempt canceled", "path", path, "err", err) 672 statusName = statusCanceled 673 674 default: 675 level.Error(d.logger).Log("msg", "failed to delete block", "path", path, "err", err) 676 statusName = statusFailure 677 } 678 679 d.metrics.blocksDeleted.WithLabelValues(string(statusName)).Inc() 680 } 681 682 type deleterPool struct { 683 deletersWg sync.WaitGroup 684 stop chan struct{} 685 686 threadsWg sync.WaitGroup 687 queue chan func() 688 } 689 690 func newDeleterPool(threads int) *deleterPool { 691 p := &deleterPool{ 692 queue: make(chan func(), threads), 693 
stop: make(chan struct{}), 694 } 695 p.threadsWg.Add(threads) 696 for i := 0; i < threads; i++ { 697 go func() { 698 defer p.threadsWg.Done() 699 for fn := range p.queue { 700 fn() 701 } 702 }() 703 } 704 return p 705 } 706 707 // If too many tombstones are created for the same tenant-shard, of if there 708 // are too many blocks to delete so a single worker does not cope up, multiple 709 // workers may end up deleting same blocks as they process the shard from the 710 // very beginning. The timeout aims to reduce the competition factor: at any 711 // time, the number of workers that cleanup the same shard is limited. This is 712 // difficult to achieve in practice, and may happen if the retention is enabled 713 // for the first time, and large number of blocks are deleted at once. 714 func (p *deleterPool) deleterContext(timeout time.Duration) (context.Context, context.CancelFunc) { 715 ctx := context.Background() 716 if timeout > 0 { 717 return context.WithTimeout(ctx, timeout) 718 } 719 return context.WithCancel(ctx) 720 } 721 722 func (p *deleterPool) add(deleter *deleter, timeout time.Duration) { 723 ctx, cancel := p.deleterContext(timeout) 724 done := make(chan struct{}) 725 p.deletersWg.Add(1) 726 go func() { 727 deleter.run(ctx, p) 728 deleter.wait() 729 p.deletersWg.Done() 730 // Notify the other goroutine that the deleter is done 731 // and there's no need to wait for it anymore. 732 close(done) 733 }() 734 go func() { 735 // Wait for the deleter to finish or for the stop signal, 736 // or for the timeout to expire, whichever comes first. 737 defer cancel() 738 select { 739 case <-done: 740 case <-ctx.Done(): 741 case <-p.stop: 742 // We don't want to halt the deletion abruptly when 743 // the worker is stopped. In most cases, the deletion 744 // will be finished by the time the worker is stopped. 745 // Otherwise, we may wait up to CleanupMaxDuration. 
746 select { 747 case <-done: 748 case <-ctx.Done(): 749 } 750 } 751 }() 752 } 753 754 func (p *deleterPool) run(fn func()) { p.queue <- fn } 755 756 // It is guaranteed that no [add] calls will be made at this point: 757 // all compaction jobs are done, and no new jobs can be queued. 758 func (p *deleterPool) close() { 759 // Wait for all the deleters to finish. 760 close(p.stop) 761 p.deletersWg.Wait() 762 // No new deletions can be queued. 763 // We can close the queue now. 764 close(p.queue) 765 p.threadsWg.Wait() 766 }