// SPDX-License-Identifier: AGPL-3.0-only
// Provenance-includes-location: https://github.com/grafana/mimir/blob/main/pkg/compactor/blocks_cleaner.go
// Provenance-includes-license: Apache-2.0
// Provenance-includes-copyright: The Cortex Authors.

package compactor

import (
	"context"
	"fmt"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/concurrency"
	"github.com/grafana/dskit/services"
	"github.com/oklog/ulid/v2"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	thanos_objstore "github.com/thanos-io/objstore"

	"github.com/grafana/pyroscope/pkg/objstore"
	"github.com/grafana/pyroscope/pkg/phlaredb/block"
	"github.com/grafana/pyroscope/pkg/phlaredb/bucket"
	"github.com/grafana/pyroscope/pkg/phlaredb/bucketindex"
	"github.com/grafana/pyroscope/pkg/util"
	"github.com/grafana/pyroscope/pkg/validation"
)

const (
	// defaultDeleteBlocksConcurrency is the default number of blocks deleted in parallel per tenant.
	defaultDeleteBlocksConcurrency = 16
)

// BlocksCleanerConfig configures the periodic block cleanup performed by BlocksCleaner.
type BlocksCleanerConfig struct {
	DeletionDelay              time.Duration // How long a block must be marked for deletion before it is hard-deleted.
	CleanupInterval            time.Duration // How often a cleanup run is triggered.
	CleanupConcurrency         int           // Max number of tenants cleaned up concurrently.
	TenantCleanupDelay         time.Duration // Delay before removing tenant deletion mark and "debug".
	DeleteBlocksConcurrency    int           // Max number of blocks deleted concurrently within one tenant.
	NoBlocksFileCleanupEnabled bool          // If true, remaining files (bucket index, markers) are removed once a tenant has no blocks left.
}

// BlocksCleaner is a service that periodically cleans up the object storage:
// it hard-deletes blocks marked for deletion (after DeletionDelay), applies the
// per-tenant retention period, removes data of tenants marked for deletion,
// cleans up partial blocks, and keeps the per-tenant bucket index updated.
type BlocksCleaner struct {
	services.Service

	cfg            BlocksCleanerConfig
	cfgProvider    ConfigProvider
	logger         log.Logger
	bucketClient   objstore.Bucket
	tenantsScanner *bucket.TenantsScanner
	ownUser        func(userID string) (bool, error)
	// singleFlight bounds per-tenant concurrency and ensures a tenant is not
	// cleaned by two overlapping runs at once.
	singleFlight *concurrency.LimitedConcurrencySingleFlight

	// Keep track of the last owned users.
	lastOwnedUsers []string

	// Metrics.
	runsStarted                    prometheus.Counter
	runsCompleted                  prometheus.Counter
	runsFailed                     prometheus.Counter
	runsLastSuccess                prometheus.Gauge
	blocksCleanedTotal             prometheus.Counter
	blocksFailedTotal              prometheus.Counter
	blocksMarkedForDeletion        prometheus.Counter
	partialBlocksMarkedForDeletion prometheus.Counter
	tenantBlocks                   *prometheus.GaugeVec
	tenantMarkedBlocks             *prometheus.GaugeVec
	tenantPartialBlocks            *prometheus.GaugeVec
	tenantBucketIndexLastUpdate    *prometheus.GaugeVec
}

// NewBlocksCleaner creates a BlocksCleaner. ownUser reports whether this
// instance is responsible for the given tenant; cfgProvider supplies
// per-tenant configuration (retention, partial-block deletion delay, ...).
func NewBlocksCleaner(cfg BlocksCleanerConfig, bucketClient objstore.Bucket, ownUser func(userID string) (bool, error), cfgProvider ConfigProvider, logger log.Logger, reg prometheus.Registerer) *BlocksCleaner {
	c := &BlocksCleaner{
		cfg:            cfg,
		bucketClient:   bucketClient,
		tenantsScanner: bucket.NewTenantsScanner(bucketClient, ownUser, logger),
		ownUser:        ownUser,
		cfgProvider:    cfgProvider,
		singleFlight:   concurrency.NewLimitedConcurrencySingleFlight(cfg.CleanupConcurrency),
		logger:         log.With(logger, "component", "cleaner"),
		runsStarted: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "pyroscope_compactor_block_cleanup_started_total",
			Help: "Total number of blocks cleanup runs started.",
		}),
		runsCompleted: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "pyroscope_compactor_block_cleanup_completed_total",
			Help: "Total number of blocks cleanup runs successfully completed.",
		}),
		runsFailed: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "pyroscope_compactor_block_cleanup_failed_total",
			Help: "Total number of blocks cleanup runs failed.",
		}),
		runsLastSuccess: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "pyroscope_compactor_block_cleanup_last_successful_run_timestamp_seconds",
			Help: "Unix timestamp of the last successful blocks cleanup run.",
		}),
		blocksCleanedTotal: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "pyroscope_compactor_blocks_cleaned_total",
			Help: "Total number of blocks deleted.",
		}),
		blocksFailedTotal: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "pyroscope_compactor_block_cleanup_failures_total",
			Help: "Total number of blocks failed to be deleted.",
		}),
		// Same metric name, distinguished by the "reason" const label.
		blocksMarkedForDeletion: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name:        blocksMarkedForDeletionName,
			Help:        blocksMarkedForDeletionHelp,
			ConstLabels: prometheus.Labels{"reason": "retention"},
		}),
		partialBlocksMarkedForDeletion: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name:        blocksMarkedForDeletionName,
			Help:        blocksMarkedForDeletionHelp,
			ConstLabels: prometheus.Labels{"reason": "partial"},
		}),

		// The following metrics don't have the "pyroscope_compactor" prefix because not strictly related to
		// the compactor. They're just tracked by the compactor because it's the most logical place where these
		// metrics can be tracked.
		tenantBlocks: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
			Name: "pyroscope_bucket_blocks_count",
			Help: "Total number of blocks in the bucket. Includes blocks marked for deletion, but not partial blocks.",
		}, []string{"user", "compaction_level"}),
		tenantMarkedBlocks: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
			Name: "pyroscope_bucket_blocks_marked_for_deletion_count",
			Help: "Total number of blocks marked for deletion in the bucket.",
		}, []string{"user"}),
		tenantPartialBlocks: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
			Name: "pyroscope_bucket_blocks_partials_count",
			Help: "Total number of partial blocks.",
		}, []string{"user"}),
		tenantBucketIndexLastUpdate: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
			Name: "pyroscope_bucket_index_last_successful_update_timestamp_seconds",
			Help: "Timestamp of the last successful update of a tenant's bucket index.",
		}, []string{"user"}),
	}

	c.Service = services.NewTimerService(cfg.CleanupInterval, c.starting, c.ticker, c.stopping)

	return c
}

// stopping waits for all in-flight per-tenant cleanup jobs before the service stops.
func (c *BlocksCleaner) stopping(error) error {
	c.singleFlight.Wait()
	return nil
}

func (c *BlocksCleaner) starting(ctx context.Context) error {
	// Run an initial cleanup in starting state. (Note that compactor no longer waits
	// for blocks cleaner to finish starting before it starts compactions.)
	c.runCleanup(ctx, false)

	return nil
}

// ticker is invoked by the timer service every CleanupInterval and triggers an
// asynchronous cleanup run, so a slow run does not block the service loop.
func (c *BlocksCleaner) ticker(ctx context.Context) error {
	c.runCleanup(ctx, true)

	return nil
}

// runCleanup discovers the owned tenants and cleans them up. When async is true,
// the per-tenant work runs in a separate goroutine.
func (c *BlocksCleaner) runCleanup(ctx context.Context, async bool) {
	// Wrap logger with some unique ID so if runCleanUp does run in parallel with itself, we can
	// at least differentiate the logs in this function for each run.
	logger := log.With(c.logger,
		"run_id", strconv.FormatInt(time.Now().Unix(), 10),
		"task", "clean_up_users",
	)

	c.instrumentStartedCleanupRun(logger)

	allUsers, isDeleted, err := c.refreshOwnedUsers(ctx)
	if err != nil {
		c.instrumentFinishedCleanupRun(err, logger)
		return
	}

	doCleanup := func() {
		err := c.cleanUsers(ctx, allUsers, isDeleted, logger)
		c.instrumentFinishedCleanupRun(err, logger)
	}

	if async {
		go doCleanup()
	} else {
		doCleanup()
	}
}

func (c *BlocksCleaner) instrumentStartedCleanupRun(logger log.Logger) {
	level.Info(logger).Log("msg", "started blocks cleanup and maintenance")
	c.runsStarted.Inc()
}

// instrumentFinishedCleanupRun records the outcome of a run. A canceled run is
// logged but counted neither as completed nor as failed.
func (c *BlocksCleaner) instrumentFinishedCleanupRun(err error, logger log.Logger) {
	if err == nil {
		level.Info(logger).Log("msg", "successfully completed blocks cleanup and maintenance")
		c.runsCompleted.Inc()
		c.runsLastSuccess.SetToCurrentTime()
	} else if errors.Is(err, context.Canceled) {
		level.Info(logger).Log("msg", "canceled blocks cleanup and maintenance", "err", err)
		return
	} else {
		level.Error(logger).Log("msg", "failed to run blocks cleanup and maintenance", "err", err.Error())
		c.runsFailed.Inc()
	}
}

// refreshOwnedUsers is not required to be concurrency safe, but a single instance of this function
// could run concurrently with the cleanup job for any tenant.
// It returns all discovered tenants (active plus deleted) and the set of deleted ones.
func (c *BlocksCleaner) refreshOwnedUsers(ctx context.Context) ([]string, map[string]bool, error) {
	users, deleted, err := c.tenantsScanner.ScanTenants(ctx)
	if err != nil {
		return nil, nil, errors.Wrap(err, "failed to discover users from bucket")
	}

	isActive := util.StringsMap(users)
	isDeleted := util.StringsMap(deleted)
	allUsers := append(users, deleted...)

	// Delete per-tenant metrics for all tenants not belonging anymore to this shard.
	// Such tenants have been moved to a different shard, so their updated metrics will
	// be exported by the new shard.
	for _, userID := range c.lastOwnedUsers {
		if !isActive[userID] && !isDeleted[userID] {
			c.tenantBlocks.DeleteLabelValues(userID)
			c.tenantMarkedBlocks.DeleteLabelValues(userID)
			c.tenantPartialBlocks.DeleteLabelValues(userID)
			c.tenantBucketIndexLastUpdate.DeleteLabelValues(userID)
		}
	}
	c.lastOwnedUsers = allUsers
	return allUsers, isDeleted, nil
}

// cleanUsers must be concurrency-safe because some invocations may take longer and overlap with the next periodic invocation.
func (c *BlocksCleaner) cleanUsers(ctx context.Context, allUsers []string, isDeleted map[string]bool, logger log.Logger) error {
	return c.singleFlight.ForEachNotInFlight(ctx, allUsers, func(ctx context.Context, userID string) error {
		own, err := c.ownUser(userID)
		if err != nil || !own {
			// This returns error only if err != nil. ForEachUser keeps working for other users.
			return errors.Wrap(err, "check own user")
		}

		userLogger := util.LoggerWithUserID(userID, logger)
		if isDeleted[userID] {
			return errors.Wrapf(c.deleteUserMarkedForDeletion(ctx, userID, userLogger), "failed to delete user marked for deletion: %s", userID)
		}
		return errors.Wrapf(c.cleanUser(ctx, userID, userLogger), "failed to delete blocks for user: %s", userID)
	})
}

// deleteRemainingData removes any additional files that may remain when a user has no blocks. Should only
// be called when there are no more blocks remaining.
func (c *BlocksCleaner) deleteRemainingData(ctx context.Context, userBucket objstore.Bucket, userID string, userLogger log.Logger) error {
	// Delete bucket index
	if err := bucketindex.DeleteIndex(ctx, c.bucketClient, userID, c.cfgProvider); err != nil {
		return errors.Wrap(err, "failed to delete bucket index file")
	}
	level.Info(userLogger).Log("msg", "deleted bucket index for tenant with no blocks remaining")

	// Delete markers folder
	if deleted, err := objstore.DeletePrefix(ctx, userBucket, block.MarkersPathname, userLogger); err != nil {
		return errors.Wrap(err, "failed to delete marker files")
	} else if deleted > 0 {
		level.Info(userLogger).Log("msg", "deleted marker files for tenant with no blocks remaining", "count", deleted)
	}

	return nil
}

// deleteUserMarkedForDeletion removes blocks and remaining data for tenant marked for deletion.
func (c *BlocksCleaner) deleteUserMarkedForDeletion(ctx context.Context, userID string, userLogger log.Logger) error {
	userBucket := objstore.NewTenantBucketClient(userID, c.bucketClient, c.cfgProvider)

	level.Info(userLogger).Log("msg", "deleting blocks for tenant marked for deletion")

	// We immediately delete the bucket index, to signal to its consumers that
	// the tenant has "no blocks" in the storage.
	if err := bucketindex.DeleteIndex(ctx, c.bucketClient, userID, c.cfgProvider); err != nil {
		return err
	}
	c.tenantBucketIndexLastUpdate.DeleteLabelValues(userID)

	var deletedBlocks, failed int
	err := userBucket.Iter(ctx, "", func(name string) error {
		if err := ctx.Err(); err != nil {
			return err
		}

		id, ok := block.IsBlockDir(name)
		if !ok {
			// Not a block directory (e.g. markers folder): skip it.
			return nil
		}

		err := block.Delete(ctx, userLogger, userBucket, id)
		if err != nil {
			failed++
			c.blocksFailedTotal.Inc()
			level.Warn(userLogger).Log("msg", "failed to delete block", "block", id, "err", err)
			return nil // Continue with other blocks.
		}

		deletedBlocks++
		c.blocksCleanedTotal.Inc()
		level.Info(userLogger).Log("msg", "deleted block", "block", id)
		return nil
	})
	if err != nil {
		return err
	}

	if failed > 0 {
		// The number of blocks left in the storage is equal to the number of blocks we failed
		// to delete. We also consider them all marked for deletion given the next run will try
		// to delete them again.
		c.tenantBlocks.WithLabelValues(userID).Set(float64(failed))
		c.tenantMarkedBlocks.WithLabelValues(userID).Set(float64(failed))
		c.tenantPartialBlocks.WithLabelValues(userID).Set(0)

		return errors.Errorf("failed to delete %d blocks", failed)
	}

	// Given all blocks have been deleted, we can also remove the metrics.
	c.tenantBlocks.DeleteLabelValues(userID)
	c.tenantMarkedBlocks.DeleteLabelValues(userID)
	c.tenantPartialBlocks.DeleteLabelValues(userID)

	if deletedBlocks > 0 {
		level.Info(userLogger).Log("msg", "deleted blocks for tenant marked for deletion", "deletedBlocks", deletedBlocks)
	}

	mark, err := bucket.ReadTenantDeletionMark(ctx, c.bucketClient, userID)
	if err != nil {
		return errors.Wrap(err, "failed to read tenant deletion mark")
	}
	if mark == nil {
		return fmt.Errorf("cannot find tenant deletion mark anymore")
	}

	// If we have just deleted some blocks, update "finished" time. Also update "finished" time if it wasn't set yet, but there are no blocks.
	// Note: this UPDATES the tenant deletion mark. Components that use caching bucket will NOT SEE this update,
	// but that is fine -- they only check whether tenant deletion marker exists or not.
	if deletedBlocks > 0 || mark.FinishedTime == 0 {
		level.Info(userLogger).Log("msg", "updating finished time in tenant deletion mark")
		mark.FinishedTime = time.Now().Unix()
		return errors.Wrap(bucket.WriteTenantDeletionMark(ctx, c.bucketClient, userID, c.cfgProvider, mark), "failed to update tenant deletion mark")
	}

	// Keep the deletion mark and markers around for TenantCleanupDelay after the
	// deletion finished, so other components can still observe it.
	if time.Since(time.Unix(mark.FinishedTime, 0)) < c.cfg.TenantCleanupDelay {
		return nil
	}

	level.Info(userLogger).Log("msg", "cleaning up remaining blocks data for tenant marked for deletion")

	// Let's do final cleanup of markers.
	if deleted, err := objstore.DeletePrefix(ctx, userBucket, block.MarkersPathname, userLogger); err != nil {
		return errors.Wrap(err, "failed to delete marker files")
	} else if deleted > 0 {
		level.Info(userLogger).Log("msg", "deleted marker files for tenant marked for deletion", "count", deleted)
	}

	return nil
}

// cleanUser performs a full cleanup and maintenance pass for an active tenant:
// applies retention, deletes blocks marked for deletion, cleans up partial
// blocks, and updates (or removes) the tenant's bucket index and metrics.
func (c *BlocksCleaner) cleanUser(ctx context.Context, userID string, userLogger log.Logger) (returnErr error) {
	userBucket := objstore.NewTenantBucketClient(userID, c.bucketClient, c.cfgProvider)
	startTime := time.Now()

	level.Info(userLogger).Log("msg", "started blocks cleanup and maintenance")
	defer func() {
		if returnErr != nil {
			level.Warn(userLogger).Log("msg", "failed blocks cleanup and maintenance", "err", returnErr, "duration", time.Since(startTime))
		} else {
			level.Info(userLogger).Log("msg", "completed blocks cleanup and maintenance", "duration", time.Since(startTime))
		}
	}()

	// Read the bucket index.
	idx, err := bucketindex.ReadIndex(ctx, c.bucketClient, userID, c.cfgProvider, userLogger)
	if errors.Is(err, bucketindex.ErrIndexCorrupted) {
		// A corrupted index is rebuilt from scratch below (idx stays nil).
		level.Warn(userLogger).Log("msg", "found a corrupted bucket index, recreating it")
	} else if err != nil && !errors.Is(err, bucketindex.ErrIndexNotFound) {
		return err
	}

	level.Info(userLogger).Log("msg", "fetched existing bucket index")

	// Mark blocks for future deletion based on the retention period for the user.
	// Note doing this before UpdateIndex, so it reads in the deletion marks.
	// The trade-off being that retention is not applied if the index has to be
	// built, but this is rare.
	if idx != nil {
		// We do not want to stop the remaining work in the cleaner if an
		// error occurs here. Errors are logged in the function.
		retention := c.cfgProvider.CompactorBlocksRetentionPeriod(userID)
		c.applyUserRetentionPeriod(ctx, idx, retention, userBucket, userLogger)
	}

	// Generate an updated in-memory version of the bucket index.
	w := bucketindex.NewUpdater(c.bucketClient, userID, c.cfgProvider, userLogger)
	idx, partials, err := w.UpdateIndex(ctx, idx)
	if err != nil {
		return err
	}

	c.deleteBlocksMarkedForDeletion(ctx, idx, userBucket, userLogger)

	// Partial blocks with a deletion mark can be cleaned up. This is a best effort, so we don't return
	// error if the cleanup of partial blocks fail.
	if len(partials) > 0 {
		var partialDeletionCutoffTime time.Time // zero value, disabled.
		if delay, valid := c.cfgProvider.CompactorPartialBlockDeletionDelay(userID); delay > 0 {
			// enable cleanup of partial blocks without deletion marker
			partialDeletionCutoffTime = time.Now().Add(-delay)
		} else if !valid {
			level.Warn(userLogger).Log("msg", "partial blocks deletion has been disabled for tenant because the delay has been set lower than the minimum value allowed", "minimum", validation.MinCompactorPartialBlockDeletionDelay)
		}

		c.cleanUserPartialBlocks(ctx, partials, idx, partialDeletionCutoffTime, userBucket, userLogger)
		level.Info(userLogger).Log("msg", "cleaned up partial blocks", "partials", len(partials))
	}

	// If there are no more blocks, clean up any remaining files
	// Otherwise upload the updated index to the storage.
	if c.cfg.NoBlocksFileCleanupEnabled && len(idx.Blocks) == 0 {
		if err := c.deleteRemainingData(ctx, userBucket, userID, userLogger); err != nil {
			return err
		}
	} else {
		if err := bucketindex.WriteIndex(ctx, c.bucketClient, userID, c.cfgProvider, idx); err != nil {
			return err
		}
	}

	c.updateBlockCountMetrics(userID, idx)
	c.tenantMarkedBlocks.WithLabelValues(userID).Set(float64(len(idx.BlockDeletionMarks)))
	c.tenantPartialBlocks.WithLabelValues(userID).Set(float64(len(partials)))
	c.tenantBucketIndexLastUpdate.WithLabelValues(userID).SetToCurrentTime()

	return nil
}

// updateBlockCountMetrics exports the per-compaction-level block counts for the
// tenant, clearing stale series for levels that no longer have any blocks.
func (c *BlocksCleaner) updateBlockCountMetrics(userID string, idx *bucketindex.Index) {
	blocksPerCompactionLevel := make(map[int]int)
	for _, blk := range idx.Blocks {
		blocksPerCompactionLevel[blk.CompactionLevel]++
	}
	c.tenantBlocks.DeletePartialMatch(map[string]string{"user": userID})
	for compactionLevel, count := range blocksPerCompactionLevel {
		c.tenantBlocks.WithLabelValues(userID, strconv.Itoa(compactionLevel)).Set(float64(count))
	}
}

// Concurrently deletes blocks marked for deletion, and removes blocks from index.
// Only blocks whose deletion mark is older than DeletionDelay are deleted; failures
// are logged and counted but never abort the loop.
func (c *BlocksCleaner) deleteBlocksMarkedForDeletion(ctx context.Context, idx *bucketindex.Index, userBucket objstore.Bucket, userLogger log.Logger) {
	blocksToDelete := make([]ulid.ULID, 0, len(idx.BlockDeletionMarks))

	// Collect blocks marked for deletion into buffered channel.
	for _, mark := range idx.BlockDeletionMarks {
		if time.Since(mark.GetDeletionTime()).Seconds() <= c.cfg.DeletionDelay.Seconds() {
			continue
		}
		blocksToDelete = append(blocksToDelete, mark.ID)
	}

	// mu guards concurrent mutation of the shared index from worker goroutines.
	var mu sync.Mutex

	// We don't want to return errors from our function, as that would stop ForEach loop early.
	_ = concurrency.ForEachJob(ctx, len(blocksToDelete), c.cfg.DeleteBlocksConcurrency, func(ctx context.Context, jobIdx int) error {
		blockID := blocksToDelete[jobIdx]

		if err := block.Delete(ctx, userLogger, userBucket, blockID); err != nil {
			c.blocksFailedTotal.Inc()
			level.Warn(userLogger).Log("msg", "failed to delete block marked for deletion", "block", blockID, "err", err)
			return nil
		}

		// Remove the block from the bucket index too.
		mu.Lock()
		idx.RemoveBlock(blockID)
		mu.Unlock()

		c.blocksCleanedTotal.Inc()
		level.Info(userLogger).Log("msg", "deleted block marked for deletion", "block", blockID)
		return nil
	})
}

// cleanUserPartialBlocks deletes partial blocks which are safe to be deleted. The provided index is updated accordingly.
// partialDeletionCutoffTime, if not zero, is used to find blocks without deletion marker that were last modified before this time. Such blocks will be marked for deletion.
func (c *BlocksCleaner) cleanUserPartialBlocks(ctx context.Context, partials map[ulid.ULID]error, idx *bucketindex.Index, partialDeletionCutoffTime time.Time, userBucket objstore.InstrumentedBucket, userLogger log.Logger) {
	// Collect all blocks with missing meta.json into buffered channel.
	blocks := make([]ulid.ULID, 0, len(partials))

	for blockID, blockErr := range partials {
		// We can safely delete only blocks which are partial because the meta.json is missing.
		if !errors.Is(blockErr, bucketindex.ErrBlockMetaNotFound) {
			continue
		}
		blocks = append(blocks, blockID)
	}

	// mu guards both the shared index/partials maps and the collected slice below.
	var mu sync.Mutex
	var partialBlocksWithoutDeletionMarker []ulid.ULID

	// We don't want to return errors from our function, as that would stop ForEach loop early.
	_ = concurrency.ForEachJob(ctx, len(blocks), c.cfg.DeleteBlocksConcurrency, func(ctx context.Context, jobIdx int) error {
		blockID := blocks[jobIdx]

		// We can safely delete only partial blocks with a deletion mark.
		err := block.ReadMarker(ctx, userLogger, userBucket, blockID.String(), &block.DeletionMark{})
		if errors.Is(err, block.ErrorMarkerNotFound) {
			// No deletion mark: remember the block so it can be checked for staleness below.
			mu.Lock()
			partialBlocksWithoutDeletionMarker = append(partialBlocksWithoutDeletionMarker, blockID)
			mu.Unlock()
			return nil
		}
		if err != nil {
			level.Warn(userLogger).Log("msg", "error reading partial block deletion mark", "block", blockID, "err", err)
			return nil
		}

		// Hard-delete partial blocks having a deletion mark, even if the deletion threshold has not
		// been reached yet.
		if err := block.Delete(ctx, userLogger, userBucket, blockID); err != nil {
			c.blocksFailedTotal.Inc()
			level.Warn(userLogger).Log("msg", "error deleting partial block marked for deletion", "block", blockID, "err", err)
			return nil
		}

		// Remove the block from the bucket index too.
		mu.Lock()
		idx.RemoveBlock(blockID)
		delete(partials, blockID)
		mu.Unlock()

		c.blocksCleanedTotal.Inc()
		level.Info(userLogger).Log("msg", "deleted partial block marked for deletion", "block", blockID)
		return nil
	})

	// Check if partial blocks are older than delay period, and mark for deletion
	if !partialDeletionCutoffTime.IsZero() {
		for _, blockID := range partialBlocksWithoutDeletionMarker {
			lastModified, err := stalePartialBlockLastModifiedTime(ctx, blockID, userBucket, partialDeletionCutoffTime)
			if err != nil {
				level.Warn(userLogger).Log("msg", "failed while determining if partial block should be marked for deletion", "block", blockID, "err", err)
				continue
			}
			if !lastModified.IsZero() {
				level.Info(userLogger).Log("msg", "stale partial block found: marking block for deletion", "block", blockID, "last modified", lastModified)
				if err := block.MarkForDeletion(ctx, userLogger, userBucket, blockID, "stale partial block", false, c.partialBlocksMarkedForDeletion); err != nil {
					level.Warn(userLogger).Log("msg", "failed to mark partial block for deletion", "block", blockID, "err", err)
				}
			}
		}
	}
}

// applyUserRetentionPeriod marks blocks for deletion which have aged past the retention period.
func (c *BlocksCleaner) applyUserRetentionPeriod(ctx context.Context, idx *bucketindex.Index, retention time.Duration, userBucket objstore.Bucket, userLogger log.Logger) {
	// The retention period of zero is a special value indicating to never delete.
	if retention <= 0 {
		return
	}

	blocks := listBlocksOutsideRetentionPeriod(idx, time.Now().Add(-retention))

	// Attempt to mark all blocks. It is not critical if a marking fails, as
	// the cleaner will retry applying the retention in its next cycle.
	for _, b := range blocks {
		level.Info(userLogger).Log("msg", "applied retention: marking block for deletion", "block", b.ID, "maxTime", b.MaxTime)
		if err := block.MarkForDeletion(ctx, userLogger, userBucket, b.ID, fmt.Sprintf("block exceeding retention of %v", retention), false, c.blocksMarkedForDeletion); err != nil {
			level.Warn(userLogger).Log("msg", "failed to mark block for deletion", "block", b.ID, "err", err)
		}
	}
	level.Info(userLogger).Log("msg", "marked blocks for deletion", "num_blocks", len(blocks), "retention", retention.String())
}

// listBlocksOutsideRetentionPeriod determines the blocks which have aged past
// the specified retention period, and are not already marked for deletion.
func listBlocksOutsideRetentionPeriod(idx *bucketindex.Index, threshold time.Time) (result bucketindex.Blocks) {
	// Whilst re-marking a block is not harmful, it is wasteful and generates
	// a warning log message. Use the block deletion marks already in-memory
	// to prevent marking blocks already marked for deletion.
	marked := make(map[ulid.ULID]struct{}, len(idx.BlockDeletionMarks))
	for _, d := range idx.BlockDeletionMarks {
		marked[d.ID] = struct{}{}
	}

	for _, b := range idx.Blocks {
		// MaxTime is in milliseconds; convert to seconds for time.Unix.
		maxTime := time.Unix(int64(b.MaxTime)/1000, 0)
		if maxTime.Before(threshold) {
			if _, isMarked := marked[b.ID]; !isMarked {
				result = append(result, b)
			}
		}
	}

	return
}

// errStopIter is a sentinel used to abort bucket iteration early without
// treating it as a failure.
var errStopIter = errors.New("stop iteration")

// stalePartialBlockLastModifiedTime returns the most recent last modified time of a stale partial block, or the zero value of time.Time if the provided block wasn't a stale partial block
func stalePartialBlockLastModifiedTime(ctx context.Context, blockID ulid.ULID, userBucket objstore.InstrumentedBucket, partialDeletionCutoffTime time.Time) (time.Time, error) {
	var lastModified time.Time
	err := userBucket.WithExpectedErrs(func(err error) bool {
		return errors.Is(err, errStopIter) // sentinel error
	}).Iter(ctx, blockID.String(), func(name string) error {
		if strings.HasSuffix(name, thanos_objstore.DirDelim) {
			// Skip directory entries; only object attributes matter.
			return nil
		}
		attrib, err := userBucket.Attributes(ctx, name)
		if err != nil {
			return errors.Wrapf(err, "failed to get attributes for %s", name)
		}
		// Any object modified after the cutoff means the block is not stale: stop early.
		if attrib.LastModified.After(partialDeletionCutoffTime) {
			return errStopIter
		}
		if attrib.LastModified.After(lastModified) {
			lastModified = attrib.LastModified
		}
		return nil
	}, thanos_objstore.WithRecursiveIter())

	if errors.Is(err, errStopIter) {
		// Not stale: report the zero time with no error.
		return time.Time{}, nil
	}
	return lastModified, err
}