// SPDX-License-Identifier: AGPL-3.0-only
// Provenance-includes-location: https://github.com/grafana/mimir/blob/main/pkg/compactor/compactor.go
// Provenance-includes-license: Apache-2.0
// Provenance-includes-copyright: The Cortex Authors.

// Package compactor implements the multi-tenant blocks compactor service.
package compactor

import (
	"context"
	"flag"
	"fmt"
	"hash/fnv"
	"math/rand"
	"os"
	"path"
	"path/filepath"
	"strings"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/backoff"
	"github.com/grafana/dskit/flagext"
	"github.com/grafana/dskit/kv"
	"github.com/grafana/dskit/ring"
	"github.com/grafana/dskit/services"
	"github.com/opentracing/opentracing-go"
	"github.com/opentracing/opentracing-go/ext"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"go.uber.org/atomic"

	"github.com/grafana/pyroscope/pkg/objstore"
	"github.com/grafana/pyroscope/pkg/phlaredb/block"
	"github.com/grafana/pyroscope/pkg/phlaredb/bucket"
	"github.com/grafana/pyroscope/pkg/tenant"
	"github.com/grafana/pyroscope/pkg/util"
)

const (
	// ringKey is the key under which we store the compactors ring in the KVStore.
	ringKey = "compactor"

	// ringAutoForgetUnhealthyPeriods is how many consecutive timeout periods an unhealthy instance
	// in the ring will be automatically removed after.
	ringAutoForgetUnhealthyPeriods = 10
)

// Name and help text of the blocks-marked-for-deletion counter, kept as
// constants so collectors registered with different const labels share the
// same metric family.
const (
	blocksMarkedForDeletionName = "pyroscope_compactor_blocks_marked_for_deletion_total"
	blocksMarkedForDeletionHelp = "Total number of blocks marked for deletion in compactor."
)

var (
	// errInvalidBlockRanges and errInvalidBlockDuration are format strings
	// expanded by errors.Errorf in Config.Validate.
	errInvalidBlockRanges                 = "compactor block range periods should be divisible by the previous one, but %s is not divisible by %s"
	errInvalidBlockDuration               = "compactor block range periods should be divisible by the max block duration, but %s is not divisible by %s"
	errInvalidCompactionOrder             = fmt.Errorf("unsupported compaction order (supported values: %s)", strings.Join(CompactionOrders, ", "))
	errInvalidCompactionSplitBy           = fmt.Errorf("unsupported compaction split by (supported values: %s)", strings.Join(CompactionSplitBys, ", "))
	errInvalidMaxOpeningBlocksConcurrency = fmt.Errorf("invalid max-opening-blocks-concurrency value, must be positive")

	// RingOp is the ring operation used to select compactor instances; only
	// ACTIVE instances are eligible.
	RingOp = ring.NewOp([]ring.InstanceState{ring.ACTIVE}, nil)
)

// BlocksGrouperFactory builds and returns the grouper to use to compact a tenant's blocks.
type BlocksGrouperFactory func(
	ctx context.Context,
	cfg Config,
	cfgProvider ConfigProvider,
	userID string,
	logger log.Logger,
	reg prometheus.Registerer,
) Grouper

// BlocksCompactorFactory builds and returns the compactor to use to compact a tenant's blocks.
type BlocksCompactorFactory func(
	ctx context.Context,
	cfg Config,
	cfgProvider ConfigProvider,
	userID string,
	logger log.Logger,
	metrics *CompactorMetrics,
) (Compactor, error)

// BlocksPlannerFactory builds and returns the planner to use to plan a tenant's compaction jobs.
type BlocksPlannerFactory func(
	cfg Config,
) Planner

// Config holds the MultitenantCompactor config.
type Config struct {
	BlockRanges                DurationList  `yaml:"block_ranges" category:"advanced"`
	BlockSyncConcurrency       int           `yaml:"block_sync_concurrency" category:"advanced"`
	MetaSyncConcurrency        int           `yaml:"meta_sync_concurrency" category:"advanced"`
	DataDir                    string        `yaml:"data_dir"`
	CompactionInterval         time.Duration `yaml:"compaction_interval" category:"advanced"`
	CompactionRetries          int           `yaml:"compaction_retries" category:"advanced"`
	CompactionConcurrency      int           `yaml:"compaction_concurrency" category:"advanced"`
	CompactionWaitPeriod       time.Duration `yaml:"first_level_compaction_wait_period"`
	CleanupInterval            time.Duration `yaml:"cleanup_interval" category:"advanced"`
	CleanupConcurrency         int           `yaml:"cleanup_concurrency" category:"advanced"`
	DeletionDelay              time.Duration `yaml:"deletion_delay" category:"advanced"`
	TenantCleanupDelay         time.Duration `yaml:"tenant_cleanup_delay" category:"advanced"`
	MaxCompactionTime          time.Duration `yaml:"max_compaction_time" category:"advanced"`
	NoBlocksFileCleanupEnabled bool          `yaml:"no_blocks_file_cleanup_enabled" category:"experimental"`
	DownsamplerEnabled         bool          `yaml:"downsampler_enabled" category:"advanced"`

	// Compactor concurrency options
	MaxOpeningBlocksConcurrency int `yaml:"max_opening_blocks_concurrency" category:"advanced"` // Number of goroutines opening blocks before compaction.
	// MaxClosingBlocksConcurrency int `yaml:"max_closing_blocks_concurrency" category:"advanced"` // Max number of blocks that can be closed concurrently during split compaction. Note that closing of newly compacted block uses a lot of memory for writing index.

	EnabledTenants  flagext.StringSliceCSV `yaml:"enabled_tenants" category:"advanced"`
	DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants" category:"advanced"`

	// Compactors sharding.
	ShardingRing RingConfig `yaml:"sharding_ring"`

	CompactionJobsOrder string `yaml:"compaction_jobs_order" category:"advanced"`
	CompactionSplitBy   string `yaml:"compaction_split_by" category:"advanced"`

	// No need to add options to customize the retry backoff,
	// given the defaults should be fine, but allow to override
	// it in tests.
	retryMinBackoff time.Duration `yaml:"-"`
	retryMaxBackoff time.Duration `yaml:"-"`

	// Allow downstream projects to customise the blocks compactor.
	BlocksGrouperFactory   BlocksGrouperFactory   `yaml:"-"`
	BlocksCompactorFactory BlocksCompactorFactory `yaml:"-"`
	BlocksPlannerFactory   BlocksPlannerFactory   `yaml:"-"`
}

// RegisterFlags registers the MultitenantCompactor flags.
func (cfg *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) {
	cfg.ShardingRing.RegisterFlags(f, logger)

	// Defaults: BlockRanges is the pre-set default for the -compactor.block-ranges
	// flag registered below; the retry backoffs have no flags (test-only override).
	cfg.BlockRanges = DurationList{1 * time.Hour, 2 * time.Hour, 8 * time.Hour}
	cfg.retryMinBackoff = 10 * time.Second
	cfg.retryMaxBackoff = time.Minute

	f.Var(&cfg.BlockRanges, "compactor.block-ranges", "List of compaction time ranges.")
	f.IntVar(&cfg.BlockSyncConcurrency, "compactor.block-sync-concurrency", 8, "Number of Go routines to use when downloading blocks for compaction and uploading resulting blocks.")
	f.IntVar(&cfg.MetaSyncConcurrency, "compactor.meta-sync-concurrency", 20, "Number of Go routines to use when syncing block meta files from the long term storage.")
	f.StringVar(&cfg.DataDir, "compactor.data-dir", "./data-compactor", "Directory to temporarily store blocks during compaction. This directory is not required to be persisted between restarts.")
	f.DurationVar(&cfg.CompactionInterval, "compactor.compaction-interval", 30*time.Minute, "The frequency at which the compaction runs")
	f.DurationVar(&cfg.MaxCompactionTime, "compactor.max-compaction-time", time.Hour, "Max time for starting compactions for a single tenant. After this time no new compactions for the tenant are started before next compaction cycle. This can help in multi-tenant environments to avoid single tenant using all compaction time, but also in single-tenant environments to force new discovery of blocks more often. 0 = disabled.")
	f.IntVar(&cfg.CompactionRetries, "compactor.compaction-retries", 3, "How many times to retry a failed compaction within a single compaction run.")
	f.IntVar(&cfg.CompactionConcurrency, "compactor.compaction-concurrency", 1, "Max number of concurrent compactions running.")
	f.DurationVar(&cfg.CompactionWaitPeriod, "compactor.first-level-compaction-wait-period", 25*time.Minute, "How long the compactor waits before compacting first-level blocks that are uploaded by the ingesters. This configuration option allows for the reduction of cases where the compactor begins to compact blocks before all ingesters have uploaded their blocks to the storage.")
	f.DurationVar(&cfg.CleanupInterval, "compactor.cleanup-interval", 15*time.Minute, "How frequently compactor should run blocks cleanup and maintenance, as well as update the bucket index.")
	f.IntVar(&cfg.CleanupConcurrency, "compactor.cleanup-concurrency", 20, "Max number of tenants for which blocks cleanup and maintenance should run concurrently.")
	f.StringVar(&cfg.CompactionJobsOrder, "compactor.compaction-jobs-order", CompactionOrderOldestFirst, fmt.Sprintf("The sorting to use when deciding which compaction jobs should run first for a given tenant. Supported values are: %s.", strings.Join(CompactionOrders, ", ")))
	f.StringVar(&cfg.CompactionSplitBy, "compactor.compaction-split-by", CompactionSplitByFingerprint, fmt.Sprintf("Experimental: The strategy to use when splitting blocks during compaction. Supported values are: %s.", strings.Join(CompactionSplitBys, ", ")))
	f.DurationVar(&cfg.DeletionDelay, "compactor.deletion-delay", 12*time.Hour, "Time before a block marked for deletion is deleted from bucket. "+
		"If not 0, blocks will be marked for deletion and compactor component will permanently delete blocks marked for deletion from the bucket. "+
		"If 0, blocks will be deleted straight away. Note that deleting blocks immediately can cause query failures.")
	// f.DurationVar(&cfg.TenantCleanupDelay, "compactor.tenant-cleanup-delay", 6*time.Hour, "For tenants marked for deletion, this is time between deleting of last block, and doing final cleanup (marker files, debug files) of the tenant.")
	f.BoolVar(&cfg.NoBlocksFileCleanupEnabled, "compactor.no-blocks-file-cleanup-enabled", false, "If enabled, will delete the bucket-index, markers and debug files in the tenant bucket when there are no blocks left in the index.")
	f.BoolVar(&cfg.DownsamplerEnabled, "compactor.downsampler-enabled", false, "If enabled, the compactor will downsample profiles in blocks at compaction level 3 and above. The original profiles are also kept.")
	// compactor concurrency options
	f.IntVar(&cfg.MaxOpeningBlocksConcurrency, "compactor.max-opening-blocks-concurrency", 16, "Number of goroutines opening blocks before compaction.")

	f.Var(&cfg.EnabledTenants, "compactor.enabled-tenants", "Comma separated list of tenants that can be compacted. If specified, only these tenants will be compacted by compactor, otherwise all tenants can be compacted. Subject to sharding.")
	f.Var(&cfg.DisabledTenants, "compactor.disabled-tenants", "Comma separated list of tenants that cannot be compacted by this compactor. If specified, and compactor would normally pick given tenant for compaction (via -compactor.enabled-tenants or sharding), it will be ignored instead.")
}

// Validate checks the compactor configuration: the first block range must be
// divisible by maxBlockDuration, each subsequent range by the previous one,
// and the concurrency/order/split-by options must be within supported values.
func (cfg *Config) Validate(maxBlockDuration time.Duration) error {
	if len(cfg.BlockRanges) > 0 && cfg.BlockRanges[0]%maxBlockDuration != 0 {
		return errors.Errorf(errInvalidBlockDuration, cfg.BlockRanges[0].String(), maxBlockDuration.String())
	}
	// Each block range period should be divisible by the previous one.
	for i := 1; i < len(cfg.BlockRanges); i++ {
		if cfg.BlockRanges[i]%cfg.BlockRanges[i-1] != 0 {
			return errors.Errorf(errInvalidBlockRanges, cfg.BlockRanges[i].String(), cfg.BlockRanges[i-1].String())
		}
	}

	if cfg.MaxOpeningBlocksConcurrency < 1 {
		return errInvalidMaxOpeningBlocksConcurrency
	}

	if !util.StringsContain(CompactionOrders, cfg.CompactionJobsOrder) {
		return errInvalidCompactionOrder
	}

	if !util.StringsContain(CompactionSplitBys, cfg.CompactionSplitBy) {
		return errInvalidCompactionSplitBy
	}

	return nil
}

// ConfigProvider defines the per-tenant config provider for the MultitenantCompactor.
type ConfigProvider interface {
	objstore.TenantConfigProvider

	// CompactorBlocksRetentionPeriod returns the retention period for a given user.
	CompactorBlocksRetentionPeriod(user string) time.Duration

	// CompactorSplitAndMergeShards returns the number of shards to use when splitting blocks.
	CompactorSplitAndMergeShards(userID string) int

	// CompactorSplitAndMergeStageSize returns the number of stages split shards will be written to.
	CompactorSplitAndMergeStageSize(userID string) int

	// CompactorSplitGroups returns the number of groups that blocks used for splitting should
	// be grouped into. Different groups are then split by different jobs.
	CompactorSplitGroups(userID string) int

	// CompactorTenantShardSize returns number of compactors that this user can use. 0 = all compactors.
	CompactorTenantShardSize(userID string) int

	// CompactorPartialBlockDeletionDelay returns the partial block delay time period for a given user,
	// and whether the configured value was valid. If the value wasn't valid, the returned delay is the default one
	// and the caller is responsible to warn the Mimir operator about it.
	CompactorPartialBlockDeletionDelay(userID string) (delay time.Duration, valid bool)

	// CompactorDownsamplerEnabled returns true if the downsampler is enabled for a given user.
	CompactorDownsamplerEnabled(userId string) bool
}

// MultitenantCompactor is a multi-tenant TSDB blocks compactor based on Thanos.
type MultitenantCompactor struct {
	services.Service

	compactorCfg Config
	cfgProvider  ConfigProvider
	logger       log.Logger
	parentLogger log.Logger
	registerer   prometheus.Registerer

	// Functions that creates bucket client, grouper, planner and compactor using the context.
	// Useful for injecting mock objects from tests.
	blocksGrouperFactory   BlocksGrouperFactory
	blocksCompactorFactory BlocksCompactorFactory
	blocksPlannerFactory   BlocksPlannerFactory

	// Blocks cleaner is responsible to hard delete blocks marked for deletion.
	blocksCleaner *BlocksCleaner

	// Underlying compactor and planner used to compact TSDB blocks.
	blocksPlanner Planner

	// Client used to run operations on the bucket storing blocks.
	bucketClient objstore.Bucket

	// Ring used for sharding compactions.
	ringLifecycler         *ring.BasicLifecycler
	ring                   *ring.Ring
	ringSubservices        *services.Manager
	ringSubservicesWatcher *services.FailureWatcher

	shardingStrategy shardingStrategy
	jobsOrder        JobsOrderFunc

	// Metrics.
	compactionRunsStarted          prometheus.Counter
	compactionRunsCompleted        prometheus.Counter
	compactionRunsErred            prometheus.Counter
	compactionRunsShutdown         prometheus.Counter
	compactionRunsLastSuccess      prometheus.Gauge
	compactionRunDiscoveredTenants prometheus.Gauge
	compactionRunSkippedTenants    prometheus.Gauge
	compactionRunSucceededTenants  prometheus.Gauge
	compactionRunFailedTenants     prometheus.Gauge
	compactionRunInterval          prometheus.Gauge
	blocksMarkedForDeletion        prometheus.Counter

	// Metrics shared across all BucketCompactor instances.
	bucketCompactorMetrics *BucketCompactorMetrics

	// TSDB syncer metrics
	syncerMetrics *aggregatedSyncerMetrics

	// Block upload metrics
	blockUploadBlocks      *prometheus.GaugeVec
	blockUploadBytes       *prometheus.GaugeVec
	blockUploadFiles       *prometheus.GaugeVec
	blockUploadValidations atomic.Int64

	// Compactor metrics
	compactorMetrics *CompactorMetrics
}

// NewMultitenantCompactor makes a new MultitenantCompactor.
func NewMultitenantCompactor(compactorCfg Config, bucketClient objstore.Bucket, cfgProvider ConfigProvider, logger log.Logger, registerer prometheus.Registerer) (*MultitenantCompactor, error) {
	// Configure the compactor and grouper factories only if they weren't already set by a downstream project.
	if compactorCfg.BlocksGrouperFactory == nil || compactorCfg.BlocksCompactorFactory == nil {
		configureSplitAndMergeCompactor(&compactorCfg)
	}

	blocksGrouperFactory := compactorCfg.BlocksGrouperFactory
	blocksCompactorFactory := compactorCfg.BlocksCompactorFactory
	blocksPlannerFactory := compactorCfg.BlocksPlannerFactory

	c, err := newMultitenantCompactor(compactorCfg, bucketClient, cfgProvider, logger, registerer, blocksGrouperFactory, blocksCompactorFactory, blocksPlannerFactory)
	if err != nil {
		return nil, errors.Wrap(err, "failed to create blocks compactor")
	}

	return c, nil
}

// newMultitenantCompactor builds the compactor, registering all of its metrics
// against the given registerer. It does not start any background work; that
// happens in the starting/running service phases.
func newMultitenantCompactor(
	compactorCfg Config,
	bucketClient objstore.Bucket,
	cfgProvider ConfigProvider,
	logger log.Logger,
	registerer prometheus.Registerer,
	blocksGrouperFactory BlocksGrouperFactory,
	blocksCompactorFactory BlocksCompactorFactory,
	blocksPlannerFactory BlocksPlannerFactory,
) (*MultitenantCompactor, error) {
	c := &MultitenantCompactor{
		compactorCfg:           compactorCfg,
		cfgProvider:            cfgProvider,
		parentLogger:           logger,
		logger:                 log.With(logger, "component", "compactor"),
		registerer:             registerer,
		syncerMetrics:          newAggregatedSyncerMetrics(registerer),
		bucketClient:           bucketClient,
		blocksGrouperFactory:   blocksGrouperFactory,
		blocksCompactorFactory: blocksCompactorFactory,
		blocksPlannerFactory:   blocksPlannerFactory,
		compactionRunsStarted: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
			Name: "pyroscope_compactor_runs_started_total",
			Help: "Total number of compaction runs started.",
		}),
		compactionRunsCompleted: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
			Name: "pyroscope_compactor_runs_completed_total",
			Help: "Total number of compaction runs successfully completed.",
		}),
		// Failed runs share one metric name, split by the "reason" const label
		// ("error" here, "shutdown" below).
		compactionRunsErred: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
			Name:        "pyroscope_compactor_runs_failed_total",
			Help:        "Total number of compaction runs failed.",
			ConstLabels: map[string]string{"reason": "error"},
		}),
		compactionRunsShutdown: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
			Name:        "pyroscope_compactor_runs_failed_total",
			Help:        "Total number of compaction runs failed.",
			ConstLabels: map[string]string{"reason": "shutdown"},
		}),
		compactionRunsLastSuccess: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
			Name: "pyroscope_compactor_last_successful_run_timestamp_seconds",
			Help: "Unix timestamp of the last successful compaction run.",
		}),
		compactionRunDiscoveredTenants: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
			Name: "pyroscope_compactor_tenants_discovered",
			Help: "Number of tenants discovered during the current compaction run. Reset to 0 when compactor is idle.",
		}),
		compactionRunSkippedTenants: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
			Name: "pyroscope_compactor_tenants_skipped",
			Help: "Number of tenants skipped during the current compaction run. Reset to 0 when compactor is idle.",
		}),
		compactionRunSucceededTenants: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
			Name: "pyroscope_compactor_tenants_processing_succeeded",
			Help: "Number of tenants successfully processed during the current compaction run. Reset to 0 when compactor is idle.",
		}),
		compactionRunFailedTenants: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
			Name: "pyroscope_compactor_tenants_processing_failed",
			Help: "Number of tenants failed processing during the current compaction run. Reset to 0 when compactor is idle.",
		}),
		compactionRunInterval: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
			Name: "pyroscope_compactor_compaction_interval_seconds",
			Help: "The configured interval on which compaction is run in seconds. Useful when compared to the last successful run metric to accurately detect multiple failed compaction runs.",
		}),
		blocksMarkedForDeletion: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
			Name:        blocksMarkedForDeletionName,
			Help:        blocksMarkedForDeletionHelp,
			ConstLabels: prometheus.Labels{"reason": "compaction"},
		}),
		blockUploadBlocks: promauto.With(registerer).NewGaugeVec(prometheus.GaugeOpts{
			Name: "pyroscope_block_upload_api_blocks_total",
			Help: "Total number of blocks successfully uploaded and validated using the block upload API.",
		}, []string{"user"}),
		blockUploadBytes: promauto.With(registerer).NewGaugeVec(prometheus.GaugeOpts{
			Name: "pyroscope_block_upload_api_bytes_total",
			Help: "Total number of bytes from successfully uploaded and validated blocks using block upload API.",
		}, []string{"user"}),
		blockUploadFiles: promauto.With(registerer).NewGaugeVec(prometheus.GaugeOpts{
			Name: "pyroscope_block_upload_api_files_total",
			Help: "Total number of files from successfully uploaded and validated blocks using block upload API.",
		}, []string{"user"}),
		compactorMetrics: newCompactorMetrics(registerer),
	}

	// Gauge backed by the atomic counter incremented while validations run.
	promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{
		Name: "pyroscope_block_upload_validations_in_progress",
		Help: "Number of block upload validations currently running.",
	}, func() float64 {
		return float64(c.blockUploadValidations.Load())
	})

	c.bucketCompactorMetrics = NewBucketCompactorMetrics(c.blocksMarkedForDeletion, registerer)

	if len(compactorCfg.EnabledTenants) > 0 {
		level.Info(c.logger).Log("msg", "compactor using enabled users", "enabled", strings.Join(compactorCfg.EnabledTenants, ", "))
	}
	if len(compactorCfg.DisabledTenants) > 0 {
		level.Info(c.logger).Log("msg", "compactor using disabled users", "disabled", strings.Join(compactorCfg.DisabledTenants, ", "))
	}

	c.jobsOrder = GetJobsOrderFunction(compactorCfg.CompactionJobsOrder)
	if c.jobsOrder == nil {
		return nil, errInvalidCompactionOrder
	}

	c.Service = services.NewBasicService(c.starting, c.running, c.stopping)

	// The last successful compaction run metric is exposed as seconds since epoch, so we need to use seconds for this metric.
	c.compactionRunInterval.Set(c.compactorCfg.CompactionInterval.Seconds())

	return c, nil
}

// Start the compactor.
func (c *MultitenantCompactor) starting(ctx context.Context) error {
	var err error

	c.blocksPlanner = c.blocksPlannerFactory(c.compactorCfg)

	// Wrap the bucket client to write block deletion marks in the global location too.
	c.bucketClient = block.BucketWithGlobalMarkers(c.bucketClient)

	// Initialize the compactors ring if sharding is enabled.
	c.ring, c.ringLifecycler, err = newRingAndLifecycler(c.compactorCfg.ShardingRing, c.logger, c.registerer)
	if err != nil {
		return err
	}

	c.ringSubservices, err = services.NewManager(c.ringLifecycler, c.ring)
	if err != nil {
		return errors.Wrap(err, "unable to create compactor ring dependencies")
	}

	c.ringSubservicesWatcher = services.NewFailureWatcher()
	c.ringSubservicesWatcher.WatchManager(c.ringSubservices)
	if err = c.ringSubservices.StartAsync(ctx); err != nil {
		return errors.Wrap(err, "unable to start compactor ring dependencies")
	}

	ctxTimeout, cancel := context.WithTimeout(ctx, c.compactorCfg.ShardingRing.WaitActiveInstanceTimeout)
	defer cancel()
	if err = c.ringSubservices.AwaitHealthy(ctxTimeout); err != nil {
		return errors.Wrap(err, "unable to start compactor ring dependencies")
	}

	// If sharding is enabled we should wait until this instance is ACTIVE within the ring.
	// This MUST be done before starting any other component depending on the users scanner,
	// because the users scanner depends on the ring (to check whether a user belongs to this shard or not).
	level.Info(c.logger).Log("msg", "waiting until compactor is ACTIVE in the ring")
	if err = ring.WaitInstanceState(ctxTimeout, c.ring, c.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil {
		return errors.Wrap(err, "compactor failed to become ACTIVE in the ring")
	}

	level.Info(c.logger).Log("msg", "compactor is ACTIVE in the ring")

	// In the event of a cluster cold start or scale up of 2+ compactor instances at the same
	// time, we may end up in a situation where each new compactor instance starts at a slightly
	// different time and thus each one starts with a different state of the ring. It's better
	// to just wait a short time for ring stability.
	if c.compactorCfg.ShardingRing.WaitStabilityMinDuration > 0 {
		minWaiting := c.compactorCfg.ShardingRing.WaitStabilityMinDuration
		maxWaiting := c.compactorCfg.ShardingRing.WaitStabilityMaxDuration

		level.Info(c.logger).Log("msg", "waiting until compactor ring topology is stable", "min_waiting", minWaiting.String(), "max_waiting", maxWaiting.String())
		if err := ring.WaitRingStability(ctx, c.ring, RingOp, minWaiting, maxWaiting); err != nil {
			// Stability is best-effort: proceed anyway after the max waiting time.
			level.Warn(c.logger).Log("msg", "compactor ring topology is not stable after the max waiting time, proceeding anyway")
		} else {
			level.Info(c.logger).Log("msg", "compactor ring topology is stable")
		}
	}

	allowedTenants := tenant.NewAllowedTenants(c.compactorCfg.EnabledTenants, c.compactorCfg.DisabledTenants)
	c.shardingStrategy = newSplitAndMergeShardingStrategy(allowedTenants, c.ring, c.ringLifecycler, c.cfgProvider)

	// Create the blocks cleaner (service).
	c.blocksCleaner = NewBlocksCleaner(BlocksCleanerConfig{
		DeletionDelay:              c.compactorCfg.DeletionDelay,
		CleanupInterval:            util.DurationWithJitter(c.compactorCfg.CleanupInterval, 0.1),
		CleanupConcurrency:         c.compactorCfg.CleanupConcurrency,
		TenantCleanupDelay:         c.compactorCfg.TenantCleanupDelay,
		DeleteBlocksConcurrency:    defaultDeleteBlocksConcurrency,
		NoBlocksFileCleanupEnabled: c.compactorCfg.NoBlocksFileCleanupEnabled,
	}, c.bucketClient, c.shardingStrategy.blocksCleanerOwnUser, c.cfgProvider, c.parentLogger, c.registerer)

	// Start blocks cleaner asynchronously, don't wait until initial cleanup is finished.
	if err := c.blocksCleaner.StartAsync(ctx); err != nil {
		c.ringSubservices.StopAsync()
		return errors.Wrap(err, "failed to start the blocks cleaner")
	}

	return nil
}

// newRingAndLifecycler builds the ring client and the lifecycler that registers
// this compactor instance in the ring (auto-forgetting unhealthy instances and
// leaving the ring on stop).
func newRingAndLifecycler(cfg RingConfig, logger log.Logger, reg prometheus.Registerer) (*ring.Ring, *ring.BasicLifecycler, error) {
	reg = prometheus.WrapRegistererWithPrefix("pyroscope_", reg)
	kvStore, err := kv.NewClient(cfg.Common.KVStore, ring.GetCodec(), kv.RegistererWithKVName(reg, "compactor-lifecycler"), logger)
	if err != nil {
		return nil, nil, errors.Wrap(err, "failed to initialize compactors' KV store")
	}

	lifecyclerCfg, err := cfg.ToBasicLifecyclerConfig(logger)
	if err != nil {
		return nil, nil, errors.Wrap(err, "failed to build compactors' lifecycler config")
	}

	// Delegates are layered: register as ACTIVE, leave the ring on stopping,
	// and auto-forget instances unhealthy for too many heartbeat periods.
	var delegate ring.BasicLifecyclerDelegate
	delegate = ring.NewInstanceRegisterDelegate(ring.ACTIVE, lifecyclerCfg.NumTokens)
	delegate = ring.NewLeaveOnStoppingDelegate(delegate, logger)
	delegate = ring.NewAutoForgetDelegate(ringAutoForgetUnhealthyPeriods*lifecyclerCfg.HeartbeatTimeout, delegate, logger)

	compactorsLifecycler, err := ring.NewBasicLifecycler(lifecyclerCfg, "compactor", ringKey, kvStore, delegate, logger, reg)
	if err != nil {
		return nil, nil, errors.Wrap(err, "failed to initialize compactors' lifecycler")
	}

	compactorsRing, err := ring.New(cfg.toRingConfig(), "compactor", ringKey, logger, reg)
	if err != nil {
		return nil, nil, errors.Wrap(err, "failed to initialize compactors' ring client")
	}

	return compactorsRing, compactorsLifecycler, nil
}

// stopping shuts down the blocks cleaner and the ring subservices.
func (c *MultitenantCompactor) stopping(_ error) error {
	ctx := context.Background()

	services.StopAndAwaitTerminated(ctx, c.blocksCleaner) //nolint:errcheck
	if c.ringSubservices != nil {
		return services.StopManagerAndAwaitStopped(ctx, c.ringSubservices)
	}
	return nil
}

// running triggers compaction runs on a jittered interval until the context is
// canceled or a ring subservice fails.
func (c *MultitenantCompactor) running(ctx context.Context) error {
	// Run an initial compaction before starting the interval.
	c.compactUsers(ctx)

	ticker := time.NewTicker(util.DurationWithJitter(c.compactorCfg.CompactionInterval, 0.05))
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			c.compactUsers(ctx)
		case <-ctx.Done():
			return nil
		case err := <-c.ringSubservicesWatcher.Chan():
			return errors.Wrap(err, "compactor subservice failed")
		}
	}
}

// compactUsers discovers tenants in the bucket and compacts the blocks of every
// tenant owned by this compactor instance, updating the run metrics as it goes.
func (c *MultitenantCompactor) compactUsers(ctx context.Context) {
	sp, ctx := opentracing.StartSpanFromContext(ctx, "CompactUsers")
	defer sp.Finish()

	succeeded := false
	compactionErrorCount := 0

	c.compactionRunsStarted.Inc()

	defer func() {
		// Classify the run: completed, interrupted by shutdown, or errored.
		if succeeded && compactionErrorCount == 0 {
			c.compactionRunsCompleted.Inc()
			c.compactionRunsLastSuccess.SetToCurrentTime()
		} else if compactionErrorCount == 0 {
			c.compactionRunsShutdown.Inc()
		} else {
			c.compactionRunsErred.Inc()
		}
		sp.LogKV("error_count", compactionErrorCount)

		// Reset progress metrics once done.
		c.compactionRunDiscoveredTenants.Set(0)
		c.compactionRunSkippedTenants.Set(0)
		c.compactionRunSucceededTenants.Set(0)
		c.compactionRunFailedTenants.Set(0)
	}()

	level.Info(c.logger).Log("msg", "discovering users from bucket")
	users, err := c.discoverUsersWithRetries(ctx)
	if err != nil {
		// A canceled context means shutdown, not failure: don't count it as an error.
		if !errors.Is(err, context.Canceled) {
			compactionErrorCount++
			level.Error(c.logger).Log("msg", "failed to discover users from bucket", "err", err)
		}
		return
	}
	sp.LogKV("discovered_user_count", len(users))
	level.Info(c.logger).Log("msg", "discovered users from bucket", "users", len(users))
	c.compactionRunDiscoveredTenants.Set(float64(len(users)))

	// When starting multiple compactor replicas nearly at the same time, running in a cluster with
	// a large number of tenants, we may end up in a situation where the 1st user is compacted by
	// multiple replicas at the same time. Shuffling users helps reduce the likelihood this will happen.
	rand.Shuffle(len(users), func(i, j int) {
		users[i], users[j] = users[j], users[i]
	})

	// Keep track of users owned by this shard, so that we can delete the local files for all other users.
	ownedUsers := map[string]struct{}{}
	defer func() {
		sp.LogKV("owned_user_count", len(ownedUsers))
	}()
	for _, userID := range users {
		// Ensure the context has not been canceled (ie. compactor shutdown has been triggered).
		if ctx.Err() != nil {
			// NOTE(review): "err" here is the stale (nil at this point) error from
			// discoverUsersWithRetries; logging ctx.Err() was probably intended — confirm.
			level.Info(c.logger).Log("msg", "interrupting compaction of user blocks", "err", err)
			return
		}

		// Ensure the user ID belongs to our shard.
		if owned, err := c.shardingStrategy.compactorOwnUser(userID); err != nil {
			c.compactionRunSkippedTenants.Inc()
			level.Warn(c.logger).Log("msg", "unable to check if user is owned by this shard", "tenant", userID, "err", err)
			continue
		} else if !owned {
			c.compactionRunSkippedTenants.Inc()
			level.Debug(c.logger).Log("msg", "skipping user because it is not owned by this shard", "tenant", userID)
			continue
		}

		ownedUsers[userID] = struct{}{}

		// Skip tenants that are being deleted; their blocks are handled by the cleanup path.
		if markedForDeletion, err := bucket.TenantDeletionMarkExists(ctx, c.bucketClient, userID); err != nil {
			c.compactionRunSkippedTenants.Inc()
			level.Warn(c.logger).Log("msg", "unable to check if user is marked for deletion", "tenant", userID, "err", err)
			continue
		} else if markedForDeletion {
			c.compactionRunSkippedTenants.Inc()
			level.Debug(c.logger).Log("msg", "skipping user because it is marked for deletion", "tenant", userID)
			continue
		}

		level.Info(c.logger).Log("msg", "starting compaction of user blocks", "tenant", userID)

		if err = c.compactUserWithRetries(ctx, userID); err != nil {
			switch {
			case errors.Is(err, context.Canceled):
				// We don't want to count shutdowns as failed compactions because we will pick up with the rest of the compaction after the restart.
				level.Info(c.logger).Log("msg", "compaction for user was interrupted by a shutdown", "tenant", userID)
				return
			default:
				c.compactionRunFailedTenants.Inc()
				compactionErrorCount++
				level.Error(c.logger).Log("msg", "failed to compact user blocks", "tenant", userID, "err", err)
			}
			continue
		}

		c.compactionRunSucceededTenants.Inc()
		level.Info(c.logger).Log("msg", "successfully compacted user blocks", "tenant", userID)
	}

	// Delete local files for unowned tenants, if there are any. This cleans up
	// leftover local files for tenants that belong to different compactors now,
	// or have been deleted completely.
	for userID := range c.listTenantsWithMetaSyncDirectories() {
		if _, owned := ownedUsers[userID]; owned {
			continue
		}

		dir := c.metaSyncDirForUser(userID)
		s, err := os.Stat(dir)
		if err != nil {
			// A missing directory is the expected case; only warn on other stat failures.
			if !os.IsNotExist(err) {
				level.Warn(c.logger).Log("msg", "failed to stat local directory with user data", "dir", dir, "err", err)
			}
			continue
		}

		if s.IsDir() {
			err := os.RemoveAll(dir)
			if err == nil {
				level.Info(c.logger).Log("msg", "deleted directory for user not owned by this shard", "dir", dir)
			} else {
				level.Warn(c.logger).Log("msg", "failed to delete directory for user not owned by this shard", "dir", dir, "err", err)
			}
		}
	}

	succeeded = true
}

// compactUserWithRetries compacts all blocks of a single tenant, retrying with
// exponential backoff (bounded by CompactionRetries) on failure. It returns the
// error of the last attempt, or nil on success. Each attempt gets its own
// tracing span; the loop-scoped ctx shadows the outer one so spans don't nest
// across retries.
func (c *MultitenantCompactor) compactUserWithRetries(ctx context.Context, userID string) error {
	var lastErr error

	retries := backoff.New(ctx, backoff.Config{
		MinBackoff: c.compactorCfg.retryMinBackoff,
		MaxBackoff: c.compactorCfg.retryMaxBackoff,
		MaxRetries: c.compactorCfg.CompactionRetries,
	})

	for retries.Ongoing() {
		sp, ctx := opentracing.StartSpanFromContext(ctx, "CompactUser", opentracing.Tag{Key: "tenantID", Value: userID})
		lastErr = c.compactUser(ctx, userID)
		if lastErr == nil {
			sp.Finish()
			return nil
		}
		ext.LogError(sp, lastErr)
		sp.Finish()
		retries.Wait()
	}

	return lastErr
}

// compactUser runs one compaction pass over all blocks of a single tenant:
// it builds the meta fetcher and syncer (with dedup and no-compaction filters),
// the per-tenant compactor/planner/grouper, and then executes the bucket
// compactor against the tenant's bucket.
func (c *MultitenantCompactor) compactUser(ctx context.Context, userID string) error {
	userBucket := objstore.NewTenantBucketClient(userID, c.bucketClient, c.cfgProvider)
	// Per-run registry; its metrics are folded into the shared syncer metrics on return.
	reg := prometheus.NewRegistry()
	defer c.syncerMetrics.gatherThanosSyncerMetrics(reg)

	userLogger := util.LoggerWithUserID(userID, c.logger)

	// Filters out duplicate blocks that can be formed from two or more overlapping
	// blocks that fully submatches the source blocks of the older blocks.
	deduplicateBlocksFilter := NewShardAwareDeduplicateFilter()

	// List of filters to apply (order matters).
	fetcherFilters := []block.MetadataFilter{
		deduplicateBlocksFilter,
		// removes blocks that should not be compacted due to being marked so.
		NewNoCompactionMarkFilter(userBucket, true),
	}

	fetcher, err := block.NewMetaFetcher(
		userLogger,
		c.compactorCfg.MetaSyncConcurrency,
		userBucket,
		c.metaSyncDirForUser(userID),
		reg,
		fetcherFilters,
	)
	if err != nil {
		return err
	}

	syncer, err := NewMetaSyncer(
		userLogger,
		reg,
		userBucket,
		fetcher,
		deduplicateBlocksFilter,
		c.blocksMarkedForDeletion,
	)
	if err != nil {
		return errors.Wrap(err, "failed to create syncer")
	}

	// Create blocks compactor dependencies.
	blocksCompactor, err := c.blocksCompactorFactory(ctx, c.compactorCfg, c.cfgProvider, userID, c.logger, c.compactorMetrics)
	if err != nil {
		return errors.Wrap(err, "failed to initialize compactor dependencies")
	}

	compactor, err := NewBucketCompactor(
		userLogger,
		syncer,
		c.blocksGrouperFactory(ctx, c.compactorCfg, c.cfgProvider, userID, userLogger, reg),
		c.blocksPlanner,
		blocksCompactor,
		path.Join(c.compactorCfg.DataDir, "compact"),
		userBucket,
		c.compactorCfg.CompactionConcurrency,
		c.shardingStrategy.ownJob,
		c.jobsOrder,
		c.compactorCfg.CompactionWaitPeriod,
		c.compactorCfg.BlockSyncConcurrency,
		c.bucketCompactorMetrics,
	)
	if err != nil {
		return errors.Wrap(err, "failed to create bucket compactor")
	}

	if err := compactor.Compact(ctx, c.compactorCfg.MaxCompactionTime); err != nil {
		return errors.Wrap(err, "compaction")
	}

	return nil
}

// discoverUsersWithRetries lists the tenant IDs present in the bucket,
// retrying with backoff (same retry budget as compaction) on failure. It
// returns the error of the last attempt if all retries are exhausted.
func (c *MultitenantCompactor) discoverUsersWithRetries(ctx context.Context) ([]string, error) {
	sp, ctx := opentracing.StartSpanFromContext(ctx, "DiscoverUsers")
	defer sp.Finish()

	var lastErr error

	retries := backoff.New(ctx, backoff.Config{
		MinBackoff: c.compactorCfg.retryMinBackoff,
		MaxBackoff: c.compactorCfg.retryMaxBackoff,
		MaxRetries: c.compactorCfg.CompactionRetries,
	})

	for retries.Ongoing() {
		var users []string

		users, lastErr = c.discoverUsers(ctx)
		if lastErr == nil {
			return users, nil
		}

		retries.Wait()
	}

	return nil, lastErr
}

// discoverUsers performs a single listing of tenant IDs from the bucket.
func (c *MultitenantCompactor) discoverUsers(ctx context.Context) ([]string, error) {
	return bucket.ListUsers(ctx, c.bucketClient)
}

// shardingStrategy describes whether compactor "owns" given user or job.
type shardingStrategy interface {
	compactorOwnUser(userID string) (bool, error)
	// blocksCleanerOwnUser must be concurrency-safe
	blocksCleanerOwnUser(userID string) (bool, error)
	ownJob(job *Job) (bool, error)
}

// splitAndMergeShardingStrategy is used by split-and-merge compactor when configured with sharding.
// All compactors from user's shard own the user for compaction purposes, and plan jobs.
// Each job is only owned and executed by single compactor.
// Only one of compactors from user's shard will do cleanup.
type splitAndMergeShardingStrategy struct {
	allowedTenants *tenant.AllowedTenants
	ring           *ring.Ring
	ringLifecycler *ring.BasicLifecycler
	configProvider ConfigProvider
}

// newSplitAndMergeShardingStrategy builds a splitAndMergeShardingStrategy from its dependencies.
func newSplitAndMergeShardingStrategy(allowedTenants *tenant.AllowedTenants, ring *ring.Ring, ringLifecycler *ring.BasicLifecycler, configProvider ConfigProvider) *splitAndMergeShardingStrategy {
	return &splitAndMergeShardingStrategy{
		allowedTenants: allowedTenants,
		ring:           ring,
		ringLifecycler: ringLifecycler,
		configProvider: configProvider,
	}
}

// Only single instance in the subring can run blocks cleaner for given user. blocksCleanerOwnUser is concurrency-safe.
842 func (s *splitAndMergeShardingStrategy) blocksCleanerOwnUser(userID string) (bool, error) { 843 if !s.allowedTenants.IsAllowed(userID) { 844 return false, nil 845 } 846 847 r := s.ring.ShuffleShard(userID, s.configProvider.CompactorTenantShardSize(userID)) 848 849 return instanceOwnsTokenInRing(r, s.ringLifecycler.GetInstanceAddr(), userID) 850 } 851 852 // ALL compactors should plan jobs for all users. 853 func (s *splitAndMergeShardingStrategy) compactorOwnUser(userID string) (bool, error) { 854 if !s.allowedTenants.IsAllowed(userID) { 855 return false, nil 856 } 857 858 r := s.ring.ShuffleShard(userID, s.configProvider.CompactorTenantShardSize(userID)) 859 860 return r.HasInstance(s.ringLifecycler.GetInstanceID()), nil 861 } 862 863 // Only single compactor should execute the job. 864 func (s *splitAndMergeShardingStrategy) ownJob(job *Job) (bool, error) { 865 ok, err := s.compactorOwnUser(job.UserID()) 866 if err != nil || !ok { 867 return ok, err 868 } 869 870 r := s.ring.ShuffleShard(job.UserID(), s.configProvider.CompactorTenantShardSize(job.UserID())) 871 872 return instanceOwnsTokenInRing(r, s.ringLifecycler.GetInstanceAddr(), job.ShardingKey()) 873 } 874 875 func instanceOwnsTokenInRing(r ring.ReadRing, instanceAddr string, key string) (bool, error) { 876 // Hash the key. 877 hasher := fnv.New32a() 878 _, _ = hasher.Write([]byte(key)) 879 hash := hasher.Sum32() 880 881 // Check whether this compactor instance owns the token. 882 rs, err := r.Get(hash, RingOp, nil, nil, nil) 883 if err != nil { 884 return false, err 885 } 886 887 if len(rs.Instances) != 1 { 888 return false, fmt.Errorf("unexpected number of compactors in the shard (expected 1, got %d)", len(rs.Instances)) 889 } 890 891 return rs.Instances[0].Addr == instanceAddr, nil 892 } 893 894 const compactorMetaPrefix = "compactor-meta-" 895 896 // metaSyncDirForUser returns directory to store cached meta files. 
897 // The fetcher stores cached metas in the "meta-syncer/" sub directory, 898 // but we prefix it with "compactor-meta-" in order to guarantee no clashing with 899 // the directory used by the Thanos Syncer, whatever is the user ID. 900 func (c *MultitenantCompactor) metaSyncDirForUser(userID string) string { 901 return filepath.Join(c.compactorCfg.DataDir, compactorMetaPrefix+userID) 902 } 903 904 // This function returns tenants with meta sync directories found on local disk. On error, it returns nil map. 905 func (c *MultitenantCompactor) listTenantsWithMetaSyncDirectories() map[string]struct{} { 906 result := map[string]struct{}{} 907 908 files, err := os.ReadDir(c.compactorCfg.DataDir) 909 if err != nil { 910 return nil 911 } 912 913 for _, f := range files { 914 if !f.IsDir() { 915 continue 916 } 917 918 if !strings.HasPrefix(f.Name(), compactorMetaPrefix) { 919 continue 920 } 921 922 result[f.Name()[len(compactorMetaPrefix):]] = struct{}{} 923 } 924 925 return result 926 }