// SPDX-License-Identifier: AGPL-3.0-only
// Provenance-includes-location: https://github.com/grafana/mimir/blob/main/pkg/compactor/bucket_compactor.go
// Provenance-includes-license: Apache-2.0
// Provenance-includes-copyright: The Cortex Authors.

package compactor

import (
	"context"
	"fmt"
	"os"
	"path"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/concurrency"
	"github.com/grafana/dskit/multierror"
	"github.com/grafana/dskit/runutil"
	"github.com/oklog/ulid/v2"
	"github.com/opentracing/opentracing-go"
	"github.com/opentracing/opentracing-go/ext"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/prometheus/model/labels"
	"go.uber.org/atomic"

	"github.com/grafana/pyroscope/pkg/objstore"
	"github.com/grafana/pyroscope/pkg/objstore/client"
	"github.com/grafana/pyroscope/pkg/objstore/providers/filesystem"
	"github.com/grafana/pyroscope/pkg/phlaredb"
	"github.com/grafana/pyroscope/pkg/phlaredb/block"
)

// DeduplicateFilter is a block.MetadataFilter that additionally remembers the
// IDs it dropped as duplicates, so callers (see Syncer.GarbageCollect) can
// mark those blocks for deletion afterwards.
type DeduplicateFilter interface {
	block.MetadataFilter

	// DuplicateIDs returns IDs of duplicate blocks generated by last call to Filter method.
	DuplicateIDs() []ulid.ULID
}

// Syncer synchronizes block metas from a bucket into a local directory.
// It sorts them into compaction groups based on equal label sets.
type Syncer struct {
	logger  log.Logger
	bkt     objstore.Bucket
	fetcher *block.MetaFetcher
	// mtx guards blocks: SyncMetas replaces the map wholesale, while
	// GarbageCollect prunes entries in place.
	mtx                     sync.Mutex
	blocks                  map[ulid.ULID]*block.Meta
	metrics                 *syncerMetrics
	deduplicateBlocksFilter DeduplicateFilter
}

// syncerMetrics holds the Prometheus instruments updated by Syncer.
type syncerMetrics struct {
	garbageCollections        prometheus.Counter
	garbageCollectionFailures prometheus.Counter
	garbageCollectionDuration prometheus.Histogram
	// blocksMarkedForDeletion is supplied by the caller and shared with other
	// components (see newSyncerMetrics); it is not registered here.
	blocksMarkedForDeletion prometheus.Counter
}

// newSyncerMetrics registers the garbage-collection metrics on reg and reuses
// the caller-provided blocksMarkedForDeletion counter as-is.
func newSyncerMetrics(reg prometheus.Registerer, blocksMarkedForDeletion prometheus.Counter) *syncerMetrics {
	var m syncerMetrics

	m.garbageCollections = promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "thanos_compact_garbage_collection_total",
		Help: "Total number of garbage collection operations.",
	})
	m.garbageCollectionFailures = promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "thanos_compact_garbage_collection_failures_total",
		Help: "Total number of failed garbage collection operations.",
	})
	m.garbageCollectionDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
		Name:    "thanos_compact_garbage_collection_duration_seconds",
		Help:    "Time it took to perform garbage collection iteration.",
		Buckets: []float64{0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120, 240, 360, 720},
	})

	m.blocksMarkedForDeletion = blocksMarkedForDeletion

	return &m
}

// NewMetaSyncer returns a new Syncer for the given Bucket and directory.
// Blocks must be at least as old as the sync delay for being considered.
89 func NewMetaSyncer(logger log.Logger, reg prometheus.Registerer, bkt objstore.Bucket, fetcher *block.MetaFetcher, deduplicateBlocksFilter DeduplicateFilter, blocksMarkedForDeletion prometheus.Counter) (*Syncer, error) { 90 if logger == nil { 91 logger = log.NewNopLogger() 92 } 93 return &Syncer{ 94 logger: logger, 95 bkt: bkt, 96 fetcher: fetcher, 97 blocks: map[ulid.ULID]*block.Meta{}, 98 metrics: newSyncerMetrics(reg, blocksMarkedForDeletion), 99 deduplicateBlocksFilter: deduplicateBlocksFilter, 100 }, nil 101 } 102 103 // SyncMetas synchronizes local state of block metas with what we have in the bucket. 104 func (s *Syncer) SyncMetas(ctx context.Context) error { 105 sp, ctx := opentracing.StartSpanFromContext(ctx, "SyncMetas") 106 defer sp.Finish() 107 s.mtx.Lock() 108 defer s.mtx.Unlock() 109 110 // While fetching blocks, we filter out blocks that were marked for deletion. 111 // No deletion delay is used -- all blocks with deletion marker are ignored, and not considered for compaction. 112 metas, _, err := s.fetcher.FetchWithoutMarkedForDeletion(ctx) 113 if err != nil { 114 return err 115 } 116 s.blocks = metas 117 return nil 118 } 119 120 // Metas returns loaded metadata blocks since last sync. 121 func (s *Syncer) Metas() map[ulid.ULID]*block.Meta { 122 s.mtx.Lock() 123 defer s.mtx.Unlock() 124 125 return s.blocks 126 } 127 128 // GarbageCollect marks blocks for deletion from bucket if their data is available as part of a 129 // block with a higher compaction level. 130 // Call to SyncMetas function is required to populate duplicateIDs in duplicateBlocksFilter. 
// GarbageCollect iterates the duplicate IDs reported by the dedup filter and
// marks each corresponding block for deletion in the bucket, stopping at the
// first failure. It holds the Syncer mutex for the whole run.
func (s *Syncer) GarbageCollect(ctx context.Context) error {
	sp, ctx := opentracing.StartSpanFromContext(ctx, "GarbageCollect")
	defer sp.Finish()
	s.mtx.Lock()
	defer s.mtx.Unlock()

	begin := time.Now()

	// The deduplication filter is applied after all blocks marked for deletion have been excluded
	// (with no deletion delay), so we expect that all duplicated blocks have not been marked for
	// deletion yet. Even in the remote case these blocks have already been marked for deletion,
	// the block.MarkForDeletion() call will correctly handle it.
	duplicateIDs := s.deduplicateBlocksFilter.DuplicateIDs()

	for _, id := range duplicateIDs {
		// Honor cancellation between blocks, but not mid-mark (see below).
		if ctx.Err() != nil {
			return ctx.Err()
		}

		// Spawn a new context so we always mark a block for deletion in full on shutdown.
		delCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)

		level.Info(s.logger).Log("msg", "marking outdated block for deletion", "block", id)
		err := block.MarkForDeletion(delCtx, s.logger, s.bkt, id, "outdated block", false, s.metrics.blocksMarkedForDeletion)
		cancel()
		if err != nil {
			s.metrics.garbageCollectionFailures.Inc()
			return errors.Wrapf(err, "mark block %s for deletion", id)
		}

		// Immediately update our in-memory state so no further call to SyncMetas is needed
		// after running garbage collection.
		delete(s.blocks, id)
	}
	s.metrics.garbageCollections.Inc()
	s.metrics.garbageCollectionDuration.Observe(time.Since(begin).Seconds())
	return nil
}

// Grouper is responsible to group all known blocks into compaction Job which are safe to be
// compacted concurrently.
type Grouper interface {
	// Groups returns the compaction jobs for all blocks currently known to the syncer.
	// It creates all jobs from the scratch on every call.
175 Groups(blocks map[ulid.ULID]*block.Meta) (res []*Job, err error) 176 } 177 178 // DefaultGroupKey returns a unique identifier for the group the block belongs to, based on 179 // the DefaultGrouper logic. It considers the downsampling resolution and the block's labels. 180 func DefaultGroupKey(meta block.Meta) string { 181 return defaultGroupKey(meta.Resolution, labels.FromMap(meta.Labels)) 182 } 183 184 func defaultGroupKey(res int64, lbls labels.Labels) string { 185 return fmt.Sprintf("%d@%v", res, labels.StableHash(lbls)) 186 } 187 188 func minTime(metas []*block.Meta) time.Time { 189 if len(metas) == 0 { 190 return time.Time{} 191 } 192 193 minT := metas[0].MinTime 194 for _, meta := range metas { 195 if meta.MinTime < minT { 196 minT = meta.MinTime 197 } 198 } 199 200 return time.Unix(0, int64(minT)*int64(time.Millisecond)).UTC() 201 } 202 203 func maxTime(metas []*block.Meta) time.Time { 204 if len(metas) == 0 { 205 return time.Time{} 206 } 207 208 maxT := metas[0].MaxTime 209 for _, meta := range metas { 210 if meta.MaxTime > maxT { 211 maxT = meta.MaxTime 212 } 213 } 214 215 return time.Unix(0, int64(maxT)*int64(time.Millisecond)).UTC() 216 } 217 218 // Planner returns blocks to compact. 219 type Planner interface { 220 // Plan returns a list of blocks that should be compacted into single one. 221 // The blocks can be overlapping. The provided metadata has to be ordered by minTime. 222 Plan(ctx context.Context, metasByMinTime []*block.Meta) ([]*block.Meta, error) 223 } 224 225 // Compactor provides compaction against an underlying storage of profiling data. 226 type Compactor interface { 227 // CompactWithSplitting merges and splits the source blocks into shardCount number of compacted blocks, 228 // and returns slice of block IDs. 229 // If given compacted block has no series, corresponding block ID will not be returned. 
230 CompactWithSplitting(ctx context.Context, dst string, dirs []string, shardCount, stageSize uint64) (result []ulid.ULID, _ error) 231 } 232 233 const ( 234 CompactionSplitByFingerprint = "fingerprint" 235 CompactionSplitByStacktracePartition = "stacktracePartition" 236 ) 237 238 var CompactionSplitBys = []string{CompactionSplitByFingerprint, CompactionSplitByStacktracePartition} 239 240 func getCompactionSplitBy(name string) phlaredb.SplitByFunc { 241 switch name { 242 case CompactionSplitByFingerprint: 243 return phlaredb.SplitByFingerprint 244 case CompactionSplitByStacktracePartition: 245 return phlaredb.SplitByStacktracePartition 246 default: 247 return nil 248 } 249 } 250 251 type BlockCompactor struct { 252 blockOpenConcurrency int 253 downsamplerEnabled bool 254 splitBy phlaredb.SplitByFunc 255 logger log.Logger 256 metrics *CompactorMetrics 257 } 258 259 type CompactorMetrics struct { 260 Ran *prometheus.CounterVec 261 InProgress *prometheus.GaugeVec 262 OverlappingBlocks prometheus.Counter 263 Duration *prometheus.HistogramVec 264 Size *prometheus.HistogramVec 265 Samples *prometheus.HistogramVec 266 Range *prometheus.HistogramVec 267 Split *prometheus.HistogramVec 268 } 269 270 func newCompactorMetrics(r prometheus.Registerer) *CompactorMetrics { 271 m := &CompactorMetrics{} 272 273 m.Ran = prometheus.NewCounterVec(prometheus.CounterOpts{ 274 Name: "pyroscope_compactions_total", 275 Help: "Total number of compactions that were executed per level.", 276 }, []string{"level"}) 277 m.InProgress = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 278 Name: "pyroscope_compactions_current", 279 Help: "The amount of compaction in progress per level", 280 }, []string{"level"}) 281 m.OverlappingBlocks = prometheus.NewCounter(prometheus.CounterOpts{ 282 Name: "pyroscope_vertical_compactions_total", 283 Help: "Total number of compactions done on overlapping blocks.", 284 }) 285 m.Duration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 286 Name: 
"pyroscope_compaction_duration_seconds", 287 Help: "Duration of compaction runs", 288 Buckets: prometheus.ExponentialBuckets(1, 2, 14), 289 }, []string{"level"}) 290 m.Size = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 291 Name: "pyroscope_compaction_size_bytes", 292 Help: "Final block size after compaction by level", 293 Buckets: prometheus.ExponentialBuckets(32, 1.5, 12), 294 }, []string{"level"}) 295 m.Samples = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 296 Name: "pyroscope_compaction_samples", 297 Help: "Final number of samples after compaction by level", 298 Buckets: prometheus.ExponentialBuckets(4, 1.5, 12), 299 }, []string{"level"}) 300 m.Range = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 301 Name: "pyroscope_compaction_range_seconds", 302 Help: "Final time range after compaction by level.", 303 Buckets: prometheus.ExponentialBuckets(100, 4, 10), 304 }, []string{"level"}) 305 m.Split = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 306 Name: "pyroscope_compaction_splits", 307 Help: "Compaction split factor by level.", 308 Buckets: []float64{1, 2, 4, 8, 16, 32, 64}, 309 }, []string{"level"}) 310 311 if r != nil { 312 r.MustRegister( 313 m.Ran, 314 m.InProgress, 315 m.OverlappingBlocks, 316 m.Duration, 317 m.Range, 318 m.Samples, 319 m.Size, 320 ) 321 } 322 return m 323 } 324 325 func (c *BlockCompactor) CompactWithSplitting(ctx context.Context, dest string, dirs []string, shardCount, stageSize uint64) ([]ulid.ULID, error) { 326 defer func() { 327 if err := recover(); err != nil { 328 level.Error(c.logger).Log("msg", "panic during compaction", "err", err, "dirs", strings.Join(dirs, ",")) 329 panic(err) 330 } 331 }() 332 localBucket, err := client.NewBucket(ctx, client.Config{ 333 StorageBackendConfig: client.StorageBackendConfig{ 334 Backend: client.Filesystem, 335 Filesystem: filesystem.Config{Directory: dest}, 336 }, 337 }, "local-compactor") 338 if err != nil { 339 return nil, errors.Wrap(err, "create local bucket") 340 } 
	defer localBucket.Close()

	// One reader per source dir; closed on exit. Entries stay nil if opening failed.
	readers := make([]phlaredb.BlockReader, len(dirs))
	defer func() {
		for _, b := range readers {
			if b != nil {
				if err := b.Close(); err != nil {
					level.Warn(c.logger).Log("msg", "failed to close block", "err", err)
				}
			}
		}
	}()

	err = func() error {
		sp, ctx := opentracing.StartSpanFromContext(ctx, "OpenBlocks", opentracing.Tag{Key: "concurrency", Value: c.blockOpenConcurrency})
		defer sp.Finish()
		// Open all blocks
		return concurrency.ForEachJob(ctx, len(readers), c.blockOpenConcurrency, func(ctx context.Context, idx int) error {
			dir := dirs[idx]
			meta, err := block.ReadMetaFromDir(dir)
			if err != nil {
				return errors.Wrapf(err, "failed to read meta the block dir %s", dir)
			}
			b := phlaredb.NewSingleBlockQuerierFromMeta(ctx, localBucket, meta)
			if err := b.Open(ctx); err != nil {
				return errors.Wrapf(err, "open block %s", meta.ULID)
			}
			readers[idx] = b
			return nil
		})
	}()
	if err != nil {
		return nil, err
	}
	// The output level is one above the highest input compaction level.
	currentLevel := 0
	for _, r := range readers {
		lvl := r.Meta().Compaction.Level
		if lvl > currentLevel {
			currentLevel = lvl
		}
	}
	currentLevel++
	if sp := opentracing.SpanFromContext(ctx); sp != nil {
		sp.SetTag("compaction_level", currentLevel)
	}
	start := time.Now()
	defer func() {
		c.metrics.Duration.WithLabelValues(fmt.Sprintf("%d", currentLevel)).Observe(time.Since(start).Seconds())
		c.metrics.InProgress.WithLabelValues(fmt.Sprintf("%d", currentLevel)).Dec()
	}()
	c.metrics.InProgress.WithLabelValues(fmt.Sprintf("%d", currentLevel)).Inc()
	c.metrics.Ran.WithLabelValues(fmt.Sprintf("%d", currentLevel)).Inc()
	c.metrics.Split.WithLabelValues(fmt.Sprintf("%d", currentLevel)).Observe(float64(shardCount))

	metas, err := phlaredb.CompactWithSplitting(ctx, phlaredb.CompactWithSplittingOpts{
		Src:        readers,
		Dst:        dest,
		SplitCount: shardCount,
		StageSize:          stageSize,
		SplitBy:            c.splitBy,
		DownsamplerEnabled: c.downsamplerEnabled,
		Logger:             c.logger,
	})
	if err != nil {
		return nil, errors.Wrapf(err, "compact blocks %v", dirs)
	}
	// Record per-output-block range, sample count and total file size.
	for _, m := range metas {
		c.metrics.Range.WithLabelValues(fmt.Sprintf("%d", currentLevel)).Observe(float64(m.MaxTime-m.MinTime) / 1000)
		c.metrics.Samples.WithLabelValues(fmt.Sprintf("%d", currentLevel)).Observe(float64(m.Stats.NumSamples))
		size := float64(0)
		for _, f := range m.Files {
			size += float64(f.SizeBytes)
		}
		c.metrics.Size.WithLabelValues(fmt.Sprintf("%d", currentLevel)).Observe(size)
	}
	result := make([]ulid.ULID, len(metas))
	for i := range metas {
		result[i] = metas[i].ULID
	}
	return result, nil
}

// runCompactionJob plans and runs a single compaction against the provided job. The compacted result
// is uploaded into the bucket the blocks were retrieved from.
func (c *BucketCompactor) runCompactionJob(ctx context.Context, job *Job) (shouldRerun bool, compIDs []ulid.ULID, rerr error) {
	jobBeginTime := time.Now()

	jobLogger := log.With(c.logger, "groupKey", job.Key())
	// Per-job scratch directory; removed unconditionally on exit (below).
	subDir := filepath.Join(c.compactDir, job.Key())

	defer func() {
		elapsed := time.Since(jobBeginTime)

		if rerr == nil {
			level.Info(jobLogger).Log("msg", "compaction job succeeded", "duration", elapsed, "duration_ms", elapsed.Milliseconds())
		} else {
			level.Error(jobLogger).Log("msg", "compaction job failed", "duration", elapsed, "duration_ms", elapsed.Milliseconds(), "err", rerr)
		}

		if err := os.RemoveAll(subDir); err != nil {
			level.Error(jobLogger).Log("msg", "failed to remove compaction group work directory", "path", subDir, "err", err)
		}
	}()

	if err := os.MkdirAll(subDir, 0o750); err != nil {
		return false, nil, errors.Wrap(err, "create compaction job dir")
	}

	toCompact, err := c.planner.Plan(ctx, job.metasByMinTime)
	if err != nil {
		return false, nil, errors.Wrap(err, "plan compaction")
	}
	if len(toCompact) == 0 {
		// Nothing to do.
		return false, nil, nil
	}

	// The planner returned some blocks to compact, so we can enrich the logger
	// with the min/max time between all blocks to compact.
	toCompactMinTime := minTime(toCompact)
	toCompactMaxTime := maxTime(toCompact)
	jobLogger = log.With(jobLogger, "minTime", toCompactMinTime.String(), "maxTime", toCompactMaxTime.String())

	level.Info(jobLogger).Log("msg", "compaction available and planned; downloading blocks", "blocks", len(toCompact), "plan", fmt.Sprintf("%v", toCompact))

	sp, ctx := opentracing.StartSpanFromContext(ctx, "CompactJob",
		opentracing.Tag{Key: "GroupKey", Value: job.Key()},
		opentracing.Tag{Key: "Job", Value: job.String()},
		opentracing.Tag{Key: "Labels", Value: job.Labels().String()},
		opentracing.Tag{Key: "MinCompactionLevel", Value: job.MinCompactionLevel()},
		opentracing.Tag{Key: "Resolution", Value: job.Resolution()},
		opentracing.Tag{Key: "ShardKey", Value: job.ShardingKey()},
		opentracing.Tag{Key: "SplitStageSize", Value: job.SplitStageSize()},
		opentracing.Tag{Key: "UseSplitting", Value: job.UseSplitting()},
		opentracing.Tag{Key: "SplittingShards", Value: job.SplittingShards()},
		opentracing.Tag{Key: "BlockCount", Value: len(toCompact)},
	)
	defer sp.Finish()

	blocksToCompactDirs := make([]string, len(toCompact))
	// Once we have a plan we need to download the actual data.
	downloadBegin := time.Now()

	// Download all planned blocks concurrently into subDir.
	err = func() error {
		sp, ctx := opentracing.StartSpanFromContext(ctx, "DownloadBlocks")
		defer func() {
			elapsed := time.Since(downloadBegin)
			level.Info(jobLogger).Log("msg", "downloaded and verified blocks; compacting blocks", "blocks", len(blocksToCompactDirs), "plan", fmt.Sprintf("%v", blocksToCompactDirs), "duration", elapsed, "duration_ms", elapsed.Milliseconds())
			sp.Finish()
		}()

		if err := concurrency.ForEachJob(ctx, len(toCompact), c.blockSyncConcurrency, func(ctx context.Context, idx int) error {
			meta := toCompact[idx]
			// Must be the same as in blocksToCompactDirs.
			bdir := filepath.Join(subDir, meta.ULID.String())
			if err := block.Download(ctx, jobLogger, c.bkt, meta.ULID, bdir); err != nil {
				return errors.Wrapf(err, "download block %s", meta.ULID)
			}

			return nil
		}); err != nil {
			return err
		}

		// Populate the dir list only after all downloads succeeded.
		for ix, meta := range toCompact {
			blocksToCompactDirs[ix] = filepath.Join(subDir, meta.ULID.String())
		}
		return nil
	}()
	if err != nil {
		ext.LogError(sp, err)
		return false, nil, err
	}

	// Run the actual compaction; a shard/stage count of (1, 0) means "merge
	// without splitting".
	err = func() error {
		sp, ctx := opentracing.StartSpanFromContext(ctx, "CompactBlocks")
		compactionBegin := time.Now()
		defer func() {
			sp.Finish()
			elapsed := time.Since(compactionBegin)
			level.Info(jobLogger).Log("msg", "compacted blocks", "new", fmt.Sprintf("%v", compIDs), "blocks", fmt.Sprintf("%v", blocksToCompactDirs), "duration", elapsed, "duration_ms", elapsed.Milliseconds())
		}()
		if job.UseSplitting() {
			compIDs, err = c.comp.CompactWithSplitting(ctx, subDir, blocksToCompactDirs, uint64(job.SplittingShards()), uint64(job.SplitStageSize()))
		} else {
			compIDs, err = c.comp.CompactWithSplitting(ctx, subDir, blocksToCompactDirs, 1, 0)
		}
		outputDirs := make([]string, len(compIDs))
		for i, id := range compIDs {
			outputDirs[i] = filepath.Join(subDir, id.String())
		}
		sp.SetTag("input_dirs", blocksToCompactDirs)
		sp.SetTag("output_dirs", outputDirs)
		return err
	}()
	if err != nil {
		ext.LogError(sp, err)
		return false, nil, errors.Wrapf(err, "compact blocks %v", blocksToCompactDirs)
	}

	// Sanity-check the output blocks' time ranges against the inputs before
	// uploading anything.
	if err = verifyCompactedBlocksTimeRanges(compIDs, toCompactMinTime.UnixMilli(), toCompactMaxTime.UnixMilli(), subDir); err != nil {
		level.Error(jobLogger).Log("msg", "compacted blocks verification failed", "err", err)
		c.metrics.compactionBlocksVerificationFailed.Inc()
		return false, nil, err
	}

	// Spawn a new context so we always finish uploading and marking a block for deletion in full on shutdown.
	ctx, cancel := context.WithTimeout(context.Background(), 20*time.Minute)
	ctx = opentracing.ContextWithSpan(ctx, sp)
	defer cancel()

	err = func() error {
		sp, ctx := opentracing.StartSpanFromContext(ctx, "Uploading blocks", opentracing.Tag{Key: "count", Value: len(compIDs)})
		uploadBegin := time.Now()
		uploadedBlocks := atomic.NewInt64(0)
		defer func() {
			elapsed := time.Since(uploadBegin)
			level.Info(jobLogger).Log("msg", "uploaded all blocks", "blocks", uploadedBlocks, "duration", elapsed, "duration_ms", elapsed.Milliseconds())
			sp.Finish()
		}()
		return concurrency.ForEachJob(ctx, len(compIDs), c.blockSyncConcurrency, func(ctx context.Context, idx int) error {
			ulidToUpload := compIDs[idx]

			// NOTE(review): counted at the start of the attempt, so the final
			// log reflects attempted uploads even when one fails.
			uploadedBlocks.Inc()

			bdir := filepath.Join(subDir, ulidToUpload.String())

			newMeta, err := block.ReadMetaFromDir(bdir)
			if err != nil {
				return errors.Wrapf(err, "failed to read meta the block dir %s", bdir)
			}

			// Ensure the compacted block is valid.
			if err := phlaredb.ValidateLocalBlock(ctx, bdir); err != nil {
				return errors.Wrapf(err, "invalid result block %s", bdir)
			}

			begin := time.Now()
			if err := block.Upload(ctx, jobLogger, c.bkt, bdir); err != nil {
				return errors.Wrapf(err, "upload of %s failed", ulidToUpload)
			}

			elapsed := time.Since(begin)
			level.Info(jobLogger).Log("msg", "uploaded block", "result_block", ulidToUpload, "duration", elapsed, "duration_ms", elapsed.Milliseconds(), "labels", labels.FromMap(newMeta.Labels))
			return nil
		})
	}()

	if err != nil {
		ext.LogError(sp, err)
		return false, nil, err
	}

	// Re-assigns the outer sp/ctx; the earlier CompactJob span has its own
	// deferred Finish.
	sp, ctx = opentracing.StartSpanFromContext(ctx, "Deleting blocks", opentracing.Tag{Key: "count", Value: len(compIDs)})
	defer sp.Finish()
	// Mark for deletion the blocks we just compacted from the job and bucket so they do not get included
	// into the next planning cycle.
	// Eventually the block we just uploaded should get synced into the job again (including sync-delay).
	for _, meta := range toCompact {
		if err := deleteBlock(ctx, c.bkt, meta.ULID, filepath.Join(subDir, meta.ULID.String()), jobLogger, c.metrics.blocksMarkedForDeletion); err != nil {
			return false, nil, errors.Wrapf(err, "mark old block for deletion from bucket")
		}
	}

	// shouldRerun=true: the group may have more work after the new blocks sync.
	return true, compIDs, nil
}

// verifyCompactedBlocksTimeRanges does a full run over the compacted blocks
// and verifies that they satisfy the min/maxTime from the source blocks
func verifyCompactedBlocksTimeRanges(compIDs []ulid.ULID, sourceBlocksMinTime, sourceBlocksMaxTime int64, subDir string) error {
	sourceBlocksMinTimeFound := false
	sourceBlocksMaxTimeFound := false

	for _, compID := range compIDs {
		// Skip empty block
		if compID == (ulid.ULID{}) {
			continue
		}

		bdir := filepath.Join(subDir, compID.String())
		meta, err := block.ReadMetaFromDir(bdir)
		if err != nil {
			return errors.Wrapf(err, "failed to read meta.json from %s during block time range verification", bdir)
		}

		// Ensure compacted block min/maxTime within source blocks min/maxTime
		if int64(meta.MinTime) < sourceBlocksMinTime {
			return fmt.Errorf("invalid minTime for block %s, compacted block minTime %d is before source minTime %d", compID.String(), meta.MinTime, sourceBlocksMinTime)
		}

		if int64(meta.MaxTime) > sourceBlocksMaxTime {
			return fmt.Errorf("invalid maxTime for block %s, compacted block maxTime %d is after source maxTime %d", compID.String(), meta.MaxTime, sourceBlocksMaxTime)
		}

		if int64(meta.MinTime) == sourceBlocksMinTime {
			sourceBlocksMinTimeFound = true
		}

		if int64(meta.MaxTime) == sourceBlocksMaxTime {
			sourceBlocksMaxTimeFound = true
		}
	}

	// Check that the minTime and maxTime from the source blocks
	// are found at least once in the compacted blocks
	if !sourceBlocksMinTimeFound || !sourceBlocksMaxTimeFound {
		return fmt.Errorf("compacted block(s) do not contain minTime %d and maxTime %d from the source blocks", sourceBlocksMinTime, sourceBlocksMaxTime)
	}

	return nil
}

// deleteBlock removes the local copy of the block and marks the remote copy
// for deletion in the bucket.
func deleteBlock(ctx context.Context, bkt objstore.Bucket, id ulid.ULID, bdir string, logger log.Logger, blocksMarkedForDeletion prometheus.Counter) error {
	if err := os.RemoveAll(bdir); err != nil {
		return errors.Wrapf(err, "remove old block dir %s", id)
	}
	level.Info(logger).Log("msg", "marking compacted block for deletion", "old_block", id)
	if err := block.MarkForDeletion(ctx, logger, bkt, id, "source of compacted block", true, blocksMarkedForDeletion); err != nil {
		return errors.Wrapf(err, "mark block %s for deletion from bucket", id)
	}
	return nil
}

// BucketCompactorMetrics holds the metrics tracked by BucketCompactor.
type BucketCompactorMetrics struct {
	groupCompactionRunsStarted         prometheus.Counter
	groupCompactionRunsCompleted       prometheus.Counter
	groupCompactionRunsFailed          prometheus.Counter
	groupCompactions                   prometheus.Counter
	compactionBlocksVerificationFailed prometheus.Counter
	blocksMarkedForDeletion            prometheus.Counter
	blocksMarkedForNoCompact           prometheus.Counter
	blocksMaxTimeDelta                 prometheus.Histogram
}

// NewBucketCompactorMetrics makes a new BucketCompactorMetrics.
func NewBucketCompactorMetrics(blocksMarkedForDeletion prometheus.Counter, reg prometheus.Registerer) *BucketCompactorMetrics {
	return &BucketCompactorMetrics{
		groupCompactionRunsStarted: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "pyroscope_compactor_group_compaction_runs_started_total",
			Help: "Total number of group compaction attempts.",
		}),
		groupCompactionRunsCompleted: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "pyroscope_compactor_group_compaction_runs_completed_total",
			Help: "Total number of group completed compaction runs. This also includes compactor group runs that resulted with no compaction.",
		}),
		groupCompactionRunsFailed: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "pyroscope_compactor_group_compactions_failures_total",
			Help: "Total number of failed group compactions.",
		}),
		groupCompactions: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "pyroscope_compactor_group_compactions_total",
			Help: "Total number of group compaction attempts that resulted in new block(s).",
		}),
		compactionBlocksVerificationFailed: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "pyroscope_compactor_blocks_verification_failures_total",
			Help: "Total number of failures when verifying min/max time ranges of compacted blocks.",
		}),
		blocksMarkedForDeletion: blocksMarkedForDeletion,
		blocksMarkedForNoCompact: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name:        "pyroscope_compactor_blocks_marked_for_no_compaction_total",
			Help:        "Total number of blocks that were marked for no-compaction.",
			ConstLabels: prometheus.Labels{"reason": block.OutOfOrderChunksNoCompactReason},
		}),
		blocksMaxTimeDelta: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
			Name:    "pyroscope_compactor_block_max_time_delta_seconds",
			Help:    "Difference between now and the max time of a block being compacted in seconds.",
			Buckets: prometheus.LinearBuckets(86400, 43200, 8), // 1 to 5 days, in 12 hour intervals
		}),
	}
}

// ownCompactionJobFunc decides whether this compactor instance owns a job.
type ownCompactionJobFunc func(job *Job) (bool, error)

// ownAllJobs is a ownCompactionJobFunc that always return true.
var ownAllJobs = func(job *Job) (bool, error) {
	return true, nil
}

// BucketCompactor compacts blocks in a bucket.
type BucketCompactor struct {
	logger               log.Logger
	sy                   *Syncer
	grouper              Grouper
	planner              Planner
	comp                 Compactor
	compactDir           string
	bkt                  objstore.Bucket
	concurrency          int
	ownJob               ownCompactionJobFunc
	sortJobs             JobsOrderFunc
	waitPeriod           time.Duration
	blockSyncConcurrency int
	metrics              *BucketCompactorMetrics
}

// NewBucketCompactor creates a new bucket compactor.
// concurrency must be > 0; all other dependencies are stored as given.
func NewBucketCompactor(
	logger log.Logger,
	sy *Syncer,
	grouper Grouper,
	planner Planner,
	comp Compactor,
	compactDir string,
	bkt objstore.Bucket,
	concurrency int,
	ownJob ownCompactionJobFunc,
	sortJobs JobsOrderFunc,
	waitPeriod time.Duration,
	blockSyncConcurrency int,
	metrics *BucketCompactorMetrics,
) (*BucketCompactor, error) {
	if concurrency <= 0 {
		return nil, errors.Errorf("invalid concurrency level (%d), concurrency level must be > 0", concurrency)
	}
	return &BucketCompactor{
		logger:               logger,
		sy:                   sy,
		grouper:              grouper,
		planner:              planner,
		comp:                 comp,
		compactDir:           compactDir,
		bkt:                  bkt,
		concurrency:          concurrency,
		ownJob:               ownJob,
		sortJobs:             sortJobs,
		waitPeriod:           waitPeriod,
		blockSyncConcurrency: blockSyncConcurrency,
		metrics:              metrics,
	}, nil
}

// Compact runs compaction over bucket.
// If maxCompactionTime is positive then after this time no more new compactions are started.
func (c *BucketCompactor) Compact(ctx context.Context, maxCompactionTime time.Duration) (rerr error) {
	sp := opentracing.SpanFromContext(ctx)
	if sp == nil {
		sp, ctx = opentracing.StartSpanFromContext(ctx, "Compact")
	}
	sp.SetTag("max_compaction_time", maxCompactionTime)
	sp.SetTag("concurrency", c.concurrency)
	defer func() {
		// Do not remove the compactDir if an error has occurred
		// because potentially on the next run we would not have to download
		// everything again.
		if rerr != nil {
			return
		}
		if err := os.RemoveAll(c.compactDir); err != nil {
			level.Error(c.logger).Log("msg", "failed to remove compaction work directory", "path", c.compactDir, "err", err)
		}
	}()

	// A nil channel blocks forever in select, so no deadline is armed when
	// maxCompactionTime <= 0.
	var maxCompactionTimeChan <-chan time.Time
	if maxCompactionTime > 0 {
		maxCompactionTimeChan = time.After(maxCompactionTime)
	}

	// Loop over bucket and compact until there's no work left.
	for {
		var (
			wg                     sync.WaitGroup
			workCtx, workCtxCancel = context.WithCancel(ctx)
			jobChan                = make(chan *Job)
			errChan                = make(chan error, c.concurrency)
			finishedAllJobs        = true
			mtx                    sync.Mutex
		)
		// NOTE(review): deferred inside the loop, so one cancel func
		// accumulates per iteration until Compact returns; each iteration
		// also cancels explicitly at its end, the defer only covers early
		// returns.
		defer workCtxCancel()

		// Set up workers who will compact the jobs when the jobs are ready.
		// They will compact available jobs until they encounter an error, after which they will stop.
		for i := 0; i < c.concurrency; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				for g := range jobChan {
					// Ensure the job is still owned by the current compactor instance.
					// If not, we shouldn't run it because another compactor instance may already
					// process it (or will do it soon).
					if ok, err := c.ownJob(g); err != nil {
						level.Info(c.logger).Log("msg", "skipped compaction because unable to check whether the job is owned by the compactor instance", "groupKey", g.Key(), "err", err)
						continue
					} else if !ok {
						level.Info(c.logger).Log("msg", "skipped compaction because job is not owned by the compactor instance anymore", "groupKey", g.Key())
						continue
					}

					c.metrics.groupCompactionRunsStarted.Inc()

					shouldRerunJob, compactedBlockIDs, err := c.runCompactionJob(workCtx, g)
					if err == nil {
						c.metrics.groupCompactionRunsCompleted.Inc()
						if hasNonZeroULIDs(compactedBlockIDs) {
							c.metrics.groupCompactions.Inc()
						}

						// Another pass is needed for this group; mtx guards
						// finishedAllJobs across workers.
						if shouldRerunJob {
							mtx.Lock()
							finishedAllJobs = false
							mtx.Unlock()
						}
						continue
					}

					// At this point the compaction has failed.
					c.metrics.groupCompactionRunsFailed.Inc()

					// errChan is buffered with c.concurrency, so this send
					// never blocks; the worker then stops.
					errChan <- errors.Wrapf(err, "group %s", g.Key())
					return
				}
			}()
		}

		level.Info(c.logger).Log("msg", "start sync of metas")
		if err := c.sy.SyncMetas(ctx); err != nil {
			ext.LogError(sp, err)
			return errors.Wrap(err, "sync")
		}

		level.Info(c.logger).Log("msg", "start of GC")
		// Blocks that were compacted are garbage collected after each Compaction.
		// However if compactor crashes we need to resolve those on startup.
		if err := c.sy.GarbageCollect(ctx); err != nil {
			ext.LogError(sp, err)
			return errors.Wrap(err, "blocks garbage collect")
		}

		jobs, err := c.grouper.Groups(c.sy.Metas())
		if err != nil {
			ext.LogError(sp, err)
			return errors.Wrap(err, "build compaction jobs")
		}
		sp.LogKV("discovered_jobs", len(jobs))

		// There is another check just before we start processing the job, but we can avoid sending it
		// to the goroutine in the first place.
878 jobs, err = c.filterOwnJobs(jobs) 879 if err != nil { 880 return err 881 } 882 sp.LogKV("own_jobs", len(jobs)) 883 884 // Record the difference between now and the max time for a block being compacted. This 885 // is used to detect compactors not being able to keep up with the rate of blocks being 886 // created. The idea is that most blocks should be for within 24h or 48h. 887 now := time.Now() 888 for _, delta := range c.blockMaxTimeDeltas(now, jobs) { 889 c.metrics.blocksMaxTimeDelta.Observe(delta) 890 } 891 892 // Skip jobs for which the wait period hasn't been honored yet. 893 jobs = c.filterJobsByWaitPeriod(ctx, jobs) 894 sp.LogKV("filtered_jobs", len(jobs)) 895 896 // Sort jobs based on the configured ordering algorithm. 897 jobs = c.sortJobs(jobs) 898 899 ignoreDirs := []string{} 900 for _, gr := range jobs { 901 for _, grID := range gr.IDs() { 902 ignoreDirs = append(ignoreDirs, filepath.Join(gr.Key(), grID.String())) 903 } 904 } 905 906 if err := runutil.DeleteAll(c.compactDir, ignoreDirs...); err != nil { 907 level.Warn(c.logger).Log("msg", "failed deleting non-compaction job directories/files, some disk space usage might have leaked. Continuing", "err", err, "dir", c.compactDir) 908 } 909 910 level.Info(c.logger).Log("msg", "start of compactions") 911 912 maxCompactionTimeReached := false 913 // Send all jobs found during this pass to the compaction workers. 
914 var jobErrs multierror.MultiError 915 jobLoop: 916 for _, g := range jobs { 917 select { 918 case jobErr := <-errChan: 919 ext.LogError(sp, jobErr) 920 jobErrs.Add(jobErr) 921 break jobLoop 922 case jobChan <- g: 923 case <-maxCompactionTimeChan: 924 maxCompactionTimeReached = true 925 level.Info(c.logger).Log("msg", "max compaction time reached, no more compactions will be started") 926 sp.LogKV("msg", "max compaction time reached, no more compactions will be started") 927 break jobLoop 928 } 929 } 930 close(jobChan) 931 wg.Wait() 932 933 // Collect any other error reported by the workers, or any error reported 934 // while we were waiting for the last batch of jobs to run the compaction. 935 close(errChan) 936 for jobErr := range errChan { 937 jobErrs.Add(jobErr) 938 } 939 940 workCtxCancel() 941 if len(jobErrs) > 0 { 942 return jobErrs.Err() 943 } 944 945 if maxCompactionTimeReached || finishedAllJobs { 946 break 947 } 948 } 949 level.Info(c.logger).Log("msg", "compaction iterations done") 950 return nil 951 } 952 953 // blockMaxTimeDeltas returns a slice of the difference between now and the MaxTime of each 954 // block that will be compacted as part of the provided jobs, in seconds. 955 func (c *BucketCompactor) blockMaxTimeDeltas(now time.Time, jobs []*Job) []float64 { 956 var out []float64 957 958 for _, j := range jobs { 959 for _, m := range j.Metas() { 960 out = append(out, now.Sub(time.UnixMilli(int64(m.MaxTime))).Seconds()) 961 } 962 } 963 964 return out 965 } 966 967 func (c *BucketCompactor) filterOwnJobs(jobs []*Job) ([]*Job, error) { 968 for ix := 0; ix < len(jobs); { 969 // Skip any job which doesn't belong to this compactor instance. 970 if ok, err := c.ownJob(jobs[ix]); err != nil { 971 return nil, errors.Wrap(err, "ownJob") 972 } else if !ok { 973 jobs = append(jobs[:ix], jobs[ix+1:]...) 
974 } else { 975 ix++ 976 } 977 } 978 return jobs, nil 979 } 980 981 // filterJobsByWaitPeriod filters out jobs for which the configured wait period hasn't been honored yet. 982 func (c *BucketCompactor) filterJobsByWaitPeriod(ctx context.Context, jobs []*Job) []*Job { 983 for i := 0; i < len(jobs); { 984 if elapsed, notElapsedBlock, err := jobWaitPeriodElapsed(ctx, jobs[i], c.waitPeriod, c.bkt); err != nil { 985 level.Warn(c.logger).Log("msg", "not enforcing compaction wait period because the check if compaction job contains recently uploaded blocks has failed", "groupKey", jobs[i].Key(), "err", err) 986 987 // Keep the job. 988 i++ 989 } else if !elapsed { 990 level.Info(c.logger).Log("msg", "skipping compaction job because blocks in this job were uploaded too recently (within wait period)", "groupKey", jobs[i].Key(), "waitPeriodNotElapsedFor", notElapsedBlock.String()) 991 jobs = append(jobs[:i], jobs[i+1:]...) 992 } else { 993 i++ 994 } 995 } 996 997 return jobs 998 } 999 1000 var _ block.MetadataFilter = &NoCompactionMarkFilter{} 1001 1002 // NoCompactionMarkFilter is a block.Fetcher filter that finds all blocks with no-compact marker files, and optionally 1003 // removes them from synced metas. 1004 type NoCompactionMarkFilter struct { 1005 bkt objstore.BucketReader 1006 noCompactMarkedMap map[ulid.ULID]struct{} 1007 removeNoCompactBlocks bool 1008 } 1009 1010 // NewNoCompactionMarkFilter creates NoCompactionMarkFilter. 1011 func NewNoCompactionMarkFilter(bkt objstore.BucketReader, removeNoCompactBlocks bool) *NoCompactionMarkFilter { 1012 return &NoCompactionMarkFilter{ 1013 bkt: bkt, 1014 removeNoCompactBlocks: removeNoCompactBlocks, 1015 } 1016 } 1017 1018 // NoCompactMarkedBlocks returns block ids that were marked for no compaction. 1019 // It is safe to call this method only after Filter has finished, and it is also safe to manipulate the map between calls to Filter. 
1020 func (f *NoCompactionMarkFilter) NoCompactMarkedBlocks() map[ulid.ULID]struct{} { 1021 return f.noCompactMarkedMap 1022 } 1023 1024 // Filter finds blocks that should not be compacted, and fills f.noCompactMarkedMap. If f.removeNoCompactBlocks is true, 1025 // blocks are also removed from metas. (Thanos version of the filter doesn't do removal). 1026 func (f *NoCompactionMarkFilter) Filter(ctx context.Context, metas map[ulid.ULID]*block.Meta, synced block.GaugeVec) error { 1027 noCompactMarkedMap := make(map[ulid.ULID]struct{}) 1028 1029 // Find all no-compact markers in the storage. 1030 err := f.bkt.Iter(ctx, block.MarkersPathname+"/", func(name string) error { 1031 if err := ctx.Err(); err != nil { 1032 return err 1033 } 1034 1035 if blockID, ok := block.IsNoCompactMarkFilename(path.Base(name)); ok { 1036 _, exists := metas[blockID] 1037 if exists { 1038 noCompactMarkedMap[blockID] = struct{}{} 1039 synced.WithLabelValues(block.MarkedForNoCompactionMeta).Inc() 1040 1041 if f.removeNoCompactBlocks { 1042 delete(metas, blockID) 1043 } 1044 } 1045 1046 } 1047 return nil 1048 }) 1049 if err != nil { 1050 return errors.Wrap(err, "list block no-compact marks") 1051 } 1052 1053 f.noCompactMarkedMap = noCompactMarkedMap 1054 return nil 1055 } 1056 1057 func hasNonZeroULIDs(ids []ulid.ULID) bool { 1058 for _, id := range ids { 1059 if id != (ulid.ULID{}) { 1060 return true 1061 } 1062 } 1063 1064 return false 1065 }