github.com/thanos-io/thanos@v0.32.5/pkg/compact/compact.go (about) 1 // Copyright (c) The Thanos Authors. 2 // Licensed under the Apache License 2.0. 3 4 package compact 5 6 import ( 7 "context" 8 "fmt" 9 "math" 10 "os" 11 "path/filepath" 12 "sort" 13 "sync" 14 "time" 15 16 "github.com/go-kit/log" 17 "github.com/go-kit/log/level" 18 "github.com/oklog/ulid" 19 "github.com/opentracing/opentracing-go" 20 "github.com/pkg/errors" 21 "github.com/prometheus/client_golang/prometheus" 22 "github.com/prometheus/client_golang/prometheus/promauto" 23 "github.com/prometheus/prometheus/model/labels" 24 "github.com/prometheus/prometheus/tsdb" 25 "github.com/thanos-io/objstore" 26 "golang.org/x/sync/errgroup" 27 28 "github.com/thanos-io/thanos/pkg/block" 29 "github.com/thanos-io/thanos/pkg/block/metadata" 30 "github.com/thanos-io/thanos/pkg/compact/downsample" 31 "github.com/thanos-io/thanos/pkg/errutil" 32 "github.com/thanos-io/thanos/pkg/runutil" 33 "github.com/thanos-io/thanos/pkg/tracing" 34 ) 35 36 type ResolutionLevel int64 37 38 const ( 39 ResolutionLevelRaw = ResolutionLevel(downsample.ResLevel0) 40 ResolutionLevel5m = ResolutionLevel(downsample.ResLevel1) 41 ResolutionLevel1h = ResolutionLevel(downsample.ResLevel2) 42 ) 43 44 const ( 45 // DedupAlgorithmPenalty is the penalty based compactor series merge algorithm. 46 // This is the same as the online deduplication of querier except counter reset handling. 47 DedupAlgorithmPenalty = "penalty" 48 ) 49 50 // Syncer synchronizes block metas from a bucket into a local directory. 51 // It sorts them into compaction groups based on equal label sets. 52 type Syncer struct { 53 logger log.Logger 54 reg prometheus.Registerer 55 bkt objstore.Bucket 56 fetcher block.MetadataFetcher 57 mtx sync.Mutex 58 blocks map[ulid.ULID]*metadata.Meta 59 partial map[ulid.ULID]error 60 metrics *syncerMetrics 61 duplicateBlocksFilter block.DeduplicateFilter 62 ignoreDeletionMarkFilter *block.IgnoreDeletionMarkFilter 63 } 64 65 type syncerMetrics struct { 66 garbageCollectedBlocks prometheus.Counter 67 garbageCollections prometheus.Counter 68 garbageCollectionFailures prometheus.Counter 69 garbageCollectionDuration prometheus.Histogram 70 blocksMarkedForDeletion prometheus.Counter 71 } 72 73 func newSyncerMetrics(reg prometheus.Registerer, blocksMarkedForDeletion, garbageCollectedBlocks prometheus.Counter) *syncerMetrics { 74 var m syncerMetrics 75 76 m.garbageCollectedBlocks = garbageCollectedBlocks 77 m.garbageCollections = promauto.With(reg).NewCounter(prometheus.CounterOpts{ 78 Name: "thanos_compact_garbage_collection_total", 79 Help: "Total number of garbage collection operations.", 80 }) 81 m.garbageCollectionFailures = promauto.With(reg).NewCounter(prometheus.CounterOpts{ 82 Name: "thanos_compact_garbage_collection_failures_total", 83 Help: "Total number of failed garbage collection operations.", 84 }) 85 m.garbageCollectionDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ 86 Name: "thanos_compact_garbage_collection_duration_seconds", 87 Help: "Time it took to perform garbage collection iteration.", 88 Buckets: []float64{0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120, 240, 360, 720}, 89 }) 90 91 m.blocksMarkedForDeletion = blocksMarkedForDeletion 92 93 return &m 94 } 95 96 // NewMetaSyncer returns a new Syncer for the given Bucket and directory. 97 // Blocks must be at least as old as the sync delay for being considered. 
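// The duplicateBlocksFilter and ignoreDeletionMarkFilter are expected to be the same filter instances
// that the fetcher applies during Fetch: GarbageCollect below only reads their DuplicateIDs and
// DeletionMarkBlocks results, which is why a SyncMetas call has to happen before garbage collection.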
98 func NewMetaSyncer(logger log.Logger, reg prometheus.Registerer, bkt objstore.Bucket, fetcher block.MetadataFetcher, duplicateBlocksFilter block.DeduplicateFilter, ignoreDeletionMarkFilter *block.IgnoreDeletionMarkFilter, blocksMarkedForDeletion, garbageCollectedBlocks prometheus.Counter) (*Syncer, error) { 99 if logger == nil { 100 logger = log.NewNopLogger() 101 } 102 return &Syncer{ 103 logger: logger, 104 reg: reg, 105 bkt: bkt, 106 fetcher: fetcher, 107 blocks: map[ulid.ULID]*metadata.Meta{}, 108 metrics: newSyncerMetrics(reg, blocksMarkedForDeletion, garbageCollectedBlocks), 109 duplicateBlocksFilter: duplicateBlocksFilter, 110 ignoreDeletionMarkFilter: ignoreDeletionMarkFilter, 111 }, nil 112 } 113 114 // UntilNextDownsampling calculates how long it will take until the next downsampling operation. 115 // Returns an error if there will be no downsampling. 116 func UntilNextDownsampling(m *metadata.Meta) (time.Duration, error) { 117 timeRange := time.Duration((m.MaxTime - m.MinTime) * int64(time.Millisecond)) 118 switch m.Thanos.Downsample.Resolution { 119 case downsample.ResLevel2: 120 return time.Duration(0), errors.New("no downsampling") 121 case downsample.ResLevel1: 122 return time.Duration(downsample.ResLevel2DownsampleRange*time.Millisecond) - timeRange, nil 123 case downsample.ResLevel0: 124 return time.Duration(downsample.ResLevel1DownsampleRange*time.Millisecond) - timeRange, nil 125 default: 126 panic(errors.Errorf("invalid resolution %v", m.Thanos.Downsample.Resolution)) 127 } 128 } 129 130 // SyncMetas synchronizes local state of block metas with what we have in the bucket. 131 func (s *Syncer) SyncMetas(ctx context.Context) error { 132 s.mtx.Lock() 133 defer s.mtx.Unlock() 134 135 metas, partial, err := s.fetcher.Fetch(ctx) 136 if err != nil { 137 return retry(err) 138 } 139 s.blocks = metas 140 s.partial = partial 141 return nil 142 } 143 144 // Partial returns partial blocks since last sync. 145 func (s *Syncer) Partial() map[ulid.ULID]error { 146 s.mtx.Lock() 147 defer s.mtx.Unlock() 148 149 return s.partial 150 } 151 152 // Metas returns loaded metadata blocks since last sync. 153 func (s *Syncer) Metas() map[ulid.ULID]*metadata.Meta { 154 s.mtx.Lock() 155 defer s.mtx.Unlock() 156 157 return s.blocks 158 } 159 160 // GarbageCollect marks blocks for deletion from bucket if their data is available as part of a 161 // block with a higher compaction level. 162 // Call to SyncMetas function is required to populate duplicateIDs in duplicateBlocksFilter. 163 func (s *Syncer) GarbageCollect(ctx context.Context) error { 164 s.mtx.Lock() 165 defer s.mtx.Unlock() 166 167 begin := time.Now() 168 169 // Ignore filter exists before deduplicate filter. 170 deletionMarkMap := s.ignoreDeletionMarkFilter.DeletionMarkBlocks() 171 duplicateIDs := s.duplicateBlocksFilter.DuplicateIDs() 172 173 // GarbageIDs contains the duplicateIDs, since these blocks can be replaced with other blocks. 174 // We also remove ids present in deletionMarkMap since these blocks are already marked for deletion. 175 garbageIDs := []ulid.ULID{} 176 for _, id := range duplicateIDs { 177 if _, exists := deletionMarkMap[id]; exists { 178 continue 179 } 180 garbageIDs = append(garbageIDs, id) 181 } 182 183 for _, id := range garbageIDs { 184 if ctx.Err() != nil { 185 return ctx.Err() 186 } 187 188 // Spawn a new context so we always mark a block for deletion in full on shutdown. 
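// Note that the caller's ctx is only consulted at the top of each loop iteration; the marker upload
// itself runs under the detached delCtx below, so an in-flight mark gets up to five minutes to
// complete even if the compactor is shutting down.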
189 delCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) 190 191 level.Info(s.logger).Log("msg", "marking outdated block for deletion", "block", id) 192 err := block.MarkForDeletion(delCtx, s.logger, s.bkt, id, "outdated block", s.metrics.blocksMarkedForDeletion) 193 cancel() 194 if err != nil { 195 s.metrics.garbageCollectionFailures.Inc() 196 return retry(errors.Wrapf(err, "mark block %s for deletion", id)) 197 } 198 199 // Immediately update our in-memory state so no further call to SyncMetas is needed 200 // after running garbage collection. 201 delete(s.blocks, id) 202 s.metrics.garbageCollectedBlocks.Inc() 203 } 204 s.metrics.garbageCollections.Inc() 205 s.metrics.garbageCollectionDuration.Observe(time.Since(begin).Seconds()) 206 return nil 207 } 208 209 // Grouper is responsible to group all known blocks into sub groups which are safe to be 210 // compacted concurrently. 211 type Grouper interface { 212 // Groups returns the compaction groups for all blocks currently known to the syncer. 213 // It creates all groups from the scratch on every call. 214 Groups(blocks map[ulid.ULID]*metadata.Meta) (res []*Group, err error) 215 } 216 217 // DefaultGrouper is the Thanos built-in grouper. It groups blocks based on downsample 218 // resolution and block's labels. 219 type DefaultGrouper struct { 220 bkt objstore.Bucket 221 logger log.Logger 222 acceptMalformedIndex bool 223 enableVerticalCompaction bool 224 compactions *prometheus.CounterVec 225 compactionRunsStarted *prometheus.CounterVec 226 compactionRunsCompleted *prometheus.CounterVec 227 compactionFailures *prometheus.CounterVec 228 verticalCompactions *prometheus.CounterVec 229 garbageCollectedBlocks prometheus.Counter 230 blocksMarkedForDeletion prometheus.Counter 231 blocksMarkedForNoCompact prometheus.Counter 232 hashFunc metadata.HashFunc 233 blockFilesConcurrency int 234 compactBlocksFetchConcurrency int 235 } 236 237 // NewDefaultGrouper makes a new DefaultGrouper. 238 func NewDefaultGrouper( 239 logger log.Logger, 240 bkt objstore.Bucket, 241 acceptMalformedIndex bool, 242 enableVerticalCompaction bool, 243 reg prometheus.Registerer, 244 blocksMarkedForDeletion prometheus.Counter, 245 garbageCollectedBlocks prometheus.Counter, 246 blocksMarkedForNoCompact prometheus.Counter, 247 hashFunc metadata.HashFunc, 248 blockFilesConcurrency int, 249 compactBlocksFetchConcurrency int, 250 ) *DefaultGrouper { 251 return &DefaultGrouper{ 252 bkt: bkt, 253 logger: logger, 254 acceptMalformedIndex: acceptMalformedIndex, 255 enableVerticalCompaction: enableVerticalCompaction, 256 compactions: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 257 Name: "thanos_compact_group_compactions_total", 258 Help: "Total number of group compaction attempts that resulted in a new block.", 259 }, []string{"resolution"}), 260 compactionRunsStarted: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 261 Name: "thanos_compact_group_compaction_runs_started_total", 262 Help: "Total number of group compaction attempts.", 263 }, []string{"resolution"}), 264 compactionRunsCompleted: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 265 Name: "thanos_compact_group_compaction_runs_completed_total", 266 Help: "Total number of group completed compaction runs. 
This also includes compactor group runs that resulted with no compaction.", 267 }, []string{"resolution"}), 268 compactionFailures: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 269 Name: "thanos_compact_group_compactions_failures_total", 270 Help: "Total number of failed group compactions.", 271 }, []string{"resolution"}), 272 verticalCompactions: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 273 Name: "thanos_compact_group_vertical_compactions_total", 274 Help: "Total number of group compaction attempts that resulted in a new block based on overlapping blocks.", 275 }, []string{"resolution"}), 276 blocksMarkedForNoCompact: blocksMarkedForNoCompact, 277 garbageCollectedBlocks: garbageCollectedBlocks, 278 blocksMarkedForDeletion: blocksMarkedForDeletion, 279 hashFunc: hashFunc, 280 blockFilesConcurrency: blockFilesConcurrency, 281 compactBlocksFetchConcurrency: compactBlocksFetchConcurrency, 282 } 283 } 284 285 // Groups returns the compaction groups for all blocks currently known to the syncer. 286 // It creates all groups from the scratch on every call. 287 func (g *DefaultGrouper) Groups(blocks map[ulid.ULID]*metadata.Meta) (res []*Group, err error) { 288 groups := map[string]*Group{} 289 for _, m := range blocks { 290 groupKey := m.Thanos.GroupKey() 291 group, ok := groups[groupKey] 292 if !ok { 293 lbls := labels.FromMap(m.Thanos.Labels) 294 resolutionLabel := m.Thanos.ResolutionString() 295 group, err = NewGroup( 296 log.With(g.logger, "group", fmt.Sprintf("%s@%v", resolutionLabel, lbls.String()), "groupKey", groupKey), 297 g.bkt, 298 groupKey, 299 lbls, 300 m.Thanos.Downsample.Resolution, 301 g.acceptMalformedIndex, 302 g.enableVerticalCompaction, 303 g.compactions.WithLabelValues(resolutionLabel), 304 g.compactionRunsStarted.WithLabelValues(resolutionLabel), 305 g.compactionRunsCompleted.WithLabelValues(resolutionLabel), 306 g.compactionFailures.WithLabelValues(resolutionLabel), 307 g.verticalCompactions.WithLabelValues(resolutionLabel), 308 g.garbageCollectedBlocks, 309 g.blocksMarkedForDeletion, 310 g.blocksMarkedForNoCompact, 311 g.hashFunc, 312 g.blockFilesConcurrency, 313 g.compactBlocksFetchConcurrency, 314 ) 315 if err != nil { 316 return nil, errors.Wrap(err, "create compaction group") 317 } 318 groups[groupKey] = group 319 res = append(res, group) 320 } 321 if err := group.AppendMeta(m); err != nil { 322 return nil, errors.Wrap(err, "add compaction group") 323 } 324 } 325 sort.Slice(res, func(i, j int) bool { 326 return res[i].Key() < res[j].Key() 327 }) 328 return res, nil 329 } 330 331 // Group captures a set of blocks that have the same origin labels and downsampling resolution. 332 // Those blocks generally contain the same series and can thus efficiently be compacted. 333 type Group struct { 334 logger log.Logger 335 bkt objstore.Bucket 336 key string 337 labels labels.Labels 338 resolution int64 339 mtx sync.Mutex 340 metasByMinTime []*metadata.Meta 341 acceptMalformedIndex bool 342 enableVerticalCompaction bool 343 compactions prometheus.Counter 344 compactionRunsStarted prometheus.Counter 345 compactionRunsCompleted prometheus.Counter 346 compactionFailures prometheus.Counter 347 verticalCompactions prometheus.Counter 348 groupGarbageCollectedBlocks prometheus.Counter 349 blocksMarkedForDeletion prometheus.Counter 350 blocksMarkedForNoCompact prometheus.Counter 351 hashFunc metadata.HashFunc 352 blockFilesConcurrency int 353 compactBlocksFetchConcurrency int 354 extensions any 355 } 356 357 // NewGroup returns a new compaction group. 
358 func NewGroup( 359 logger log.Logger, 360 bkt objstore.Bucket, 361 key string, 362 lset labels.Labels, 363 resolution int64, 364 acceptMalformedIndex bool, 365 enableVerticalCompaction bool, 366 compactions prometheus.Counter, 367 compactionRunsStarted prometheus.Counter, 368 compactionRunsCompleted prometheus.Counter, 369 compactionFailures prometheus.Counter, 370 verticalCompactions prometheus.Counter, 371 groupGarbageCollectedBlocks prometheus.Counter, 372 blocksMarkedForDeletion prometheus.Counter, 373 blocksMarkedForNoCompact prometheus.Counter, 374 hashFunc metadata.HashFunc, 375 blockFilesConcurrency int, 376 compactBlocksFetchConcurrency int, 377 ) (*Group, error) { 378 if logger == nil { 379 logger = log.NewNopLogger() 380 } 381 382 if blockFilesConcurrency <= 0 { 383 return nil, errors.Errorf("invalid concurrency level (%d), blockFilesConcurrency level must be > 0", blockFilesConcurrency) 384 } 385 386 g := &Group{ 387 logger: logger, 388 bkt: bkt, 389 key: key, 390 labels: lset, 391 resolution: resolution, 392 acceptMalformedIndex: acceptMalformedIndex, 393 enableVerticalCompaction: enableVerticalCompaction, 394 compactions: compactions, 395 compactionRunsStarted: compactionRunsStarted, 396 compactionRunsCompleted: compactionRunsCompleted, 397 compactionFailures: compactionFailures, 398 verticalCompactions: verticalCompactions, 399 groupGarbageCollectedBlocks: groupGarbageCollectedBlocks, 400 blocksMarkedForDeletion: blocksMarkedForDeletion, 401 blocksMarkedForNoCompact: blocksMarkedForNoCompact, 402 hashFunc: hashFunc, 403 blockFilesConcurrency: blockFilesConcurrency, 404 compactBlocksFetchConcurrency: compactBlocksFetchConcurrency, 405 } 406 return g, nil 407 } 408 409 // Key returns an identifier for the group. 410 func (cg *Group) Key() string { 411 return cg.key 412 } 413 414 func (cg *Group) deleteFromGroup(target map[ulid.ULID]struct{}) { 415 cg.mtx.Lock() 416 defer cg.mtx.Unlock() 417 var newGroupMeta []*metadata.Meta 418 for _, meta := range cg.metasByMinTime { 419 if _, found := target[meta.BlockMeta.ULID]; !found { 420 newGroupMeta = append(newGroupMeta, meta) 421 } 422 } 423 424 cg.metasByMinTime = newGroupMeta 425 } 426 427 // AppendMeta the block with the given meta to the group. 428 func (cg *Group) AppendMeta(meta *metadata.Meta) error { 429 cg.mtx.Lock() 430 defer cg.mtx.Unlock() 431 432 if !labels.Equal(cg.labels, labels.FromMap(meta.Thanos.Labels)) { 433 return errors.New("block and group labels do not match") 434 } 435 if cg.resolution != meta.Thanos.Downsample.Resolution { 436 return errors.New("block and group resolution do not match") 437 } 438 439 cg.metasByMinTime = append(cg.metasByMinTime, meta) 440 sort.Slice(cg.metasByMinTime, func(i, j int) bool { 441 return cg.metasByMinTime[i].MinTime < cg.metasByMinTime[j].MinTime 442 }) 443 return nil 444 } 445 446 // IDs returns all sorted IDs of blocks in the group. 447 func (cg *Group) IDs() (ids []ulid.ULID) { 448 cg.mtx.Lock() 449 defer cg.mtx.Unlock() 450 451 for _, m := range cg.metasByMinTime { 452 ids = append(ids, m.ULID) 453 } 454 sort.Slice(ids, func(i, j int) bool { 455 return ids[i].Compare(ids[j]) < 0 456 }) 457 return ids 458 } 459 460 // MinTime returns the min time across all group's blocks. 461 func (cg *Group) MinTime() int64 { 462 cg.mtx.Lock() 463 defer cg.mtx.Unlock() 464 465 if len(cg.metasByMinTime) > 0 { 466 return cg.metasByMinTime[0].MinTime 467 } 468 return math.MaxInt64 469 } 470 471 // MaxTime returns the max time across all group's blocks. 
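// It returns math.MinInt64 when the group holds no blocks, mirroring MinTime above, which returns
// math.MaxInt64 for an empty group.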
472 func (cg *Group) MaxTime() int64 { 473 cg.mtx.Lock() 474 defer cg.mtx.Unlock() 475 476 max := int64(math.MinInt64) 477 for _, m := range cg.metasByMinTime { 478 if m.MaxTime > max { 479 max = m.MaxTime 480 } 481 } 482 return max 483 } 484 485 // Labels returns the labels that all blocks in the group share. 486 func (cg *Group) Labels() labels.Labels { 487 return cg.labels 488 } 489 490 // Resolution returns the common downsampling resolution of blocks in the group. 491 func (cg *Group) Resolution() int64 { 492 return cg.resolution 493 } 494 495 func (cg *Group) Extensions() any { 496 return cg.extensions 497 } 498 499 func (cg *Group) SetExtensions(extensions any) { 500 cg.extensions = extensions 501 } 502 503 // CompactProgressMetrics contains Prometheus metrics related to compaction progress. 504 type CompactProgressMetrics struct { 505 NumberOfCompactionRuns prometheus.Gauge 506 NumberOfCompactionBlocks prometheus.Gauge 507 } 508 509 // ProgressCalculator calculates the progress of the compaction process for a given slice of Groups. 510 type ProgressCalculator interface { 511 ProgressCalculate(ctx context.Context, groups []*Group) error 512 } 513 514 // CompactionProgressCalculator contains a planner and ProgressMetrics, which are updated during the compaction simulation process. 515 type CompactionProgressCalculator struct { 516 planner Planner 517 *CompactProgressMetrics 518 } 519 520 // NewCompactProgressCalculator creates a new CompactionProgressCalculator. 521 func NewCompactionProgressCalculator(reg prometheus.Registerer, planner *tsdbBasedPlanner) *CompactionProgressCalculator { 522 return &CompactionProgressCalculator{ 523 planner: planner, 524 CompactProgressMetrics: &CompactProgressMetrics{ 525 NumberOfCompactionRuns: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 526 Name: "thanos_compact_todo_compactions", 527 Help: "number of compactions to be done", 528 }), 529 NumberOfCompactionBlocks: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 530 Name: "thanos_compact_todo_compaction_blocks", 531 Help: "number of blocks planned to be compacted", 532 }), 533 }, 534 } 535 } 536 537 // ProgressCalculate calculates the number of blocks and compaction runs in the planning process of the given groups. 538 func (ps *CompactionProgressCalculator) ProgressCalculate(ctx context.Context, groups []*Group) error { 539 groupCompactions := make(map[string]int, len(groups)) 540 groupBlocks := make(map[string]int, len(groups)) 541 542 for len(groups) > 0 { 543 tmpGroups := make([]*Group, 0, len(groups)) 544 for _, g := range groups { 545 if len(g.IDs()) == 1 { 546 continue 547 } 548 plan, err := ps.planner.Plan(ctx, g.metasByMinTime, nil, g.extensions) 549 if err != nil { 550 return errors.Wrapf(err, "could not plan") 551 } 552 if len(plan) == 0 { 553 continue 554 } 555 groupCompactions[g.key]++ 556 557 toRemove := make(map[ulid.ULID]struct{}, len(plan)) 558 metas := make([]*tsdb.BlockMeta, 0, len(plan)) 559 for _, p := range plan { 560 metas = append(metas, &p.BlockMeta) 561 toRemove[p.BlockMeta.ULID] = struct{}{} 562 } 563 g.deleteFromGroup(toRemove) 564 565 groupBlocks[g.key] += len(plan) 566 567 if len(g.metasByMinTime) == 0 { 568 continue 569 } 570 571 newMeta := tsdb.CompactBlockMetas(ulid.MustNew(uint64(time.Now().Unix()), nil), metas...) 
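// The synthetic meta above stands in for the block this plan would produce. Appending it back to the
// group lets the next pass of the outer loop plan further compactions on top of it, so the simulation
// counts the full cascade of compaction runs rather than a single pass.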
572 if err := g.AppendMeta(&metadata.Meta{BlockMeta: *newMeta, Thanos: metadata.Thanos{Downsample: metadata.ThanosDownsample{Resolution: g.Resolution()}, Labels: g.Labels().Map()}}); err != nil { 573 return errors.Wrapf(err, "append meta") 574 } 575 tmpGroups = append(tmpGroups, g) 576 } 577 578 groups = tmpGroups 579 } 580 581 ps.CompactProgressMetrics.NumberOfCompactionRuns.Set(0) 582 ps.CompactProgressMetrics.NumberOfCompactionBlocks.Set(0) 583 584 for key, iters := range groupCompactions { 585 ps.CompactProgressMetrics.NumberOfCompactionRuns.Add(float64(iters)) 586 ps.CompactProgressMetrics.NumberOfCompactionBlocks.Add(float64(groupBlocks[key])) 587 } 588 589 return nil 590 } 591 592 // DownsampleProgressMetrics contains Prometheus metrics related to downsampling progress. 593 type DownsampleProgressMetrics struct { 594 NumberOfBlocksDownsampled prometheus.Gauge 595 } 596 597 // DownsampleProgressCalculator contains DownsampleMetrics, which are updated during the downsampling simulation process. 598 type DownsampleProgressCalculator struct { 599 *DownsampleProgressMetrics 600 } 601 602 // NewDownsampleProgressCalculator creates a new DownsampleProgressCalculator. 603 func NewDownsampleProgressCalculator(reg prometheus.Registerer) *DownsampleProgressCalculator { 604 return &DownsampleProgressCalculator{ 605 DownsampleProgressMetrics: &DownsampleProgressMetrics{ 606 NumberOfBlocksDownsampled: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 607 Name: "thanos_compact_todo_downsample_blocks", 608 Help: "number of blocks to be downsampled", 609 }), 610 }, 611 } 612 } 613 614 // ProgressCalculate calculates the number of blocks to be downsampled for the given groups. 615 func (ds *DownsampleProgressCalculator) ProgressCalculate(ctx context.Context, groups []*Group) error { 616 sources5m := map[ulid.ULID]struct{}{} 617 sources1h := map[ulid.ULID]struct{}{} 618 groupBlocks := make(map[string]int, len(groups)) 619 620 for _, group := range groups { 621 for _, m := range group.metasByMinTime { 622 switch m.Thanos.Downsample.Resolution { 623 case downsample.ResLevel0: 624 continue 625 case downsample.ResLevel1: 626 for _, id := range m.Compaction.Sources { 627 sources5m[id] = struct{}{} 628 } 629 case downsample.ResLevel2: 630 for _, id := range m.Compaction.Sources { 631 sources1h[id] = struct{}{} 632 } 633 default: 634 return errors.Errorf("unexpected downsampling resolution %d", m.Thanos.Downsample.Resolution) 635 } 636 637 } 638 } 639 640 for _, group := range groups { 641 for _, m := range group.metasByMinTime { 642 switch m.Thanos.Downsample.Resolution { 643 case downsample.ResLevel0: 644 missing := false 645 for _, id := range m.Compaction.Sources { 646 if _, ok := sources5m[id]; !ok { 647 missing = true 648 break 649 } 650 } 651 if !missing { 652 continue 653 } 654 655 if m.MaxTime-m.MinTime < downsample.ResLevel1DownsampleRange { 656 continue 657 } 658 groupBlocks[group.key]++ 659 case downsample.ResLevel1: 660 missing := false 661 for _, id := range m.Compaction.Sources { 662 if _, ok := sources1h[id]; !ok { 663 missing = true 664 break 665 } 666 } 667 if !missing { 668 continue 669 } 670 671 if m.MaxTime-m.MinTime < downsample.ResLevel2DownsampleRange { 672 continue 673 } 674 groupBlocks[group.key]++ 675 } 676 } 677 } 678 679 ds.DownsampleProgressMetrics.NumberOfBlocksDownsampled.Set(0) 680 for _, blocks := range groupBlocks { 681 ds.DownsampleProgressMetrics.NumberOfBlocksDownsampled.Add(float64(blocks)) 682 } 683 684 return nil 685 } 686 687 // RetentionProgressMetrics contains 
Prometheus metrics related to retention progress. 688 type RetentionProgressMetrics struct { 689 NumberOfBlocksToDelete prometheus.Gauge 690 } 691 692 // RetentionProgressCalculator contains RetentionProgressMetrics, which are updated during the retention simulation process. 693 type RetentionProgressCalculator struct { 694 *RetentionProgressMetrics 695 retentionByResolution map[ResolutionLevel]time.Duration 696 } 697 698 // NewRetentionProgressCalculator creates a new RetentionProgressCalculator. 699 func NewRetentionProgressCalculator(reg prometheus.Registerer, retentionByResolution map[ResolutionLevel]time.Duration) *RetentionProgressCalculator { 700 return &RetentionProgressCalculator{ 701 retentionByResolution: retentionByResolution, 702 RetentionProgressMetrics: &RetentionProgressMetrics{ 703 NumberOfBlocksToDelete: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 704 Name: "thanos_compact_todo_deletion_blocks", 705 Help: "number of blocks that have crossed their retention period", 706 }), 707 }, 708 } 709 } 710 711 // ProgressCalculate calculates the number of blocks to be retained for the given groups. 712 func (rs *RetentionProgressCalculator) ProgressCalculate(ctx context.Context, groups []*Group) error { 713 groupBlocks := make(map[string]int, len(groups)) 714 715 for _, group := range groups { 716 for _, m := range group.metasByMinTime { 717 retentionDuration := rs.retentionByResolution[ResolutionLevel(m.Thanos.Downsample.Resolution)] 718 if retentionDuration.Seconds() == 0 { 719 continue 720 } 721 maxTime := time.Unix(m.MaxTime/1000, 0) 722 if time.Now().After(maxTime.Add(retentionDuration)) { 723 groupBlocks[group.key]++ 724 } 725 } 726 } 727 728 rs.RetentionProgressMetrics.NumberOfBlocksToDelete.Set(0) 729 for _, blocks := range groupBlocks { 730 rs.RetentionProgressMetrics.NumberOfBlocksToDelete.Add(float64(blocks)) 731 } 732 733 return nil 734 } 735 736 // Planner returns blocks to compact. 737 type Planner interface { 738 // Plan returns a list of blocks that should be compacted into single one. 739 // The blocks can be overlapping. The provided metadata has to be ordered by minTime. 740 Plan(ctx context.Context, metasByMinTime []*metadata.Meta, errChan chan error, extensions any) ([]*metadata.Meta, error) 741 } 742 743 type BlockDeletableChecker interface { 744 CanDelete(group *Group, blockID ulid.ULID) bool 745 } 746 747 type DefaultBlockDeletableChecker struct { 748 } 749 750 func (c DefaultBlockDeletableChecker) CanDelete(_ *Group, _ ulid.ULID) bool { 751 return true 752 } 753 754 type CompactionLifecycleCallback interface { 755 PreCompactionCallback(ctx context.Context, logger log.Logger, group *Group, toCompactBlocks []*metadata.Meta) error 756 PostCompactionCallback(ctx context.Context, logger log.Logger, group *Group, blockID ulid.ULID) error 757 GetBlockPopulator(ctx context.Context, logger log.Logger, group *Group) (tsdb.BlockPopulator, error) 758 } 759 760 type DefaultCompactionLifecycleCallback struct { 761 } 762 763 func (c DefaultCompactionLifecycleCallback) PreCompactionCallback(_ context.Context, _ log.Logger, _ *Group, toCompactBlocks []*metadata.Meta) error { 764 // Due to #183 we verify that none of the blocks in the plan have overlapping sources. 765 // This is one potential source of how we could end up with duplicated chunks. 
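// For example, if two blocks in the plan both list some source ULID in their Compaction.Sources,
// compacting them together would duplicate that block's data, so such a plan is rejected with a halt
// error below.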
766 uniqueSources := map[ulid.ULID]struct{}{} 767 for _, m := range toCompactBlocks { 768 for _, s := range m.Compaction.Sources { 769 if _, ok := uniqueSources[s]; ok { 770 return halt(errors.Errorf("overlapping sources detected for plan %v", toCompactBlocks)) 771 } 772 uniqueSources[s] = struct{}{} 773 } 774 } 775 return nil 776 } 777 778 func (c DefaultCompactionLifecycleCallback) PostCompactionCallback(_ context.Context, _ log.Logger, _ *Group, _ ulid.ULID) error { 779 return nil 780 } 781 782 func (c DefaultCompactionLifecycleCallback) GetBlockPopulator(_ context.Context, _ log.Logger, _ *Group) (tsdb.BlockPopulator, error) { 783 return tsdb.DefaultBlockPopulator{}, nil 784 } 785 786 // Compactor provides compaction against an underlying storage of time series data. 787 // This is similar to tsdb.Compactor just without Plan method. 788 // TODO(bwplotka): Split the Planner from Compactor on upstream as well, so we can import it. 789 type Compactor interface { 790 // Write persists a Block into a directory. 791 // No Block is written when resulting Block has 0 samples, and returns empty ulid.ULID{}. 792 Write(dest string, b tsdb.BlockReader, mint, maxt int64, parent *tsdb.BlockMeta) (ulid.ULID, error) 793 794 // Compact runs compaction against the provided directories. Must 795 // only be called concurrently with results of Plan(). 796 // Can optionally pass a list of already open blocks, 797 // to avoid having to reopen them. 798 // When resulting Block has 0 samples 799 // * No block is written. 800 // * The source dirs are marked Deletable. 801 // * Returns empty ulid.ULID{}. 802 Compact(dest string, dirs []string, open []*tsdb.Block) (ulid.ULID, error) 803 CompactWithBlockPopulator(dest string, dirs []string, open []*tsdb.Block, blockPopulator tsdb.BlockPopulator) (ulid.ULID, error) 804 } 805 806 // Compact plans and runs a single compaction against the group. The compacted result 807 // is uploaded into the bucket the blocks were retrieved from. 808 func (cg *Group) Compact(ctx context.Context, dir string, planner Planner, comp Compactor, blockDeletableChecker BlockDeletableChecker, compactionLifecycleCallback CompactionLifecycleCallback) (shouldRerun bool, compID ulid.ULID, rerr error) { 809 cg.compactionRunsStarted.Inc() 810 811 subDir := filepath.Join(dir, cg.Key()) 812 813 defer func() { 814 // Leave the compact directory for inspection if it is a halt error 815 // or if it is not then so that possibly we would not have to download everything again. 816 if rerr != nil { 817 return 818 } 819 if err := os.RemoveAll(subDir); err != nil { 820 level.Error(cg.logger).Log("msg", "failed to remove compaction group work directory", "path", subDir, "err", err) 821 } 822 }() 823 824 if err := os.MkdirAll(subDir, 0750); err != nil { 825 return false, ulid.ULID{}, errors.Wrap(err, "create compaction group dir") 826 } 827 828 errChan := make(chan error, 1) 829 err := tracing.DoInSpanWithErr(ctx, "compaction_group", func(ctx context.Context) (err error) { 830 shouldRerun, compID, err = cg.compact(ctx, subDir, planner, comp, blockDeletableChecker, compactionLifecycleCallback, errChan) 831 return err 832 }, opentracing.Tags{"group.key": cg.Key()}) 833 errChan <- err 834 close(errChan) 835 if err != nil { 836 cg.compactionFailures.Inc() 837 return false, ulid.ULID{}, err 838 } 839 cg.compactionRunsCompleted.Inc() 840 return shouldRerun, compID, nil 841 } 842 843 // Issue347Error is a type wrapper for errors that should invoke repair process for broken block. 
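// The wrapped id records which block tripped the error; RepairIssue347 below uses it to download,
// repair, re-upload and finally mark the offending block for deletion.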
844 type Issue347Error struct {
845 err error
846
847 id ulid.ULID
848 }
849
850 func issue347Error(err error, brokenBlock ulid.ULID) Issue347Error {
851 return Issue347Error{err: err, id: brokenBlock}
852 }
853
854 func (e Issue347Error) Error() string {
855 return e.err.Error()
856 }
857
858 // IsIssue347Error returns true if the base error is an Issue347Error.
859 func IsIssue347Error(err error) bool {
860 _, ok := errors.Cause(err).(Issue347Error)
861 return ok
862 }
863
864 // OutOfOrderChunksError is a type wrapper for an out-of-order chunk error found while validating a block index.
865 type OutOfOrderChunksError struct {
866 err error
867 id ulid.ULID
868 }
869
870 func (e OutOfOrderChunksError) Error() string {
871 return e.err.Error()
872 }
873
874 func outOfOrderChunkError(err error, brokenBlock ulid.ULID) OutOfOrderChunksError {
875 return OutOfOrderChunksError{err: err, id: brokenBlock}
876 }
877
878 // IsOutOfOrderChunkError returns true if the base error is an OutOfOrderChunksError.
879 func IsOutOfOrderChunkError(err error) bool {
880 _, ok := errors.Cause(err).(OutOfOrderChunksError)
881 return ok
882 }
883
884 // HaltError is a type wrapper for errors that should halt any further progress on compactions.
885 type HaltError struct {
886 err error
887 }
888
889 func halt(err error) HaltError {
890 return HaltError{err: err}
891 }
892
893 func (e HaltError) Error() string {
894 return e.err.Error()
895 }
896
897 // IsHaltError returns true if the base error is a HaltError.
898 // If a multierror is passed, any halt error will return true.
899 func IsHaltError(err error) bool {
900 if multiErr, ok := errors.Cause(err).(errutil.NonNilMultiError); ok {
901 for _, err := range multiErr {
902 if _, ok := errors.Cause(err).(HaltError); ok {
903 return true
904 }
905 }
906 return false
907 }
908
909 _, ok := errors.Cause(err).(HaltError)
910 return ok
911 }
912
913 // RetryError is a type wrapper for errors that should trigger a warning log and a retry of the whole compaction loop, while
914 // aborting any further progress of the current compaction.
915 type RetryError struct {
916 err error
917 }
918
919 func retry(err error) error {
920 if IsHaltError(err) {
921 return err
922 }
923 return RetryError{err: err}
924 }
925
926 func (e RetryError) Error() string {
927 return e.err.Error()
928 }
929
930 // IsRetryError returns true if the base error is a RetryError.
931 // If a multierror is passed, all errors must be retriable.
932 func IsRetryError(err error) bool {
933 if multiErr, ok := errors.Cause(err).(errutil.NonNilMultiError); ok {
934 for _, err := range multiErr {
935 if _, ok := errors.Cause(err).(RetryError); !ok {
936 return false
937 }
938 }
939 return true
940 }
941
942 _, ok := errors.Cause(err).(RetryError)
943 return ok
944 }
945
946 func (cg *Group) areBlocksOverlapping(include *metadata.Meta, exclude ...*metadata.Meta) error {
947 var (
948 metas []tsdb.BlockMeta
949 excludeMap = map[ulid.ULID]struct{}{}
950 )
951
952 for _, meta := range exclude {
953 excludeMap[meta.ULID] = struct{}{}
954 }
955
956 for _, m := range cg.metasByMinTime {
957 if _, ok := excludeMap[m.ULID]; ok {
958 continue
959 }
960 metas = append(metas, m.BlockMeta)
961 }
962
963 if include != nil {
964 metas = append(metas, include.BlockMeta)
965 }
966
967 sort.Slice(metas, func(i, j int) bool {
968 return metas[i].MinTime < metas[j].MinTime
969 })
970 if overlaps := tsdb.OverlappingBlocks(metas); len(overlaps) > 0 {
971 return errors.Errorf("overlaps found while gathering blocks. %s", overlaps)
972 }
973 return nil
974 }
975
976 // RepairIssue347 repairs the https://github.com/prometheus/tsdb/issues/347 issue when having issue347Error.
977 func RepairIssue347(ctx context.Context, logger log.Logger, bkt objstore.Bucket, blocksMarkedForDeletion prometheus.Counter, issue347Err error) error {
978 ie, ok := errors.Cause(issue347Err).(Issue347Error)
979 if !ok {
980 return errors.Errorf("Given error is not an issue347 error: %v", issue347Err)
981 }
982
983 level.Info(logger).Log("msg", "Repairing block broken by https://github.com/prometheus/tsdb/issues/347", "id", ie.id, "err", issue347Err)
984
985 tmpdir, err := os.MkdirTemp("", fmt.Sprintf("repair-issue-347-id-%s-", ie.id))
986 if err != nil {
987 return err
988 }
989
990 defer func() {
991 if err := os.RemoveAll(tmpdir); err != nil {
992 level.Warn(logger).Log("msg", "failed to remove tmpdir", "err", err, "tmpdir", tmpdir)
993 }
994 }()
995
996 bdir := filepath.Join(tmpdir, ie.id.String())
997 if err := block.Download(ctx, logger, bkt, ie.id, bdir); err != nil {
998 return retry(errors.Wrapf(err, "download block %s", ie.id))
999 }
1000
1001 meta, err := metadata.ReadFromDir(bdir)
1002 if err != nil {
1003 return errors.Wrapf(err, "read meta from %s", bdir)
1004 }
1005
1006 resid, err := block.Repair(logger, tmpdir, ie.id, metadata.CompactorRepairSource, block.IgnoreIssue347OutsideChunk)
1007 if err != nil {
1008 return errors.Wrapf(err, "repair failed for block %s", ie.id)
1009 }
1010
1011 // Verify repaired id before uploading it.
1012 if err := block.VerifyIndex(logger, filepath.Join(tmpdir, resid.String(), block.IndexFilename), meta.MinTime, meta.MaxTime); err != nil {
1013 return errors.Wrapf(err, "repaired block is invalid %s", resid)
1014 }
1015
1016 level.Info(logger).Log("msg", "uploading repaired block", "newID", resid)
1017 if err = block.Upload(ctx, logger, bkt, filepath.Join(tmpdir, resid.String()), metadata.NoneFunc); err != nil {
1018 return retry(errors.Wrapf(err, "upload of %s failed", resid))
1019 }
1020
1021 level.Info(logger).Log("msg", "deleting broken block", "id", ie.id)
1022
1023 // Spawn a new context so we always mark a block for deletion in full on shutdown.
1024 delCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
1025 defer cancel()
1026
1027 // TODO(bplotka): Issue with this will introduce overlap that will halt compactor. Automate that (fix duplicate overlaps caused by this).
1028 if err := block.MarkForDeletion(delCtx, logger, bkt, ie.id, "source of repaired block", blocksMarkedForDeletion); err != nil {
1029 return errors.Wrapf(err, "marking old block %s for deletion has failed", ie.id)
1030 }
1031 return nil
1032 }
1033
1034 func (cg *Group) compact(ctx context.Context, dir string, planner Planner, comp Compactor, blockDeletableChecker BlockDeletableChecker, compactionLifecycleCallback CompactionLifecycleCallback, errChan chan error) (shouldRerun bool, compID ulid.ULID, _ error) {
1035 cg.mtx.Lock()
1036 defer cg.mtx.Unlock()
1037
1038 // Check for overlapped blocks.
1039 overlappingBlocks := false
1040 if err := cg.areBlocksOverlapping(nil); err != nil {
1041 // TODO(bwplotka): It would be really nice if we could still check for other overlaps than replica. In fact this should be checked
1042 // in syncer itself. Otherwise with vertical compaction enabled we will sacrifice this important check.
1043 if !cg.enableVerticalCompaction { 1044 return false, ulid.ULID{}, halt(errors.Wrap(err, "pre compaction overlap check")) 1045 } 1046 1047 overlappingBlocks = true 1048 } 1049 1050 var toCompact []*metadata.Meta 1051 if err := tracing.DoInSpanWithErr(ctx, "compaction_planning", func(ctx context.Context) (e error) { 1052 toCompact, e = planner.Plan(ctx, cg.metasByMinTime, errChan, cg.extensions) 1053 return e 1054 }); err != nil { 1055 return false, ulid.ULID{}, errors.Wrap(err, "plan compaction") 1056 } 1057 if len(toCompact) == 0 { 1058 // Nothing to do. 1059 return false, ulid.ULID{}, nil 1060 } 1061 1062 level.Info(cg.logger).Log("msg", "compaction available and planned", "plan", fmt.Sprintf("%v", toCompact)) 1063 1064 // Once we have a plan we need to download the actual data. 1065 groupCompactionBegin := time.Now() 1066 begin := groupCompactionBegin 1067 1068 if err := compactionLifecycleCallback.PreCompactionCallback(ctx, cg.logger, cg, toCompact); err != nil { 1069 return false, ulid.ULID{}, errors.Wrapf(err, "failed to run pre compaction callback for plan: %s", fmt.Sprintf("%v", toCompact)) 1070 } 1071 level.Info(cg.logger).Log("msg", "finished running pre compaction callback; downloading blocks", "plan", fmt.Sprintf("%v", toCompact), "duration", time.Since(begin), "duration_ms", time.Since(begin).Milliseconds()) 1072 1073 begin = time.Now() 1074 g, errCtx := errgroup.WithContext(ctx) 1075 g.SetLimit(cg.compactBlocksFetchConcurrency) 1076 1077 toCompactDirs := make([]string, 0, len(toCompact)) 1078 for _, m := range toCompact { 1079 bdir := filepath.Join(dir, m.ULID.String()) 1080 func(ctx context.Context, meta *metadata.Meta) { 1081 g.Go(func() error { 1082 start := time.Now() 1083 if err := tracing.DoInSpanWithErr(ctx, "compaction_block_download", func(ctx context.Context) error { 1084 return block.Download(ctx, cg.logger, cg.bkt, meta.ULID, bdir, objstore.WithFetchConcurrency(cg.blockFilesConcurrency)) 1085 }, opentracing.Tags{"block.id": meta.ULID}); err != nil { 1086 return retry(errors.Wrapf(err, "download block %s", meta.ULID)) 1087 } 1088 level.Debug(cg.logger).Log("msg", "downloaded block", "block", meta.ULID.String(), "duration", time.Since(start), "duration_ms", time.Since(start).Milliseconds()) 1089 1090 start = time.Now() 1091 // Ensure all input blocks are valid. 
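// Each failure mode below maps to a distinct error type: a critical index error halts the compactor,
// out-of-order chunks produce an OutOfOrderChunksError (which the bucket compactor can turn into a
// no-compact marker), issue-347 chunks produce a repairable Issue347Error, and out-of-order labels are
// only tolerated when acceptMalformedIndex is set.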
1092 var stats block.HealthStats 1093 if err := tracing.DoInSpanWithErr(ctx, "compaction_block_health_stats", func(ctx context.Context) (e error) { 1094 stats, e = block.GatherIndexHealthStats(cg.logger, filepath.Join(bdir, block.IndexFilename), meta.MinTime, meta.MaxTime) 1095 return e 1096 }, opentracing.Tags{"block.id": meta.ULID}); err != nil { 1097 return errors.Wrapf(err, "gather index issues for block %s", bdir) 1098 } 1099 1100 if err := stats.CriticalErr(); err != nil { 1101 return halt(errors.Wrapf(err, "block with not healthy index found %s; Compaction level %v; Labels: %v", bdir, meta.Compaction.Level, meta.Thanos.Labels)) 1102 } 1103 1104 if err := stats.OutOfOrderChunksErr(); err != nil { 1105 return outOfOrderChunkError(errors.Wrapf(err, "blocks with out-of-order chunks are dropped from compaction: %s", bdir), meta.ULID) 1106 } 1107 1108 if err := stats.Issue347OutsideChunksErr(); err != nil { 1109 return issue347Error(errors.Wrapf(err, "invalid, but reparable block %s", bdir), meta.ULID) 1110 } 1111 1112 if err := stats.OutOfOrderLabelsErr(); !cg.acceptMalformedIndex && err != nil { 1113 return errors.Wrapf(err, 1114 "block id %s, try running with --debug.accept-malformed-index", meta.ULID) 1115 } 1116 level.Debug(cg.logger).Log("msg", "verified block", "block", meta.ULID.String(), "duration", time.Since(start), "duration_ms", time.Since(start).Milliseconds()) 1117 return nil 1118 }) 1119 }(errCtx, m) 1120 1121 toCompactDirs = append(toCompactDirs, bdir) 1122 } 1123 sourceBlockStr := fmt.Sprintf("%v", toCompactDirs) 1124 1125 if err := g.Wait(); err != nil { 1126 return false, ulid.ULID{}, err 1127 } 1128 1129 level.Info(cg.logger).Log("msg", "downloaded and verified blocks; compacting blocks", "plan", sourceBlockStr, "duration", time.Since(begin), "duration_ms", time.Since(begin).Milliseconds()) 1130 1131 begin = time.Now() 1132 if err := tracing.DoInSpanWithErr(ctx, "compaction", func(ctx context.Context) (e error) { 1133 populateBlockFunc, e := compactionLifecycleCallback.GetBlockPopulator(ctx, cg.logger, cg) 1134 if e != nil { 1135 return e 1136 } 1137 compID, e = comp.CompactWithBlockPopulator(dir, toCompactDirs, nil, populateBlockFunc) 1138 return e 1139 }); err != nil { 1140 return false, ulid.ULID{}, halt(errors.Wrapf(err, "compact blocks %v", toCompactDirs)) 1141 } 1142 if compID == (ulid.ULID{}) { 1143 // Prometheus compactor found that the compacted block would have no samples. 1144 level.Info(cg.logger).Log("msg", "compacted block would have no samples, deleting source blocks", "blocks", sourceBlockStr) 1145 for _, meta := range toCompact { 1146 if meta.Stats.NumSamples == 0 { 1147 if err := cg.deleteBlock(meta.ULID, filepath.Join(dir, meta.ULID.String()), blockDeletableChecker); err != nil { 1148 level.Warn(cg.logger).Log("msg", "failed to mark for deletion an empty block found during compaction", "block", meta.ULID) 1149 } 1150 } 1151 } 1152 // Even though this block was empty, there may be more work to do. 
1153 return true, ulid.ULID{}, nil 1154 } 1155 cg.compactions.Inc() 1156 if overlappingBlocks { 1157 cg.verticalCompactions.Inc() 1158 } 1159 level.Info(cg.logger).Log("msg", "compacted blocks", "new", compID, 1160 "blocks", sourceBlockStr, "duration", time.Since(begin), "duration_ms", time.Since(begin).Milliseconds(), "overlapping_blocks", overlappingBlocks) 1161 1162 bdir := filepath.Join(dir, compID.String()) 1163 index := filepath.Join(bdir, block.IndexFilename) 1164 1165 if err := os.Remove(filepath.Join(bdir, "tombstones")); err != nil { 1166 return false, ulid.ULID{}, errors.Wrap(err, "remove tombstones") 1167 } 1168 1169 newMeta, err := metadata.ReadFromDir(bdir) 1170 if err != nil { 1171 return false, ulid.ULID{}, errors.Wrap(err, "read new meta") 1172 } 1173 1174 var stats block.HealthStats 1175 // Ensure the output block is valid. 1176 err = tracing.DoInSpanWithErr(ctx, "compaction_verify_index", func(ctx context.Context) error { 1177 stats, err = block.GatherIndexHealthStats(cg.logger, index, newMeta.MinTime, newMeta.MaxTime) 1178 if err != nil { 1179 return err 1180 } 1181 return stats.AnyErr() 1182 }) 1183 if !cg.acceptMalformedIndex && err != nil { 1184 return false, ulid.ULID{}, halt(errors.Wrapf(err, "invalid result block %s", bdir)) 1185 } 1186 1187 thanosMeta := metadata.Thanos{ 1188 Labels: cg.labels.Map(), 1189 Downsample: metadata.ThanosDownsample{Resolution: cg.resolution}, 1190 Source: metadata.CompactorSource, 1191 SegmentFiles: block.GetSegmentFiles(bdir), 1192 Extensions: cg.extensions, 1193 } 1194 if stats.ChunkMaxSize > 0 { 1195 thanosMeta.IndexStats.ChunkMaxSize = stats.ChunkMaxSize 1196 } 1197 if stats.SeriesMaxSize > 0 { 1198 thanosMeta.IndexStats.SeriesMaxSize = stats.SeriesMaxSize 1199 } 1200 newMeta, err = metadata.InjectThanos(cg.logger, bdir, thanosMeta, nil) 1201 if err != nil { 1202 return false, ulid.ULID{}, errors.Wrapf(err, "failed to finalize the block %s", bdir) 1203 } 1204 1205 // Ensure the output block is not overlapping with anything else, 1206 // unless vertical compaction is enabled. 1207 if !cg.enableVerticalCompaction { 1208 if err := cg.areBlocksOverlapping(newMeta, toCompact...); err != nil { 1209 return false, ulid.ULID{}, halt(errors.Wrapf(err, "resulted compacted block %s overlaps with something", bdir)) 1210 } 1211 } 1212 1213 begin = time.Now() 1214 1215 err = tracing.DoInSpanWithErr(ctx, "compaction_block_upload", func(ctx context.Context) error { 1216 return block.Upload(ctx, cg.logger, cg.bkt, bdir, cg.hashFunc, objstore.WithUploadConcurrency(cg.blockFilesConcurrency)) 1217 }) 1218 if err != nil { 1219 return false, ulid.ULID{}, retry(errors.Wrapf(err, "upload of %s failed", compID)) 1220 } 1221 level.Info(cg.logger).Log("msg", "uploaded block", "result_block", compID, "duration", time.Since(begin), "duration_ms", time.Since(begin).Milliseconds()) 1222 1223 // Mark for deletion the blocks we just compacted from the group and bucket so they do not get included 1224 // into the next planning cycle. 1225 // Eventually the block we just uploaded should get synced into the group again (including sync-delay). 
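// Marking is a soft delete: deleteBlock removes the local copy immediately but, if the checker allows
// it, only uploads a deletion mark for the bucket copy; the object itself is assumed to be removed
// later by a separate cleanup step once the configured deletion delay has passed.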
1226 for _, meta := range toCompact { 1227 err = tracing.DoInSpanWithErr(ctx, "compaction_block_delete", func(ctx context.Context) error { 1228 return cg.deleteBlock(meta.ULID, filepath.Join(dir, meta.ULID.String()), blockDeletableChecker) 1229 }, opentracing.Tags{"block.id": meta.ULID}) 1230 if err != nil { 1231 return false, ulid.ULID{}, retry(errors.Wrapf(err, "mark old block for deletion from bucket")) 1232 } 1233 cg.groupGarbageCollectedBlocks.Inc() 1234 } 1235 1236 level.Info(cg.logger).Log("msg", "running post compaction callback", "result_block", compID) 1237 if err := compactionLifecycleCallback.PostCompactionCallback(ctx, cg.logger, cg, compID); err != nil { 1238 return false, ulid.ULID{}, retry(errors.Wrapf(err, "failed to run post compaction callback for result block %s", compID)) 1239 } 1240 level.Info(cg.logger).Log("msg", "finished running post compaction callback", "result_block", compID) 1241 1242 level.Info(cg.logger).Log("msg", "finished compacting blocks", "result_block", compID, "source_blocks", sourceBlockStr, 1243 "duration", time.Since(groupCompactionBegin), "duration_ms", time.Since(groupCompactionBegin).Milliseconds()) 1244 return true, compID, nil 1245 } 1246 1247 func (cg *Group) deleteBlock(id ulid.ULID, bdir string, blockDeletableChecker BlockDeletableChecker) error { 1248 if err := os.RemoveAll(bdir); err != nil { 1249 return errors.Wrapf(err, "remove old block dir %s", id) 1250 } 1251 1252 if blockDeletableChecker.CanDelete(cg, id) { 1253 // Spawn a new context so we always mark a block for deletion in full on shutdown. 1254 delCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) 1255 defer cancel() 1256 level.Info(cg.logger).Log("msg", "marking compacted block for deletion", "old_block", id) 1257 if err := block.MarkForDeletion(delCtx, cg.logger, cg.bkt, id, "source of compacted block", cg.blocksMarkedForDeletion); err != nil { 1258 return errors.Wrapf(err, "mark block %s for deletion from bucket", id) 1259 } 1260 } 1261 return nil 1262 } 1263 1264 // BucketCompactor compacts blocks in a bucket. 1265 type BucketCompactor struct { 1266 logger log.Logger 1267 sy *Syncer 1268 grouper Grouper 1269 comp Compactor 1270 planner Planner 1271 blockDeletableChecker BlockDeletableChecker 1272 compactionLifecycleCallback CompactionLifecycleCallback 1273 compactDir string 1274 bkt objstore.Bucket 1275 concurrency int 1276 skipBlocksWithOutOfOrderChunks bool 1277 } 1278 1279 // NewBucketCompactor creates a new bucket compactor. 
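// It wires in DefaultBlockDeletableChecker and DefaultCompactionLifecycleCallback; use
// NewBucketCompactorWithCheckerAndCallback below to supply custom implementations.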
1280 func NewBucketCompactor( 1281 logger log.Logger, 1282 sy *Syncer, 1283 grouper Grouper, 1284 planner Planner, 1285 comp Compactor, 1286 compactDir string, 1287 bkt objstore.Bucket, 1288 concurrency int, 1289 skipBlocksWithOutOfOrderChunks bool, 1290 ) (*BucketCompactor, error) { 1291 if concurrency <= 0 { 1292 return nil, errors.Errorf("invalid concurrency level (%d), concurrency level must be > 0", concurrency) 1293 } 1294 return NewBucketCompactorWithCheckerAndCallback( 1295 logger, 1296 sy, 1297 grouper, 1298 planner, 1299 comp, 1300 DefaultBlockDeletableChecker{}, 1301 DefaultCompactionLifecycleCallback{}, 1302 compactDir, 1303 bkt, 1304 concurrency, 1305 skipBlocksWithOutOfOrderChunks, 1306 ) 1307 } 1308 1309 func NewBucketCompactorWithCheckerAndCallback( 1310 logger log.Logger, 1311 sy *Syncer, 1312 grouper Grouper, 1313 planner Planner, 1314 comp Compactor, 1315 blockDeletableChecker BlockDeletableChecker, 1316 compactionLifecycleCallback CompactionLifecycleCallback, 1317 compactDir string, 1318 bkt objstore.Bucket, 1319 concurrency int, 1320 skipBlocksWithOutOfOrderChunks bool, 1321 ) (*BucketCompactor, error) { 1322 if concurrency <= 0 { 1323 return nil, errors.Errorf("invalid concurrency level (%d), concurrency level must be > 0", concurrency) 1324 } 1325 return &BucketCompactor{ 1326 logger: logger, 1327 sy: sy, 1328 grouper: grouper, 1329 planner: planner, 1330 comp: comp, 1331 blockDeletableChecker: blockDeletableChecker, 1332 compactionLifecycleCallback: compactionLifecycleCallback, 1333 compactDir: compactDir, 1334 bkt: bkt, 1335 concurrency: concurrency, 1336 skipBlocksWithOutOfOrderChunks: skipBlocksWithOutOfOrderChunks, 1337 }, nil 1338 } 1339 1340 // Compact runs compaction over bucket. 1341 func (c *BucketCompactor) Compact(ctx context.Context) (rerr error) { 1342 defer func() { 1343 // Do not remove the compactDir if an error has occurred 1344 // because potentially on the next run we would not have to download 1345 // everything again. 1346 if rerr != nil { 1347 return 1348 } 1349 if err := os.RemoveAll(c.compactDir); err != nil { 1350 level.Error(c.logger).Log("msg", "failed to remove compaction work directory", "path", c.compactDir, "err", err) 1351 } 1352 }() 1353 1354 // Loop over bucket and compact until there's no work left. 1355 for { 1356 var ( 1357 wg sync.WaitGroup 1358 workCtx, workCtxCancel = context.WithCancel(ctx) 1359 groupChan = make(chan *Group) 1360 errChan = make(chan error, c.concurrency) 1361 finishedAllGroups = true 1362 mtx sync.Mutex 1363 ) 1364 defer workCtxCancel() 1365 1366 // Set up workers who will compact the groups when the groups are ready. 1367 // They will compact available groups until they encounter an error, after which they will stop. 
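// groupChan is unbuffered, so the scheduling loop further down blocks until some worker is free,
// while errChan has capacity for one error per worker so that a failing worker can report its error
// and return without blocking.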
1368 for i := 0; i < c.concurrency; i++ { 1369 wg.Add(1) 1370 go func() { 1371 defer wg.Done() 1372 for g := range groupChan { 1373 shouldRerunGroup, _, err := g.Compact(workCtx, c.compactDir, c.planner, c.comp, c.blockDeletableChecker, c.compactionLifecycleCallback) 1374 if err == nil { 1375 if shouldRerunGroup { 1376 mtx.Lock() 1377 finishedAllGroups = false 1378 mtx.Unlock() 1379 } 1380 continue 1381 } 1382 1383 if IsIssue347Error(err) { 1384 if err := RepairIssue347(workCtx, c.logger, c.bkt, c.sy.metrics.blocksMarkedForDeletion, err); err == nil { 1385 mtx.Lock() 1386 finishedAllGroups = false 1387 mtx.Unlock() 1388 continue 1389 } 1390 } 1391 // If block has out of order chunk and it has been configured to skip it, 1392 // then we can mark the block for no compaction so that the next compaction run 1393 // will skip it. 1394 if IsOutOfOrderChunkError(err) && c.skipBlocksWithOutOfOrderChunks { 1395 if err := block.MarkForNoCompact( 1396 ctx, 1397 c.logger, 1398 c.bkt, 1399 err.(OutOfOrderChunksError).id, 1400 metadata.OutOfOrderChunksNoCompactReason, 1401 "OutofOrderChunk: marking block with out-of-order series/chunks to as no compact to unblock compaction", g.blocksMarkedForNoCompact); err == nil { 1402 mtx.Lock() 1403 finishedAllGroups = false 1404 mtx.Unlock() 1405 continue 1406 } 1407 } 1408 errChan <- errors.Wrapf(err, "group %s", g.Key()) 1409 return 1410 } 1411 }() 1412 } 1413 1414 level.Info(c.logger).Log("msg", "start sync of metas") 1415 if err := c.sy.SyncMetas(ctx); err != nil { 1416 return errors.Wrap(err, "sync") 1417 } 1418 1419 level.Info(c.logger).Log("msg", "start of GC") 1420 // Blocks that were compacted are garbage collected after each Compaction. 1421 // However if compactor crashes we need to resolve those on startup. 1422 if err := c.sy.GarbageCollect(ctx); err != nil { 1423 return errors.Wrap(err, "garbage") 1424 } 1425 1426 groups, err := c.grouper.Groups(c.sy.Metas()) 1427 if err != nil { 1428 return errors.Wrap(err, "build compaction groups") 1429 } 1430 1431 ignoreDirs := []string{} 1432 for _, gr := range groups { 1433 for _, grID := range gr.IDs() { 1434 ignoreDirs = append(ignoreDirs, filepath.Join(gr.Key(), grID.String())) 1435 } 1436 } 1437 1438 if err := runutil.DeleteAll(c.compactDir, ignoreDirs...); err != nil { 1439 level.Warn(c.logger).Log("msg", "failed deleting non-compaction group directories/files, some disk space usage might have leaked. Continuing", "err", err, "dir", c.compactDir) 1440 } 1441 1442 level.Info(c.logger).Log("msg", "start of compactions") 1443 1444 // Send all groups found during this pass to the compaction workers. 1445 var groupErrs errutil.MultiError 1446 groupLoop: 1447 for _, g := range groups { 1448 // Ignore groups with only one block because there is nothing to compact. 1449 if len(g.IDs()) == 1 { 1450 continue 1451 } 1452 select { 1453 case groupErr := <-errChan: 1454 groupErrs.Add(groupErr) 1455 break groupLoop 1456 case groupChan <- g: 1457 } 1458 } 1459 close(groupChan) 1460 wg.Wait() 1461 1462 // Collect any other error reported by the workers, or any error reported 1463 // while we were waiting for the last batch of groups to run the compaction. 
1464 close(errChan) 1465 for groupErr := range errChan { 1466 groupErrs.Add(groupErr) 1467 } 1468 1469 workCtxCancel() 1470 if len(groupErrs) > 0 { 1471 return groupErrs.Err() 1472 } 1473 1474 if finishedAllGroups { 1475 break 1476 } 1477 } 1478 level.Info(c.logger).Log("msg", "compaction iterations done") 1479 return nil 1480 } 1481 1482 var _ block.MetadataFilter = &GatherNoCompactionMarkFilter{} 1483 1484 // GatherNoCompactionMarkFilter is a block.Fetcher filter that passes all metas. While doing it, it gathers all no-compact-mark.json markers. 1485 // Not go routine safe. 1486 // TODO(bwplotka): Add unit test. 1487 type GatherNoCompactionMarkFilter struct { 1488 logger log.Logger 1489 bkt objstore.InstrumentedBucketReader 1490 noCompactMarkedMap map[ulid.ULID]*metadata.NoCompactMark 1491 concurrency int 1492 mtx sync.Mutex 1493 } 1494 1495 // NewGatherNoCompactionMarkFilter creates GatherNoCompactionMarkFilter. 1496 func NewGatherNoCompactionMarkFilter(logger log.Logger, bkt objstore.InstrumentedBucketReader, concurrency int) *GatherNoCompactionMarkFilter { 1497 return &GatherNoCompactionMarkFilter{ 1498 logger: logger, 1499 bkt: bkt, 1500 concurrency: concurrency, 1501 } 1502 } 1503 1504 // NoCompactMarkedBlocks returns block ids that were marked for no compaction. 1505 func (f *GatherNoCompactionMarkFilter) NoCompactMarkedBlocks() map[ulid.ULID]*metadata.NoCompactMark { 1506 f.mtx.Lock() 1507 copiedNoCompactMarked := make(map[ulid.ULID]*metadata.NoCompactMark, len(f.noCompactMarkedMap)) 1508 for k, v := range f.noCompactMarkedMap { 1509 copiedNoCompactMarked[k] = v 1510 } 1511 f.mtx.Unlock() 1512 1513 return copiedNoCompactMarked 1514 } 1515 1516 // Filter passes all metas, while gathering no compact markers. 1517 func (f *GatherNoCompactionMarkFilter) Filter(ctx context.Context, metas map[ulid.ULID]*metadata.Meta, synced block.GaugeVec, modified block.GaugeVec) error { 1518 var localNoCompactMapMtx sync.Mutex 1519 1520 noCompactMarkedMap := make(map[ulid.ULID]*metadata.NoCompactMark) 1521 1522 // Make a copy of block IDs to check, in order to avoid concurrency issues 1523 // between the scheduler and workers. 1524 blockIDs := make([]ulid.ULID, 0, len(metas)) 1525 for id := range metas { 1526 blockIDs = append(blockIDs, id) 1527 } 1528 1529 var ( 1530 eg errgroup.Group 1531 ch = make(chan ulid.ULID, f.concurrency) 1532 ) 1533 1534 for i := 0; i < f.concurrency; i++ { 1535 eg.Go(func() error { 1536 var lastErr error 1537 for id := range ch { 1538 m := &metadata.NoCompactMark{} 1539 // TODO(bwplotka): Hook up bucket cache here + reset API so we don't introduce API calls . 1540 if err := metadata.ReadMarker(ctx, f.logger, f.bkt, id.String(), m); err != nil { 1541 if errors.Cause(err) == metadata.ErrorMarkerNotFound { 1542 continue 1543 } 1544 if errors.Cause(err) == metadata.ErrorUnmarshalMarker { 1545 level.Warn(f.logger).Log("msg", "found partial no-compact-mark.json; if we will see it happening often for the same block, consider manually deleting no-compact-mark.json from the object storage", "block", id, "err", err) 1546 continue 1547 } 1548 // Remember the last error and continue draining the channel. 1549 lastErr = err 1550 continue 1551 } 1552 1553 localNoCompactMapMtx.Lock() 1554 noCompactMarkedMap[id] = m 1555 localNoCompactMapMtx.Unlock() 1556 synced.WithLabelValues(block.MarkedForNoCompactionMeta).Inc() 1557 } 1558 1559 return lastErr 1560 }) 1561 } 1562 1563 // Workers scheduled, distribute blocks. 
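// The producer below owns ch: it stops queueing if ctx is cancelled and closes the channel when done,
// which is what lets the worker goroutines above finish their range loops.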
1564 eg.Go(func() error { 1565 defer close(ch) 1566 1567 for _, id := range blockIDs { 1568 select { 1569 case ch <- id: 1570 // Nothing to do. 1571 case <-ctx.Done(): 1572 return ctx.Err() 1573 } 1574 } 1575 1576 return nil 1577 }) 1578 1579 if err := eg.Wait(); err != nil { 1580 return errors.Wrap(err, "filter blocks marked for no compaction") 1581 } 1582 1583 f.mtx.Lock() 1584 f.noCompactMarkedMap = noCompactMarkedMap 1585 f.mtx.Unlock() 1586 1587 return nil 1588 }
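// exampleRunCompaction is a minimal wiring sketch, not part of the upstream file, showing how the
// pieces above are typically combined. The fetcher, filters, planner and compactor are assumed to be
// built elsewhere (e.g. a metadata fetcher from pkg/block, the planner from planner.go in this
// package, and Prometheus' tsdb.LeveledCompactor), and the metric names used here are illustrative
// placeholders rather than the ones the Thanos compactor command registers.
func exampleRunCompaction(
	ctx context.Context,
	logger log.Logger,
	reg prometheus.Registerer,
	bkt objstore.Bucket,
	fetcher block.MetadataFetcher,
	duplicateBlocksFilter block.DeduplicateFilter,
	ignoreDeletionMarkFilter *block.IgnoreDeletionMarkFilter,
	planner Planner,
	comp Compactor,
) error {
	// Counters shared between the syncer and the grouper. The names are placeholders for this sketch.
	blocksMarkedForDeletion := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "example_blocks_marked_for_deletion_total",
		Help: "Total number of blocks marked for deletion.",
	})
	garbageCollectedBlocks := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "example_garbage_collected_blocks_total",
		Help: "Total number of blocks garbage collected.",
	})
	blocksMarkedForNoCompact := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "example_blocks_marked_for_no_compact_total",
		Help: "Total number of blocks marked for no compaction.",
	})

	sy, err := NewMetaSyncer(logger, reg, bkt, fetcher, duplicateBlocksFilter, ignoreDeletionMarkFilter, blocksMarkedForDeletion, garbageCollectedBlocks)
	if err != nil {
		return err
	}

	// No vertical compaction, no tolerance for malformed indexes, single-goroutine block file fetches.
	grouper := NewDefaultGrouper(logger, bkt, false, false, reg, blocksMarkedForDeletion, garbageCollectedBlocks, blocksMarkedForNoCompact, metadata.NoneFunc, 1, 1)

	bc, err := NewBucketCompactor(logger, sy, grouper, planner, comp, "/tmp/thanos-compact", bkt, 1, true)
	if err != nil {
		return err
	}

	// Compact keeps looping (sync metas, garbage collect, group, plan, compact) until no group reports
	// more work, then returns.
	return bc.Compact(ctx)
}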