github.com/thanos-io/thanos@v0.32.5/pkg/compact/planner.go

// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package compact

import (
	"context"
	"fmt"
	"math"
	"path/filepath"

	"github.com/go-kit/log"
	"github.com/oklog/ulid"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/thanos-io/objstore"

	"github.com/thanos-io/thanos/pkg/block"
	"github.com/thanos-io/thanos/pkg/block/metadata"
)

type tsdbBasedPlanner struct {
	logger log.Logger

	ranges []int64

	noCompBlocksFunc func() map[ulid.ULID]*metadata.NoCompactMark
}

var _ Planner = &tsdbBasedPlanner{}

// NewTSDBBasedPlanner is a planner with the same functionality as Prometheus' TSDB planner,
// just without accessing the filesystem.
// TODO(bwplotka): Consider upstreaming this to Prometheus.
func NewTSDBBasedPlanner(logger log.Logger, ranges []int64) *tsdbBasedPlanner {
	return &tsdbBasedPlanner{
		logger: logger,
		ranges: ranges,
		noCompBlocksFunc: func() map[ulid.ULID]*metadata.NoCompactMark {
			return make(map[ulid.ULID]*metadata.NoCompactMark)
		},
	}
}

// NewPlanner returns the default Thanos planner: the same functionality as Prometheus' TSDB planner,
// just without accessing the filesystem, plus special handling of excluded (no-compact marked) blocks.
func NewPlanner(logger log.Logger, ranges []int64, noCompBlocks *GatherNoCompactionMarkFilter) *tsdbBasedPlanner {
	return &tsdbBasedPlanner{logger: logger, ranges: ranges, noCompBlocksFunc: noCompBlocks.NoCompactMarkedBlocks}
}

// TODO(bwplotka): Consider a smarter algorithm; this prefers smaller iterative compactions vs a big single one: https://github.com/thanos-io/thanos/issues/3405
func (p *tsdbBasedPlanner) Plan(_ context.Context, metasByMinTime []*metadata.Meta, _ chan error, _ any) ([]*metadata.Meta, error) {
	return p.plan(p.noCompBlocksFunc(), metasByMinTime)
}

func (p *tsdbBasedPlanner) plan(noCompactMarked map[ulid.ULID]*metadata.NoCompactMark, metasByMinTime []*metadata.Meta) ([]*metadata.Meta, error) {
	notExcludedMetasByMinTime := make([]*metadata.Meta, 0, len(metasByMinTime))
	for _, meta := range metasByMinTime {
		if _, excluded := noCompactMarked[meta.ULID]; excluded {
			continue
		}
		notExcludedMetasByMinTime = append(notExcludedMetasByMinTime, meta)
	}

	res := selectOverlappingMetas(notExcludedMetasByMinTime)
	if len(res) > 0 {
		return res, nil
	}
	// No overlapping blocks, do compaction the usual way.

	// We do not include the most recently produced block, i.e. the one with max(minTime), which was just uploaded to the bucket.
	// This gives users a window of one full block size for maintenance if needed.
	if _, excluded := noCompactMarked[metasByMinTime[len(metasByMinTime)-1].ULID]; !excluded {
		notExcludedMetasByMinTime = notExcludedMetasByMinTime[:len(notExcludedMetasByMinTime)-1]
	}
	metasByMinTime = metasByMinTime[:len(metasByMinTime)-1]
	res = append(res, selectMetas(p.ranges, noCompactMarked, metasByMinTime)...)
	if len(res) > 0 {
		return res, nil
	}

	// Compact any blocks with a big enough time range that have >5% tombstones.
	for i := len(notExcludedMetasByMinTime) - 1; i >= 0; i-- {
		meta := notExcludedMetasByMinTime[i]
		if meta.MaxTime-meta.MinTime < p.ranges[len(p.ranges)/2] {
			break
		}
		if float64(meta.Stats.NumTombstones)/float64(meta.Stats.NumSeries+1) > 0.05 {
			return []*metadata.Meta{notExcludedMetasByMinTime[i]}, nil
		}
	}

	return nil, nil
}
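// The two functions below are illustrative sketches added during editing and are NOT part of the
// upstream planner.go. They show how a caller might exercise the TSDB-based planner; the function
// names, the example ranges and the block times are made up for the example.

// newExampleMeta builds a minimal block meta with only MinTime/MaxTime set.
// Hypothetical helper, reused by the other sketches further down in this file.
func newExampleMeta(mint, maxt int64) *metadata.Meta {
	var m metadata.Meta
	m.MinTime = mint
	m.MaxTime = maxt
	return &m
}

// examplePlanUsage wires up the planner with three compaction ranges and asks it for a group.
// With these inputs the three contiguous 20-unit blocks fill a full 60-unit range, so they are
// expected to come back as one compaction group, while the freshest block is left untouched.
func examplePlanUsage(ctx context.Context) ([]*metadata.Meta, error) {
	planner := NewTSDBBasedPlanner(log.NewNopLogger(), []int64{20, 60, 180})

	// Metas must be sorted by MinTime; the last block is the most recently produced one.
	metas := []*metadata.Meta{
		newExampleMeta(0, 20),
		newExampleMeta(20, 40),
		newExampleMeta(40, 60),
		newExampleMeta(60, 80),
	}
	return planner.Plan(ctx, metas, nil, nil)
}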
// selectMetas returns the dir metas that should be compacted into a single new block.
// If only a single block range is configured, the result is always nil.
// Copied and adjusted from https://github.com/prometheus/prometheus/blob/3d8826a3d42566684283a9b7f7e812e412c24407/tsdb/compact.go#L229.
func selectMetas(ranges []int64, noCompactMarked map[ulid.ULID]*metadata.NoCompactMark, metasByMinTime []*metadata.Meta) []*metadata.Meta {
	if len(ranges) < 2 || len(metasByMinTime) < 1 {
		return nil
	}
	highTime := metasByMinTime[len(metasByMinTime)-1].MinTime

	for _, iv := range ranges[1:] {
		parts := splitByRange(metasByMinTime, iv)
		if len(parts) == 0 {
			continue
		}
	Outer:
		for _, p := range parts {
			// Do not select the range if it has a block whose compaction failed.
			for _, m := range p {
				if m.Compaction.Failed {
					continue Outer
				}
			}

			if len(p) < 2 {
				continue
			}

			mint := p[0].MinTime
			maxt := p[len(p)-1].MaxTime

			// Pick the range of blocks if it spans the full range (potentially with gaps) or is before the most recent block.
			// This ensures we don't compact blocks prematurely when another one of the same size would still fit in the range
			// after upload.
			if maxt-mint != iv && maxt > highTime {
				continue
			}

			// Check if any of the resulting blocks are excluded. Exclude them in a way that does not introduce gaps to the system
			// and preserves the ranges that would be used if they were not excluded.
			// This is meant as a short-term workaround that allows marking some blocks as not to be touched by compaction.
			lastExcluded := 0
			for i, id := range p {
				if _, excluded := noCompactMarked[id.ULID]; !excluded {
					continue
				}
				if len(p[lastExcluded:i]) > 1 {
					return p[lastExcluded:i]
				}
				lastExcluded = i + 1
			}
			if len(p[lastExcluded:]) > 1 {
				return p[lastExcluded:]
			}
		}
	}

	return nil
}

// selectOverlappingMetas returns all dirs with overlapping time ranges.
// It expects the input to be sorted by mint and returns the overlapping dirs in the same order as received.
// Copied and adjusted from https://github.com/prometheus/prometheus/blob/3d8826a3d42566684283a9b7f7e812e412c24407/tsdb/compact.go#L268.
func selectOverlappingMetas(metasByMinTime []*metadata.Meta) []*metadata.Meta {
	if len(metasByMinTime) < 2 {
		return nil
	}
	var overlappingMetas []*metadata.Meta
	globalMaxt := metasByMinTime[0].MaxTime
	for i, m := range metasByMinTime[1:] {
		if m.MinTime < globalMaxt {
			if len(overlappingMetas) == 0 {
				// When it is the first overlap, we need to add the previous block as well.
				overlappingMetas = append(overlappingMetas, metasByMinTime[i])
			}
			overlappingMetas = append(overlappingMetas, m)
		} else if len(overlappingMetas) > 0 {
			break
		}

		if m.MaxTime > globalMaxt {
			globalMaxt = m.MaxTime
		}
	}
	return overlappingMetas
}

// splitByRange splits the directories by the time range. The range sequence starts at 0.
//
// For example, if we have blocks [0-10, 10-20, 50-60, 90-100] and the split range tr is 30,
// it returns [0-10, 10-20], [50-60], [90-100].
// Copied and adjusted from: https://github.com/prometheus/prometheus/blob/3d8826a3d42566684283a9b7f7e812e412c24407/tsdb/compact.go#L294.
func splitByRange(metasByMinTime []*metadata.Meta, tr int64) [][]*metadata.Meta {
	var splitDirs [][]*metadata.Meta

	for i := 0; i < len(metasByMinTime); {
		var (
			group []*metadata.Meta
			t0    int64
			m     = metasByMinTime[i]
		)
		// Compute start of aligned time range of size tr closest to the current block's start.
		if m.MinTime >= 0 {
			t0 = tr * (m.MinTime / tr)
		} else {
			t0 = tr * ((m.MinTime - tr + 1) / tr)
		}

		// Skip blocks that don't fall into the range. This can happen via mis-alignment or
		// by being a multiple of the intended range.
		if m.MaxTime > t0+tr {
			i++
			continue
		}

		// Add all metas to the current group that are within [t0, t0+tr].
		for ; i < len(metasByMinTime); i++ {
			// Either the block falls into the next range or doesn't fit at all (checked above).
			if metasByMinTime[i].MaxTime > t0+tr {
				break
			}
			group = append(group, metasByMinTime[i])
		}

		if len(group) > 0 {
			splitDirs = append(splitDirs, group)
		}
	}

	return splitDirs
}
type largeTotalIndexSizeFilter struct {
	*tsdbBasedPlanner

	bkt                    objstore.Bucket
	markedForNoCompact     prometheus.Counter
	totalMaxIndexSizeBytes int64
}

var _ Planner = &largeTotalIndexSizeFilter{}

// WithLargeTotalIndexSizeFilter wraps the Planner with a largeTotalIndexSizeFilter that checks the given plans and estimates the total index size.
// When the limit would be exceeded, it marks the block with the largest index for no compaction by placing no-compact-mark.json and updating the cache.
// NOTE: The estimation is very rough as it assumes the extreme case of indexes sharing no bytes, thus summing all source index sizes.
// Adjust the limit accordingly, reducing it to some percentage of the actual limit you want to enforce.
// TODO(bwplotka): This is a short-term fix for https://github.com/thanos-io/thanos/issues/1424; replace with vertical block sharding https://github.com/thanos-io/thanos/pull/3390.
func WithLargeTotalIndexSizeFilter(with *tsdbBasedPlanner, bkt objstore.Bucket, totalMaxIndexSizeBytes int64, markedForNoCompact prometheus.Counter) *largeTotalIndexSizeFilter {
	return &largeTotalIndexSizeFilter{tsdbBasedPlanner: with, bkt: bkt, totalMaxIndexSizeBytes: totalMaxIndexSizeBytes, markedForNoCompact: markedForNoCompact}
}

func (t *largeTotalIndexSizeFilter) Plan(ctx context.Context, metasByMinTime []*metadata.Meta, _ chan error, _ any) ([]*metadata.Meta, error) {
	noCompactMarked := t.noCompBlocksFunc()
	copiedNoCompactMarked := make(map[ulid.ULID]*metadata.NoCompactMark, len(noCompactMarked))
	for k, v := range noCompactMarked {
		copiedNoCompactMarked[k] = v
	}

PlanLoop:
	for {
		plan, err := t.plan(copiedNoCompactMarked, metasByMinTime)
		if err != nil {
			return nil, err
		}
		var totalIndexBytes, maxIndexSize int64 = 0, math.MinInt64
		var biggestIndex int
		for i, p := range plan {
			indexSize := int64(-1)
			for _, f := range p.Thanos.Files {
				if f.RelPath == block.IndexFilename {
					indexSize = f.SizeBytes
				}
			}
			if indexSize <= 0 {
				// Get the size from the bucket instead.
				attr, err := t.bkt.Attributes(ctx, filepath.Join(p.ULID.String(), block.IndexFilename))
				if err != nil {
					return nil, errors.Wrapf(err, "get attr of %v", filepath.Join(p.ULID.String(), block.IndexFilename))
				}
				indexSize = attr.Size
			}

			if maxIndexSize < indexSize {
				maxIndexSize = indexSize
				biggestIndex = i
			}
			totalIndexBytes += indexSize
			// Leave 15% headroom for index compaction bloat.
			if totalIndexBytes >= int64(float64(t.totalMaxIndexSizeBytes)*0.85) {
				// Mark the block with the biggest index for no compaction to limit the total size.
				// TODO(bwplotka): Make sure to reset cache once this is done: https://github.com/thanos-io/thanos/issues/3408
				if err := block.MarkForNoCompact(
					ctx,
					t.logger,
					t.bkt,
					plan[biggestIndex].ULID,
					metadata.IndexSizeExceedingNoCompactReason,
					fmt.Sprintf("largeTotalIndexSizeFilter: Total compacted block's index size could exceed: %v with this block. See https://github.com/thanos-io/thanos/issues/1424", t.totalMaxIndexSizeBytes),
					t.markedForNoCompact,
				); err != nil {
					return nil, errors.Wrapf(err, "mark %v for no compaction", plan[biggestIndex].ULID.String())
				}
				// Make sure the wrapped planner excludes this block.
				copiedNoCompactMarked[plan[biggestIndex].ULID] = &metadata.NoCompactMark{ID: plan[biggestIndex].ULID, Version: metadata.NoCompactMarkVersion1}
				continue PlanLoop
			}
		}
		// Planned blocks do not exceed the limit; return the plan.
		return plan, nil
	}
}
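// The sketches below are illustrative additions made during editing and are NOT part of the
// upstream planner.go. They reuse the hypothetical newExampleMeta helper defined earlier; block
// times, ranges, the size limit and the metric name are made-up example values.

// exampleSelectOverlapping shows the overlap detection used for vertical compaction: [10-30] and
// [20-40] overlap, so both are expected to be returned in input order; [40-50] is not part of the overlap.
func exampleSelectOverlapping() []*metadata.Meta {
	return selectOverlappingMetas([]*metadata.Meta{
		newExampleMeta(10, 30),
		newExampleMeta(20, 40), // Starts before the previous block ends.
		newExampleMeta(40, 50),
	})
}

// exampleSplitByRange reproduces the case from the splitByRange doc comment: with tr=30 the
// 30-aligned windows yield three groups, {0-10, 10-20}, {50-60} and {90-100}.
func exampleSplitByRange() [][]*metadata.Meta {
	return splitByRange([]*metadata.Meta{
		newExampleMeta(0, 10),
		newExampleMeta(10, 20),
		newExampleMeta(50, 60),
		newExampleMeta(90, 100),
	}, 30)
}

// exampleWithIndexSizeFilter sketches how the base planner might be wrapped so that plans whose
// summed index size could exceed roughly 64 GiB get their largest block marked no-compact first.
// The bucket, registry and no-compact filter are assumed to be provided by the caller.
func exampleWithIndexSizeFilter(bkt objstore.Bucket, reg prometheus.Registerer, noCompactFilter *GatherNoCompactionMarkFilter) Planner {
	base := NewPlanner(log.NewNopLogger(), []int64{20, 60, 180}, noCompactFilter)
	markedForNoCompact := prometheus.NewCounter(prometheus.CounterOpts{
		Name: "thanos_example_blocks_marked_for_no_compaction_total",
		Help: "Example metric counting blocks marked for no compaction by the index size filter.",
	})
	if reg != nil {
		reg.MustRegister(markedForNoCompact)
	}
	return WithLargeTotalIndexSizeFilter(base, bkt, 64<<30, markedForNoCompact)
}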