github.com/thanos-io/thanos@v0.32.5/pkg/compact/planner.go

// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package compact

import (
	"context"
	"fmt"
	"math"
	"path/filepath"

	"github.com/go-kit/log"
	"github.com/oklog/ulid"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/thanos-io/objstore"

	"github.com/thanos-io/thanos/pkg/block"
	"github.com/thanos-io/thanos/pkg/block/metadata"
)

type tsdbBasedPlanner struct {
	logger log.Logger

	ranges []int64

	noCompBlocksFunc func() map[ulid.ULID]*metadata.NoCompactMark
}

var _ Planner = &tsdbBasedPlanner{}

// NewTSDBBasedPlanner is a planner with the same functionality as Prometheus' TSDB planner,
// just without accessing the filesystem.
// TODO(bwplotka): Consider upstreaming this to Prometheus.
func NewTSDBBasedPlanner(logger log.Logger, ranges []int64) *tsdbBasedPlanner {
	return &tsdbBasedPlanner{
		logger: logger,
		ranges: ranges,
		noCompBlocksFunc: func() map[ulid.ULID]*metadata.NoCompactMark {
			return make(map[ulid.ULID]*metadata.NoCompactMark)
		},
	}
}

// NewPlanner returns the default Thanos planner: the same functionality as Prometheus' TSDB planner,
// just without accessing the filesystem, plus special handling of excluded (no-compact marked) blocks.
func NewPlanner(logger log.Logger, ranges []int64, noCompBlocks *GatherNoCompactionMarkFilter) *tsdbBasedPlanner {
	return &tsdbBasedPlanner{logger: logger, ranges: ranges, noCompBlocksFunc: noCompBlocks.NoCompactMarkedBlocks}
}

// TODO(bwplotka): Consider a smarter algorithm; this prefers smaller iterative compactions vs a big single one: https://github.com/thanos-io/thanos/issues/3405
func (p *tsdbBasedPlanner) Plan(_ context.Context, metasByMinTime []*metadata.Meta, _ chan error, _ any) ([]*metadata.Meta, error) {
	return p.plan(p.noCompBlocksFunc(), metasByMinTime)
}

func (p *tsdbBasedPlanner) plan(noCompactMarked map[ulid.ULID]*metadata.NoCompactMark, metasByMinTime []*metadata.Meta) ([]*metadata.Meta, error) {
	notExcludedMetasByMinTime := make([]*metadata.Meta, 0, len(metasByMinTime))
	for _, meta := range metasByMinTime {
		if _, excluded := noCompactMarked[meta.ULID]; excluded {
			continue
		}
		notExcludedMetasByMinTime = append(notExcludedMetasByMinTime, meta)
	}

	res := selectOverlappingMetas(notExcludedMetasByMinTime)
	if len(res) > 0 {
		return res, nil
	}
	// No overlapping blocks, do compaction the usual way.

	// We do not include the most recently produced block, i.e. the one with max(minTime), which was just uploaded to the bucket.
	// This gives users a window of one full block size for maintenance if needed.
	if _, excluded := noCompactMarked[metasByMinTime[len(metasByMinTime)-1].ULID]; !excluded {
		notExcludedMetasByMinTime = notExcludedMetasByMinTime[:len(notExcludedMetasByMinTime)-1]
	}
	metasByMinTime = metasByMinTime[:len(metasByMinTime)-1]
	res = append(res, selectMetas(p.ranges, noCompactMarked, metasByMinTime)...)
	if len(res) > 0 {
		return res, nil
	}

	// Compact any blocks with a big enough time range that have >5% tombstones.
	for i := len(notExcludedMetasByMinTime) - 1; i >= 0; i-- {
		meta := notExcludedMetasByMinTime[i]
		if meta.MaxTime-meta.MinTime < p.ranges[len(p.ranges)/2] {
			break
		}
		if float64(meta.Stats.NumTombstones)/float64(meta.Stats.NumSeries+1) > 0.05 {
			return []*metadata.Meta{notExcludedMetasByMinTime[i]}, nil
		}
	}

	return nil, nil
}
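// The two functions below are illustrative sketches added during editing and are NOT part of the
// upstream planner.go. They show how a caller might exercise the TSDB-based planner; the function
// names, the example ranges and the block times are made up for the example.

// newExampleMeta builds a minimal block meta with only MinTime/MaxTime set.
// Hypothetical helper, reused by the other sketches further down in this file.
func newExampleMeta(mint, maxt int64) *metadata.Meta {
	var m metadata.Meta
	m.MinTime = mint
	m.MaxTime = maxt
	return &m
}

// examplePlanUsage wires up the planner with three compaction ranges and asks it for a group.
// With these inputs the three contiguous 20-unit blocks fill a full 60-unit range, so they are
// expected to come back as one compaction group, while the freshest block is left untouched.
func examplePlanUsage(ctx context.Context) ([]*metadata.Meta, error) {
	planner := NewTSDBBasedPlanner(log.NewNopLogger(), []int64{20, 60, 180})

	// Metas must be sorted by MinTime; the last block is the most recently produced one.
	metas := []*metadata.Meta{
		newExampleMeta(0, 20),
		newExampleMeta(20, 40),
		newExampleMeta(40, 60),
		newExampleMeta(60, 80),
	}
	return planner.Plan(ctx, metas, nil, nil)
}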
// selectMetas returns the dir metas that should be compacted into a single new block.
// If only a single block range is configured, the result is always nil.
// Copied and adjusted from https://github.com/prometheus/prometheus/blob/3d8826a3d42566684283a9b7f7e812e412c24407/tsdb/compact.go#L229.
func selectMetas(ranges []int64, noCompactMarked map[ulid.ULID]*metadata.NoCompactMark, metasByMinTime []*metadata.Meta) []*metadata.Meta {
	if len(ranges) < 2 || len(metasByMinTime) < 1 {
		return nil
	}
	highTime := metasByMinTime[len(metasByMinTime)-1].MinTime

	for _, iv := range ranges[1:] {
		parts := splitByRange(metasByMinTime, iv)
		if len(parts) == 0 {
			continue
		}
	Outer:
		for _, p := range parts {
			// Do not select the range if it has a block whose compaction failed.
			for _, m := range p {
				if m.Compaction.Failed {
					continue Outer
				}
			}

			if len(p) < 2 {
				continue
			}

			mint := p[0].MinTime
			maxt := p[len(p)-1].MaxTime

			// Pick the range of blocks if it spans the full range (potentially with gaps) or is before the most recent block.
			// This ensures we don't compact blocks prematurely when another one of the same size would still fit in the range
			// after upload.
			if maxt-mint != iv && maxt > highTime {
				continue
			}

			// Check if any of the resulting blocks are excluded. Exclude them in a way that does not introduce gaps to the system
			// and preserves the ranges that would be used if they were not excluded.
			// This is meant as a short-term workaround that allows marking some blocks as not to be touched by compaction.
			lastExcluded := 0
			for i, id := range p {
				if _, excluded := noCompactMarked[id.ULID]; !excluded {
					continue
				}
				if len(p[lastExcluded:i]) > 1 {
					return p[lastExcluded:i]
				}
				lastExcluded = i + 1
			}
			if len(p[lastExcluded:]) > 1 {
				return p[lastExcluded:]
			}
		}
	}

	return nil
}

// selectOverlappingMetas returns all dirs with overlapping time ranges.
// It expects the input to be sorted by mint and returns the overlapping dirs in the same order as received.
// Copied and adjusted from https://github.com/prometheus/prometheus/blob/3d8826a3d42566684283a9b7f7e812e412c24407/tsdb/compact.go#L268.
func selectOverlappingMetas(metasByMinTime []*metadata.Meta) []*metadata.Meta {
	if len(metasByMinTime) < 2 {
		return nil
	}
	var overlappingMetas []*metadata.Meta
	globalMaxt := metasByMinTime[0].MaxTime
	for i, m := range metasByMinTime[1:] {
		if m.MinTime < globalMaxt {
			if len(overlappingMetas) == 0 {
				// When it is the first overlap, we need to add the previous block as well.
				overlappingMetas = append(overlappingMetas, metasByMinTime[i])
			}
			overlappingMetas = append(overlappingMetas, m)
		} else if len(overlappingMetas) > 0 {
			break
		}

		if m.MaxTime > globalMaxt {
			globalMaxt = m.MaxTime
		}
	}
	return overlappingMetas
}

// splitByRange splits the directories by the time range. The range sequence starts at 0.
//
// For example, if we have blocks [0-10, 10-20, 50-60, 90-100] and the split range tr is 30,
// it returns [0-10, 10-20], [50-60], [90-100].
// Copied and adjusted from: https://github.com/prometheus/prometheus/blob/3d8826a3d42566684283a9b7f7e812e412c24407/tsdb/compact.go#L294.
func splitByRange(metasByMinTime []*metadata.Meta, tr int64) [][]*metadata.Meta {
	var splitDirs [][]*metadata.Meta

	for i := 0; i < len(metasByMinTime); {
		var (
			group []*metadata.Meta
			t0    int64
			m     = metasByMinTime[i]
		)
		// Compute start of aligned time range of size tr closest to the current block's start.
		if m.MinTime >= 0 {
			t0 = tr * (m.MinTime / tr)
		} else {
			t0 = tr * ((m.MinTime - tr + 1) / tr)
		}

		// Skip blocks that don't fall into the range. This can happen via mis-alignment or
		// by being a multiple of the intended range.
		if m.MaxTime > t0+tr {
			i++
			continue
		}

		// Add all metas to the current group that are within [t0, t0+tr].
		for ; i < len(metasByMinTime); i++ {
			// Either the block falls into the next range or doesn't fit at all (checked above).
			if metasByMinTime[i].MaxTime > t0+tr {
				break
			}
			group = append(group, metasByMinTime[i])
		}

		if len(group) > 0 {
			splitDirs = append(splitDirs, group)
		}
	}

	return splitDirs
}
type largeTotalIndexSizeFilter struct {
	*tsdbBasedPlanner

	bkt                    objstore.Bucket
	markedForNoCompact     prometheus.Counter
	totalMaxIndexSizeBytes int64
}

var _ Planner = &largeTotalIndexSizeFilter{}

// WithLargeTotalIndexSizeFilter wraps the Planner with a largeTotalIndexSizeFilter that checks the given plans and estimates the total index size.
// When the limit would be exceeded, it marks the block with the largest index for no compaction by placing no-compact-mark.json and updating the cache.
// NOTE: The estimation is very rough as it assumes the extreme case of indexes sharing no bytes, thus summing all source index sizes.
// Adjust the limit accordingly, reducing it to some percentage of the actual limit you want to enforce.
// TODO(bwplotka): This is a short-term fix for https://github.com/thanos-io/thanos/issues/1424; replace with vertical block sharding https://github.com/thanos-io/thanos/pull/3390.
func WithLargeTotalIndexSizeFilter(with *tsdbBasedPlanner, bkt objstore.Bucket, totalMaxIndexSizeBytes int64, markedForNoCompact prometheus.Counter) *largeTotalIndexSizeFilter {
	return &largeTotalIndexSizeFilter{tsdbBasedPlanner: with, bkt: bkt, totalMaxIndexSizeBytes: totalMaxIndexSizeBytes, markedForNoCompact: markedForNoCompact}
}

func (t *largeTotalIndexSizeFilter) Plan(ctx context.Context, metasByMinTime []*metadata.Meta, _ chan error, _ any) ([]*metadata.Meta, error) {
	noCompactMarked := t.noCompBlocksFunc()
	copiedNoCompactMarked := make(map[ulid.ULID]*metadata.NoCompactMark, len(noCompactMarked))
	for k, v := range noCompactMarked {
		copiedNoCompactMarked[k] = v
	}

PlanLoop:
	for {
		plan, err := t.plan(copiedNoCompactMarked, metasByMinTime)
		if err != nil {
			return nil, err
		}
		var totalIndexBytes, maxIndexSize int64 = 0, math.MinInt64
		var biggestIndex int
		for i, p := range plan {
			indexSize := int64(-1)
			for _, f := range p.Thanos.Files {
				if f.RelPath == block.IndexFilename {
					indexSize = f.SizeBytes
				}
			}
			if indexSize <= 0 {
				// Get the size from the bucket instead.
				attr, err := t.bkt.Attributes(ctx, filepath.Join(p.ULID.String(), block.IndexFilename))
				if err != nil {
					return nil, errors.Wrapf(err, "get attr of %v", filepath.Join(p.ULID.String(), block.IndexFilename))
				}
				indexSize = attr.Size
			}

			if maxIndexSize < indexSize {
				maxIndexSize = indexSize
				biggestIndex = i
			}
			totalIndexBytes += indexSize
			// Leave 15% headroom for index compaction bloat.
			if totalIndexBytes >= int64(float64(t.totalMaxIndexSizeBytes)*0.85) {
				// Mark the block with the biggest index for no compaction to limit the total size.
				// TODO(bwplotka): Make sure to reset cache once this is done: https://github.com/thanos-io/thanos/issues/3408
				if err := block.MarkForNoCompact(
					ctx,
					t.logger,
					t.bkt,
					plan[biggestIndex].ULID,
					metadata.IndexSizeExceedingNoCompactReason,
					fmt.Sprintf("largeTotalIndexSizeFilter: Total compacted block's index size could exceed: %v with this block. See https://github.com/thanos-io/thanos/issues/1424", t.totalMaxIndexSizeBytes),
					t.markedForNoCompact,
				); err != nil {
					return nil, errors.Wrapf(err, "mark %v for no compaction", plan[biggestIndex].ULID.String())
				}
				// Make sure the wrapped planner excludes this block.
				copiedNoCompactMarked[plan[biggestIndex].ULID] = &metadata.NoCompactMark{ID: plan[biggestIndex].ULID, Version: metadata.NoCompactMarkVersion1}
				continue PlanLoop
			}
		}
		// Planned blocks do not exceed the limit; return the plan.
		return plan, nil
	}
}
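// The sketches below are illustrative additions made during editing and are NOT part of the
// upstream planner.go. They reuse the hypothetical newExampleMeta helper defined earlier; block
// times, ranges, the size limit and the metric name are made-up example values.

// exampleSelectOverlapping shows the overlap detection used for vertical compaction: [10-30] and
// [20-40] overlap, so both are expected to be returned in input order; [40-50] is not part of the overlap.
func exampleSelectOverlapping() []*metadata.Meta {
	return selectOverlappingMetas([]*metadata.Meta{
		newExampleMeta(10, 30),
		newExampleMeta(20, 40), // Starts before the previous block ends.
		newExampleMeta(40, 50),
	})
}

// exampleSplitByRange reproduces the case from the splitByRange doc comment: with tr=30 the
// 30-aligned windows yield three groups, {0-10, 10-20}, {50-60} and {90-100}.
func exampleSplitByRange() [][]*metadata.Meta {
	return splitByRange([]*metadata.Meta{
		newExampleMeta(0, 10),
		newExampleMeta(10, 20),
		newExampleMeta(50, 60),
		newExampleMeta(90, 100),
	}, 30)
}

// exampleWithIndexSizeFilter sketches how the base planner might be wrapped so that plans whose
// summed index size could exceed roughly 64 GiB get their largest block marked no-compact first.
// The bucket, registry and no-compact filter are assumed to be provided by the caller.
func exampleWithIndexSizeFilter(bkt objstore.Bucket, reg prometheus.Registerer, noCompactFilter *GatherNoCompactionMarkFilter) Planner {
	base := NewPlanner(log.NewNopLogger(), []int64{20, 60, 180}, noCompactFilter)
	markedForNoCompact := prometheus.NewCounter(prometheus.CounterOpts{
		Name: "thanos_example_blocks_marked_for_no_compaction_total",
		Help: "Example metric counting blocks marked for no compaction by the index size filter.",
	})
	if reg != nil {
		reg.MustRegister(markedForNoCompact)
	}
	return WithLargeTotalIndexSizeFilter(base, bkt, 64<<30, markedForNoCompact)
}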