package compactor

import (
	"container/heap"
	"slices"
	"sync"
	"sync/atomic"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/pyroscope/pkg/metastore/compaction"
	"github.com/grafana/pyroscope/pkg/util"
)

// defaultBlockBatchSize is the initial capacity of a staging batch.
const defaultBlockBatchSize = 20

type compactionKey struct {
	// Order of the fields is not important.
	// Can be generalized.
	tenant string
	shard  uint32
	level  uint32
}

// compactionQueue maintains one blockQueue per compaction level.
// Level queues are created lazily on first push (see blockQueue method).
type compactionQueue struct {
	config         Config
	registerer     prometheus.Registerer
	levels         []*blockQueue
	globalStats    *globalQueueStats
	statsCollector *globalQueueStatsCollector
}

// blockQueue stages blocks as they are being added. Once a batch of blocks
// within the compaction key reaches a certain size or age, it is pushed to
// the linked list in the arrival order and to the compaction key queue.
//
// This allows to iterate over the blocks in the order of arrival within the
// compaction dimension, while maintaining an ability to remove blocks from the
// queue efficiently.
//
// No pop operation is needed for the block queue: the only way blocks leave
// the queue is through explicit removal. Batch and block iterators provide
// the read access.
type blockQueue struct {
	config      Config
	registerer  prometheus.Registerer
	staged      map[compactionKey]*stagedBlocks
	globalStats *globalQueueStats
	// Batches ordered by arrival.
	head, tail *batch
	// Priority queue by last update: we need to flush
	// incomplete batches once they stop updating.
	updates *priorityBlockQueue
}

// stagedBlocks is a queue of blocks sharing the same compaction key.
type stagedBlocks struct {
	key compactionKey
	// Local queue (blocks sharing this compaction key).
	head, tail *batch
	// Parent block queue (global).
	queue *blockQueue
	// Incomplete batch of blocks.
	batch *batch
	// Map of block IDs to their locations in batches.
	refs      map[string]blockRef
	stats     *queueStats
	collector *queueStatsCollector
	// Parent block queue maintains a priority queue of
	// incomplete batches by the last update time.
	heapIndex int
	updatedAt int64
}

// queueStats holds per-compaction-key counters, exported through
// queueStatsCollector when a registerer is configured.
type queueStats struct {
	blocks   atomic.Int32 // Blocks currently staged or queued.
	batches  atomic.Int32 // Flushed batches currently in the queue.
	rejected atomic.Int32 // Pushes rejected as duplicate block IDs.
	missed   atomic.Int32 // Deletions of blocks not present in the queue.
}

// blockRef points to the block in the batch.
type blockRef struct {
	batch *batch
	index int
}

type blockEntry struct {
	id    string // Block ID.
	index uint64 // Index of the command in the raft log.
}

type batch struct {
	flush  sync.Once
	size   uint32
	blocks []blockEntry
	// Reference to the parent.
	staged *stagedBlocks
	// Links to the global batch queue items:
	// the compaction key of batches may differ.
	nextG, prevG *batch
	// Links to the local batch queue items:
	// batches that share the same compaction key.
	next, prev *batch
	createdAt  int64
}

func newCompactionQueue(config Config, registerer prometheus.Registerer) *compactionQueue {
	globalStats := newGlobalQueueStats(len(config.Levels))
	q := &compactionQueue{
		config:      config,
		registerer:  registerer,
		globalStats: globalStats,
	}
	if registerer != nil {
		q.statsCollector = newGlobalQueueStatsCollector(q)
		util.RegisterOrGet(registerer, q.statsCollector)
	}
	return q
}

// reset removes all staged queues from every level (unregistering their
// stats collectors) and truncates the level slice, keeping its capacity.
func (q *compactionQueue) reset() {
	for _, level := range q.levels {
		if level != nil {
			for _, s := range level.staged {
				level.removeStaged(s)
			}
		}
	}
	clear(q.levels)
	q.levels = q.levels[:0]
}

// push stages the block entry in the queue of its compaction key and
// triggers a time-based flush of the oldest staging batch at this level.
// It returns false if a block with the same ID is already present.
func (q *compactionQueue) push(e compaction.BlockEntry) bool {
	level := q.blockQueue(e.Level)
	staged := level.stagedBlocks(compactionKey{
		tenant: e.Tenant,
		shard:  e.Shard,
		level:  e.Level,
	})
	staged.updatedAt = e.AppendedAt
	pushed := staged.push(blockEntry{
		id:    e.ID,
		index: e.Index,
	})
	// The staged queue priority may have changed: restore heap order.
	heap.Fix(level.updates, staged.heapIndex)
	level.flushOldest(e.AppendedAt)
	return pushed
}

// blockQueue returns the queue of the given level, growing the
// level slice and creating the queue lazily as needed.
func (q *compactionQueue) blockQueue(l uint32) *blockQueue {
	s := l + 1 // Levels are 0-based.
	if s > uint32(len(q.levels)) {
		q.levels = slices.Grow(q.levels, int(s))[:s]
	}
	level := q.levels[l]
	if level == nil {
		level = newBlockQueue(q.config, q.registerer, q.globalStats)
		q.levels[l] = level
	}
	return level
}

func newBlockQueue(config Config, registerer prometheus.Registerer, globalStats *globalQueueStats) *blockQueue {
	return &blockQueue{
		config:      config,
		registerer:  registerer,
		staged:      make(map[compactionKey]*stagedBlocks),
		globalStats: globalStats,
		updates:     new(priorityBlockQueue),
	}
}

// stagedBlocks returns the staging queue of the compaction key,
// creating it, adding it to the update heap, and registering its
// stats collector on first use.
func (q *blockQueue) stagedBlocks(k compactionKey) *stagedBlocks {
	staged, ok := q.staged[k]
	if !ok {
		staged = &stagedBlocks{
			queue: q,
			key:   k,
			refs:  make(map[string]blockRef),
			stats: new(queueStats),
		}
		staged.resetBatch()
		q.staged[k] = staged
		heap.Push(q.updates, staged)
		q.globalStats.AddQueues(k, 1)
		if q.registerer != nil {
			staged.collector = newQueueStatsCollector(staged)
			util.RegisterOrGet(q.registerer, staged.collector)
		}
	}
	return staged
}

// removeStaged deletes the staging queue and removes it from the
// update heap. The queue must still be in the heap (valid heapIndex).
func (q *blockQueue) removeStaged(s *stagedBlocks) {
	if s.collector != nil {
		q.registerer.Unregister(s.collector)
	}
	delete(q.staged, s.key)
	if s.heapIndex < 0 || s.heapIndex >= q.updates.Len() {
		panic("bug: attempt to delete compaction queue with an invalid priority index")
	}
	heap.Remove(q.updates, s.heapIndex)
	q.globalStats.AddQueues(s.key, -1)
}

// push adds the block to the staging batch, flushing the batch to the
// queue once it exceeds the configured size or age. It returns false
// if a block with the same ID is already present.
func (s *stagedBlocks) push(block blockEntry) bool {
	if _, found := s.refs[block.id]; found {
		s.stats.rejected.Add(1)
		return false
	}
	s.refs[block.id] = blockRef{batch: s.batch, index: len(s.batch.blocks)}
	s.batch.blocks = append(s.batch.blocks, block)
	if s.batch.size == 0 {
		// First live block in the batch defines its creation time.
		s.batch.createdAt = s.updatedAt
	}
	s.batch.size++
	s.stats.blocks.Add(1)
	s.queue.globalStats.AddBlocks(s.key, 1)
	if s.queue.config.exceedsMaxSize(s.batch) ||
		s.queue.config.exceedsMaxAge(s.batch, s.updatedAt) {
		s.flush()
	}
	return true
}

// flush moves the staging batch to the parent queue and starts a new
// staging batch. A batch must never be flushed twice: the sync.Once
// guard turns a repeated flush into a panic.
func (s *stagedBlocks) flush() {
	var flushed bool
	s.batch.flush.Do(func() {
		if !s.queue.pushBatch(s.batch) {
			panic("bug: attempt to detach the compaction queue head")
		}
		flushed = true
	})
	if !flushed {
		panic("bug: attempt to flush a compaction queue batch twice")
	}
	s.resetBatch()
}

func (s *stagedBlocks) resetBatch() {
	s.batch = &batch{
		blocks: make([]blockEntry, 0, defaultBlockBatchSize),
		staged: s,
	}
}

var zeroBlockEntry blockEntry

// delete removes the block from the queue, zeroing its slot in the
// owning batch, and returns the removed entry. It returns
// zeroBlockEntry if the block is not found.
func (s *stagedBlocks) delete(block string) blockEntry {
	ref, found := s.refs[block]
	if !found {
		s.stats.missed.Add(1)
		return zeroBlockEntry
	}
	// We can't change the order of the blocks in the batch,
	// because that would require updating all the block locations.
	e := ref.batch.blocks[ref.index]
	ref.batch.blocks[ref.index] = zeroBlockEntry
	ref.batch.size--
	s.stats.blocks.Add(-1)
	s.queue.globalStats.AddBlocks(s.key, -1)
	if ref.batch.size == 0 {
		if ref.batch != s.batch {
			// We should never ever try to delete the staging batch from the
			// queue: it has not been flushed and added to the queue yet.
			//
			// NOTE(kolesnikovae):
			// It caused a problem because removeBatch mistakenly interpreted
			// the batch as a head (s.batch.prev == nil), and detached it,
			// replacing a valid head with s.batch.next, which is always nil
			// at this point; it made the queue look empty for the reader,
			// because the queue is read from the head.
			//
			// The only way we may end up here if blocks are removed from the
			// staging batch. Typically, blocks are not supposed to be removed
			// from there before they left the queue (i.e., flushed to the
			// global queue).
			//
			// In practice, the compactor is distributed and has multiple
			// replicas: the leader instance could have already decided to
			// flush the blocks, and now they should be removed from all
			// instances. Due to a bug in time-based flushing (when it stops
			// working), it was possible that after the leader restarts and
			// recovers time-based flushing locally, it would desire to flush
			// the oldest batch. Consequently, the follower instances, where
			// the batch is still in staging, would need to do the same.
			if !s.queue.removeBatch(ref.batch) {
				panic("bug: attempt to remove a batch that is not in the compaction queue")
			}
		}
	}
	delete(s.refs, block)
	if len(s.refs) == 0 {
		// This is the last block with the given compaction key, so we want to
		// remove the staging structure. It's fine to delete it from the queue
		// at any point: we guarantee that it does not reference any blocks in
		// the queue, and we do not need to flush it anymore.
		s.queue.removeStaged(s)
	}
	return e
}

// pushBatch appends the batch to the tail of both the global queue and
// the local (per compaction key) queue. It returns false if either list
// is in an inconsistent head/tail state.
func (q *blockQueue) pushBatch(b *batch) bool {
	if q.tail != nil {
		q.tail.nextG = b
		b.prevG = q.tail
	} else if q.head == nil {
		q.head = b
	} else {
		return false
	}
	q.tail = b

	// Same for the queue of batches
	// with matching compaction key.

	if b.staged.tail != nil {
		b.staged.tail.next = b
		b.prev = b.staged.tail
	} else if b.staged.head == nil {
		b.staged.head = b
	} else {
		return false
	}
	b.staged.tail = b

	b.staged.stats.batches.Add(1)
	q.globalStats.AddBatches(b.staged.key, 1)
	return true
}

// removeBatch unlinks the batch from both the global queue and the
// local (per compaction key) queue, updating head/tail pointers as
// needed. It returns false if the batch is not linked into the queue.
func (q *blockQueue) removeBatch(b *batch) bool {
	if b.prevG != nil {
		b.prevG.nextG = b.nextG
	} else if b == q.head {
		// This is the head.
		q.head = q.head.nextG
	} else {
		return false
	}
	if b.nextG != nil {
		b.nextG.prevG = b.prevG
	} else if b == q.tail {
		// This is the tail.
		q.tail = q.tail.prevG
	} else {
		return false
	}
	b.nextG = nil
	b.prevG = nil

	// Same for the queue of batches
	// with matching compaction key.

	if b.prev != nil {
		b.prev.next = b.next
	} else if b == b.staged.head {
		// This is the head.
		b.staged.head = b.staged.head.next
	} else {
		return false
	}
	if b.next != nil {
		b.next.prev = b.prev
	} else if b == b.staged.tail {
		// This is the tail.
		b.staged.tail = b.staged.tail.prev
	} else {
		return false
	}
	b.next = nil
	b.prev = nil

	b.staged.stats.batches.Add(-1)
	q.globalStats.AddBatches(b.staged.key, -1)
	return true
}

// flushOldest flushes the least-recently-updated staging batch if it
// exceeds the configured max age, and refreshes its heap position.
func (q *blockQueue) flushOldest(now int64) {
	if q.updates.Len() == 0 {
		panic("bug: compaction queue has empty priority queue")
	}
	// Peek the oldest staging batch in the priority queue (min-heap).
	oldest := (*q.updates)[0]
	if !q.config.exceedsMaxAge(oldest.batch, now) {
		return
	}
	// It's possible that the staging batch is empty: it's only removed
	// from the queue when the last block with the given compaction key is
	// removed, including ones flushed to the global queue. Therefore, we
	// should not pop it from the queue, but update its index in the heap.
	// Otherwise, if the staging batch has not been removed from the queue
	// yet i.e., references some blocks in the compaction queue (it's rare
	// but not impossible), time-based flush will stop working for it.
	if oldest.batch.size > 0 {
		oldest.flush()
	}
	oldest.updatedAt = now
	heap.Fix(q.updates, oldest.heapIndex)
}

// priorityBlockQueue is a min-heap of staging queues ordered by their
// last update time; it implements container/heap.Interface.
type priorityBlockQueue []*stagedBlocks

func (pq priorityBlockQueue) Len() int { return len(pq) }

func (pq priorityBlockQueue) Less(i, j int) bool {
	return pq[i].updatedAt < pq[j].updatedAt
}

func (pq priorityBlockQueue) Swap(i, j int) {
	pq[i], pq[j] = pq[j], pq[i]
	pq[i].heapIndex = i
	pq[j].heapIndex = j
}

func (pq *priorityBlockQueue) Push(x interface{}) {
	n := len(*pq)
	staged := x.(*stagedBlocks)
	staged.heapIndex = n
	*pq = append(*pq, staged)
}

func (pq *priorityBlockQueue) Pop() interface{} {
	old := *pq
	n := len(old)
	staged := old[n-1]
	old[n-1] = nil // Drop the reference for the GC.
	staged.heapIndex = -1
	*pq = old[0 : n-1]
	return staged
}

func newBatchIter(q *blockQueue) *batchIter { return &batchIter{batch: q.head} }

// batchIter iterates over the batches in the queue, in the order of arrival.
type batchIter struct{ batch *batch }

// next returns the current batch and advances the iterator along the
// global queue; it reports false once the queue is exhausted.
func (i *batchIter) next() (*batch, bool) {
	if i.batch == nil {
		return nil, false
	}
	b := i.batch
	i.batch = i.batch.nextG
	return b, b != nil
}

func (i *batchIter) reset(b *batch) { i.batch = b }

// blockIter iterates over the blocks in the queue, in the order of arrival
// within the compaction key. It's guaranteed that returned blocks are unique
// across all batches.
type blockIter struct {
	visited map[string]struct{}
	batch   *batch
	i       int
}

func newBlockIter() *blockIter {
	// Assuming that block IDs (16b ULID) are globally unique.
	// We could achieve the same with more efficiency by marking visited
	// batches. However, marking visited blocks seems to be more robust,
	// and the size of the map is expected to be small.
	visited := make(map[string]struct{}, 64)
	// Pre-mark the zero ID so deleted (zeroed) batch slots are skipped.
	visited[zeroBlockEntry.id] = struct{}{}
	return &blockIter{visited: visited}
}

func (it *blockIter) setBatch(b *batch) {
	it.batch = b
	it.i = 0
}

// more reports whether the current batch has entries left to examine.
func (it *blockIter) more() bool {
	if it.batch == nil {
		return false
	}
	return it.i < len(it.batch.blocks)
}

// peek returns the ID of the next unvisited block without consuming it,
// advancing through local batches as needed. It reports false when no
// blocks remain.
func (it *blockIter) peek() (string, bool) {
	for it.batch != nil {
		if it.i >= len(it.batch.blocks) {
			it.setBatch(it.batch.next)
			continue
		}
		entry := it.batch.blocks[it.i]
		if _, visited := it.visited[entry.id]; visited {
			it.i++
			continue
		}
		return entry.id, true
	}
	return "", false
}

// advance marks the current block as visited and moves past it.
// Must only be called after a successful peek.
func (it *blockIter) advance() {
	entry := it.batch.blocks[it.i]
	it.visited[entry.id] = struct{}{}
	it.i++
}