package compactor

import (
	"container/heap"
	"slices"
	"sync"
	"sync/atomic"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/pyroscope/pkg/metastore/compaction"
	"github.com/grafana/pyroscope/pkg/util"
)

// defaultBlockBatchSize is the initial capacity of a staging batch.
const defaultBlockBatchSize = 20

type compactionKey struct {
	// Order of the fields is not important.
	// Can be generalized.
	tenant string
	shard  uint32
	level  uint32
}

// compactionQueue maintains one blockQueue per compaction level.
// Level queues are created lazily on first push (see blockQueue method).
type compactionQueue struct {
	config         Config
	registerer     prometheus.Registerer
	levels         []*blockQueue
	globalStats    *globalQueueStats
	statsCollector *globalQueueStatsCollector
}

// blockQueue stages blocks as they are being added. Once a batch of blocks
// within the compaction key reaches a certain size or age, it is pushed to
// the linked list in the arrival order and to the compaction key queue.
//
// This allows to iterate over the blocks in the order of arrival within the
// compaction dimension, while maintaining an ability to remove blocks from the
// queue efficiently.
//
// No pop operation is needed for the block queue: the only way blocks leave
// the queue is through explicit removal. Batch and block iterators provide
// the read access.
type blockQueue struct {
	config      Config
	registerer  prometheus.Registerer
	staged      map[compactionKey]*stagedBlocks
	globalStats *globalQueueStats
	// Batches ordered by arrival.
	head, tail *batch
	// Priority queue by last update: we need to flush
	// incomplete batches once they stop updating.
	updates *priorityBlockQueue
}

// stagedBlocks is a queue of blocks sharing the same compaction key.
type stagedBlocks struct {
	key compactionKey
	// Local queue (blocks sharing this compaction key).
	head, tail *batch
	// Parent block queue (global).
	queue *blockQueue
	// Incomplete batch of blocks.
	batch *batch
	// Map of block IDs to their locations in batches.
	refs      map[string]blockRef
	stats     *queueStats
	collector *queueStatsCollector
	// Parent block queue maintains a priority queue of
	// incomplete batches by the last update time.
	heapIndex int
	updatedAt int64
}

// queueStats holds per-compaction-key counters, exported through
// queueStatsCollector when a registerer is configured.
type queueStats struct {
	blocks   atomic.Int32 // Blocks currently staged or queued.
	batches  atomic.Int32 // Flushed batches currently in the queue.
	rejected atomic.Int32 // Pushes rejected as duplicate block IDs.
	missed   atomic.Int32 // Deletions of blocks not present in the queue.
}

// blockRef points to the block in the batch.
type blockRef struct {
	batch *batch
	index int
}

type blockEntry struct {
	id    string // Block ID.
	index uint64 // Index of the command in the raft log.
}

type batch struct {
	flush  sync.Once
	size   uint32
	blocks []blockEntry
	// Reference to the parent.
	staged *stagedBlocks
	// Links to the global batch queue items:
	// the compaction key of batches may differ.
	nextG, prevG *batch
	// Links to the local batch queue items:
	// batches that share the same compaction key.
	next, prev *batch
	createdAt  int64
}

func newCompactionQueue(config Config, registerer prometheus.Registerer) *compactionQueue {
	globalStats := newGlobalQueueStats(len(config.Levels))
	q := &compactionQueue{
		config:      config,
		registerer:  registerer,
		globalStats: globalStats,
	}
	if registerer != nil {
		q.statsCollector = newGlobalQueueStatsCollector(q)
		util.RegisterOrGet(registerer, q.statsCollector)
	}
	return q
}

// reset removes all staged queues from every level (unregistering their
// stats collectors) and truncates the level slice, keeping its capacity.
func (q *compactionQueue) reset() {
	for _, level := range q.levels {
		if level != nil {
			for _, s := range level.staged {
				level.removeStaged(s)
			}
		}
	}
	clear(q.levels)
	q.levels = q.levels[:0]
}

// push stages the block entry in the queue of its compaction key and
// triggers a time-based flush of the oldest staging batch at this level.
// It returns false if a block with the same ID is already present.
func (q *compactionQueue) push(e compaction.BlockEntry) bool {
	level := q.blockQueue(e.Level)
	staged := level.stagedBlocks(compactionKey{
		tenant: e.Tenant,
		shard:  e.Shard,
		level:  e.Level,
	})
	staged.updatedAt = e.AppendedAt
	pushed := staged.push(blockEntry{
		id:    e.ID,
		index: e.Index,
	})
	// The staged queue priority may have changed: restore heap order.
	heap.Fix(level.updates, staged.heapIndex)
	level.flushOldest(e.AppendedAt)
	return pushed
}

// blockQueue returns the queue of the given level, growing the
// level slice and creating the queue lazily as needed.
func (q *compactionQueue) blockQueue(l uint32) *blockQueue {
	s := l + 1 // Levels are 0-based.
	if s > uint32(len(q.levels)) {
		q.levels = slices.Grow(q.levels, int(s))[:s]
	}
	level := q.levels[l]
	if level == nil {
		level = newBlockQueue(q.config, q.registerer, q.globalStats)
		q.levels[l] = level
	}
	return level
}

func newBlockQueue(config Config, registerer prometheus.Registerer, globalStats *globalQueueStats) *blockQueue {
	return &blockQueue{
		config:      config,
		registerer:  registerer,
		staged:      make(map[compactionKey]*stagedBlocks),
		globalStats: globalStats,
		updates:     new(priorityBlockQueue),
	}
}

// stagedBlocks returns the staging queue of the compaction key,
// creating it, adding it to the update heap, and registering its
// stats collector on first use.
func (q *blockQueue) stagedBlocks(k compactionKey) *stagedBlocks {
	staged, ok := q.staged[k]
	if !ok {
		staged = &stagedBlocks{
			queue: q,
			key:   k,
			refs:  make(map[string]blockRef),
			stats: new(queueStats),
		}
		staged.resetBatch()
		q.staged[k] = staged
		heap.Push(q.updates, staged)
		q.globalStats.AddQueues(k, 1)
		if q.registerer != nil {
			staged.collector = newQueueStatsCollector(staged)
			util.RegisterOrGet(q.registerer, staged.collector)
		}
	}
	return staged
}

// removeStaged deletes the staging queue and removes it from the
// update heap. The queue must still be in the heap (valid heapIndex).
func (q *blockQueue) removeStaged(s *stagedBlocks) {
	if s.collector != nil {
		q.registerer.Unregister(s.collector)
	}
	delete(q.staged, s.key)
	if s.heapIndex < 0 || s.heapIndex >= q.updates.Len() {
		panic("bug: attempt to delete compaction queue with an invalid priority index")
	}
	heap.Remove(q.updates, s.heapIndex)
	q.globalStats.AddQueues(s.key, -1)
}

// push adds the block to the staging batch, flushing the batch to the
// queue once it exceeds the configured size or age. It returns false
// if a block with the same ID is already present.
func (s *stagedBlocks) push(block blockEntry) bool {
	if _, found := s.refs[block.id]; found {
		s.stats.rejected.Add(1)
		return false
	}
	s.refs[block.id] = blockRef{batch: s.batch, index: len(s.batch.blocks)}
	s.batch.blocks = append(s.batch.blocks, block)
	if s.batch.size == 0 {
		// First live block in the batch defines its creation time.
		s.batch.createdAt = s.updatedAt
	}
	s.batch.size++
	s.stats.blocks.Add(1)
	s.queue.globalStats.AddBlocks(s.key, 1)
	if s.queue.config.exceedsMaxSize(s.batch) ||
		s.queue.config.exceedsMaxAge(s.batch, s.updatedAt) {
		s.flush()
	}
	return true
}

// flush moves the staging batch to the parent queue and starts a new
// staging batch. A batch must never be flushed twice: the sync.Once
// guard turns a repeated flush into a panic.
func (s *stagedBlocks) flush() {
	var flushed bool
	s.batch.flush.Do(func() {
		if !s.queue.pushBatch(s.batch) {
			panic("bug: attempt to detach the compaction queue head")
		}
		flushed = true
	})
	if !flushed {
		panic("bug: attempt to flush a compaction queue batch twice")
	}
	s.resetBatch()
}

func (s *stagedBlocks) resetBatch() {
	s.batch = &batch{
		blocks: make([]blockEntry, 0, defaultBlockBatchSize),
		staged: s,
	}
}

var zeroBlockEntry blockEntry

// delete removes the block from the queue, zeroing its slot in the
// owning batch, and returns the removed entry. It returns
// zeroBlockEntry if the block is not found.
func (s *stagedBlocks) delete(block string) blockEntry {
	ref, found := s.refs[block]
	if !found {
		s.stats.missed.Add(1)
		return zeroBlockEntry
	}
	// We can't change the order of the blocks in the batch,
	// because that would require updating all the block locations.
	e := ref.batch.blocks[ref.index]
	ref.batch.blocks[ref.index] = zeroBlockEntry
	ref.batch.size--
	s.stats.blocks.Add(-1)
	s.queue.globalStats.AddBlocks(s.key, -1)
	if ref.batch.size == 0 {
		if ref.batch != s.batch {
			// We should never ever try to delete the staging batch from the
			// queue: it has not been flushed and added to the queue yet.
			//
			// NOTE(kolesnikovae):
			// It caused a problem because removeBatch mistakenly interpreted
			// the batch as a head (s.batch.prev == nil), and detached it,
			// replacing a valid head with s.batch.next, which is always nil
			// at this point; it made the queue look empty for the reader,
			// because the queue is read from the head.
			//
			// The only way we may end up here if blocks are removed from the
			// staging batch. Typically, blocks are not supposed to be removed
			// from there before they left the queue (i.e., flushed to the
			// global queue).
			//
			// In practice, the compactor is distributed and has multiple
			// replicas: the leader instance could have already decided to
			// flush the blocks, and now they should be removed from all
			// instances. Due to a bug in time-based flushing (when it stops
			// working), it was possible that after the leader restarts and
			// recovers time-based flushing locally, it would desire to flush
			// the oldest batch. Consequently, the follower instances, where
			// the batch is still in staging, would need to do the same.
			if !s.queue.removeBatch(ref.batch) {
				panic("bug: attempt to remove a batch that is not in the compaction queue")
			}
		}
	}
	delete(s.refs, block)
	if len(s.refs) == 0 {
		// This is the last block with the given compaction key, so we want to
		// remove the staging structure. It's fine to delete it from the queue
		// at any point: we guarantee that it does not reference any blocks in
		// the queue, and we do not need to flush it anymore.
		s.queue.removeStaged(s)
	}
	return e
}

// pushBatch appends the batch to the tail of both the global queue and
// the local (per compaction key) queue. It returns false if either list
// is in an inconsistent head/tail state.
func (q *blockQueue) pushBatch(b *batch) bool {
	if q.tail != nil {
		q.tail.nextG = b
		b.prevG = q.tail
	} else if q.head == nil {
		q.head = b
	} else {
		return false
	}
	q.tail = b

	// Same for the queue of batches
	// with matching compaction key.

	if b.staged.tail != nil {
		b.staged.tail.next = b
		b.prev = b.staged.tail
	} else if b.staged.head == nil {
		b.staged.head = b
	} else {
		return false
	}
	b.staged.tail = b

	b.staged.stats.batches.Add(1)
	q.globalStats.AddBatches(b.staged.key, 1)
	return true
}

// removeBatch unlinks the batch from both the global queue and the
// local (per compaction key) queue, updating head/tail pointers as
// needed. It returns false if the batch is not linked into the queue.
func (q *blockQueue) removeBatch(b *batch) bool {
	if b.prevG != nil {
		b.prevG.nextG = b.nextG
	} else if b == q.head {
		// This is the head.
		q.head = q.head.nextG
	} else {
		return false
	}
	if b.nextG != nil {
		b.nextG.prevG = b.prevG
	} else if b == q.tail {
		// This is the tail.
		q.tail = q.tail.prevG
	} else {
		return false
	}
	b.nextG = nil
	b.prevG = nil

	// Same for the queue of batches
	// with matching compaction key.

	if b.prev != nil {
		b.prev.next = b.next
	} else if b == b.staged.head {
		// This is the head.
		b.staged.head = b.staged.head.next
	} else {
		return false
	}
	if b.next != nil {
		b.next.prev = b.prev
	} else if b == b.staged.tail {
		// This is the tail.
		b.staged.tail = b.staged.tail.prev
	} else {
		return false
	}
	b.next = nil
	b.prev = nil

	b.staged.stats.batches.Add(-1)
	q.globalStats.AddBatches(b.staged.key, -1)
	return true
}

// flushOldest flushes the least-recently-updated staging batch if it
// exceeds the configured max age, and refreshes its heap position.
func (q *blockQueue) flushOldest(now int64) {
	if q.updates.Len() == 0 {
		panic("bug: compaction queue has empty priority queue")
	}
	// Peek the oldest staging batch in the priority queue (min-heap).
	oldest := (*q.updates)[0]
	if !q.config.exceedsMaxAge(oldest.batch, now) {
		return
	}
	// It's possible that the staging batch is empty: it's only removed
	// from the queue when the last block with the given compaction key is
	// removed, including ones flushed to the global queue. Therefore, we
	// should not pop it from the queue, but update its index in the heap.
	// Otherwise, if the staging batch has not been removed from the queue
	// yet i.e., references some blocks in the compaction queue (it's rare
	// but not impossible), time-based flush will stop working for it.
	if oldest.batch.size > 0 {
		oldest.flush()
	}
	oldest.updatedAt = now
	heap.Fix(q.updates, oldest.heapIndex)
}

// priorityBlockQueue is a min-heap of staging queues ordered by their
// last update time; it implements container/heap.Interface.
type priorityBlockQueue []*stagedBlocks

func (pq priorityBlockQueue) Len() int { return len(pq) }

func (pq priorityBlockQueue) Less(i, j int) bool {
	return pq[i].updatedAt < pq[j].updatedAt
}

func (pq priorityBlockQueue) Swap(i, j int) {
	pq[i], pq[j] = pq[j], pq[i]
	pq[i].heapIndex = i
	pq[j].heapIndex = j
}

func (pq *priorityBlockQueue) Push(x interface{}) {
	n := len(*pq)
	staged := x.(*stagedBlocks)
	staged.heapIndex = n
	*pq = append(*pq, staged)
}

func (pq *priorityBlockQueue) Pop() interface{} {
	old := *pq
	n := len(old)
	staged := old[n-1]
	old[n-1] = nil // Drop the reference for the GC.
	staged.heapIndex = -1
	*pq = old[0 : n-1]
	return staged
}

func newBatchIter(q *blockQueue) *batchIter { return &batchIter{batch: q.head} }

// batchIter iterates over the batches in the queue, in the order of arrival.
type batchIter struct{ batch *batch }

// next returns the current batch and advances the iterator along the
// global queue; it reports false once the queue is exhausted.
func (i *batchIter) next() (*batch, bool) {
	if i.batch == nil {
		return nil, false
	}
	b := i.batch
	i.batch = i.batch.nextG
	return b, b != nil
}

func (i *batchIter) reset(b *batch) { i.batch = b }

// blockIter iterates over the blocks in the queue, in the order of arrival
// within the compaction key. It's guaranteed that returned blocks are unique
// across all batches.
type blockIter struct {
	visited map[string]struct{}
	batch   *batch
	i       int
}

func newBlockIter() *blockIter {
	// Assuming that block IDs (16b ULID) are globally unique.
	// We could achieve the same with more efficiency by marking visited
	// batches. However, marking visited blocks seems to be more robust,
	// and the size of the map is expected to be small.
	visited := make(map[string]struct{}, 64)
	// Pre-mark the zero ID so deleted (zeroed) batch slots are skipped.
	visited[zeroBlockEntry.id] = struct{}{}
	return &blockIter{visited: visited}
}

func (it *blockIter) setBatch(b *batch) {
	it.batch = b
	it.i = 0
}

// more reports whether the current batch has entries left to examine.
func (it *blockIter) more() bool {
	if it.batch == nil {
		return false
	}
	return it.i < len(it.batch.blocks)
}

// peek returns the ID of the next unvisited block without consuming it,
// advancing through local batches as needed. It reports false when no
// blocks remain.
func (it *blockIter) peek() (string, bool) {
	for it.batch != nil {
		if it.i >= len(it.batch.blocks) {
			it.setBatch(it.batch.next)
			continue
		}
		entry := it.batch.blocks[it.i]
		if _, visited := it.visited[entry.id]; visited {
			it.i++
			continue
		}
		return entry.id, true
	}
	return "", false
}

// advance marks the current block as visited and moves past it.
// Must only be called after a successful peek.
func (it *blockIter) advance() {
	entry := it.batch.blocks[it.i]
	it.visited[entry.id] = struct{}{}
	it.i++
}