package scheduler

import (
	"container/heap"
	"slices"
	"time"

	"go.etcd.io/bbolt"

	metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1"
	"github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1/raft_log"
)

// schedule should be used to prepare the compaction plan update.
// The implementation must have no side effects or alter the
// Scheduler in any way.
type schedule struct {
	tx  *bbolt.Tx
	now time.Time
	// Token stamped on jobs assigned or added by this schedule
	// (see assignJob and AddJob).
	token uint64
	// Read-only.
	scheduler *Scheduler
	// Uncommitted schedule updates, keyed by job name.
	updates map[string]*raft_log.CompactionJobState
	// Number of jobs added via AddJob in this schedule.
	added int
	// Number of jobs evicted via EvictJob in this schedule.
	evicted int
	// Modified copy of the job queue.
	copied []priorityJobQueue
}

// AssignJob picks the next job eligible for assignment and loads its plan
// from the store. It returns (nil, nil) when no job is available. The new
// state is recorded in p.updates so that subsequent assignment and update
// calls within this schedule see the job as taken.
func (p *schedule) AssignJob() (*raft_log.AssignedCompactionJob, error) {
	p.scheduler.mu.Lock()
	defer p.scheduler.mu.Unlock()
	state := p.nextAssignment()
	if state == nil {
		return nil, nil
	}
	plan, err := p.scheduler.store.GetJobPlan(p.tx, state.Name)
	if err != nil {
		return nil, err
	}
	p.updates[state.Name] = state
	assigned := &raft_log.AssignedCompactionJob{
		State: state,
		Plan:  plan,
	}
	return assigned, nil
}

// UpdateJob produces the new job state implied by a worker's status report.
// A nil return means the report is stale, unknown, or not allowed, and is
// interpreted as "no new lease, stop the work". The function does not
// modify the actual job queue.
func (p *schedule) UpdateJob(status *raft_log.CompactionJobStatusUpdate) *raft_log.CompactionJobState {
	p.scheduler.mu.Lock()
	defer p.scheduler.mu.Unlock()
	state := p.newStateForStatusReport(status)
	if state == nil {
		return nil
	}
	// State changes should be taken into account when we assign jobs.
	p.updates[status.Name] = state
	return state
}

// newStateForStatusReport returns the job state change caused by the status
// report from a compaction worker. The function does not modify the actual
// job queue.
64 func (p *schedule) newStateForStatusReport(status *raft_log.CompactionJobStatusUpdate) *raft_log.CompactionJobState { 65 state := p.scheduler.queue.jobs[status.Name] 66 if state == nil { 67 // This may happen if the job has been reassigned 68 // and completed by another worker; we respond in 69 // the same way. 70 return nil 71 } 72 73 if state.Token > status.Token { 74 // The job is not assigned to this worker. 75 return nil 76 } 77 78 switch newState := state.CloneVT(); status.Status { 79 case metastorev1.CompactionJobStatus_COMPACTION_STATUS_IN_PROGRESS: 80 // A regular lease renewal. 81 newState.LeaseExpiresAt = p.allocateLease() 82 return newState 83 84 case metastorev1.CompactionJobStatus_COMPACTION_STATUS_SUCCESS: 85 newState.Status = status.Status 86 return newState 87 88 default: 89 // Not allowed and unknown status updates can be safely ignored: 90 // eventually, the job will be reassigned. The same for status 91 // handlers: a nil state is returned, which is interpreted as 92 // "no new lease, stop the work". 93 } 94 95 return nil 96 } 97 98 func (p *schedule) EvictJob() *raft_log.CompactionJobState { 99 p.scheduler.mu.Lock() 100 defer p.scheduler.mu.Unlock() 101 limit := p.scheduler.config.MaxQueueSize 102 size := uint64(p.scheduler.queue.size() - p.evicted) 103 if limit == 0 || size <= limit { 104 return nil 105 } 106 for level := 0; level < len(p.scheduler.queue.levels); level++ { 107 // We evict the job from our copy of the queue: each job is only 108 // accessible once. 109 pq := p.queueLevelCopy(level) 110 if pq.Len() != 0 { 111 job := heap.Pop(pq).(*jobEntry) 112 if p.isFailed(job) { 113 p.evicted++ 114 return job.CompactionJobState 115 } 116 heap.Push(pq, job) 117 } 118 } 119 return nil 120 } 121 122 // AddJob creates a state for the newly planned job. 123 // 124 // The method must be called after the last AssignJob and UpdateJob calls. 125 // It returns an empty state if the queue size limit is reached. 
126 func (p *schedule) AddJob(plan *raft_log.CompactionJobPlan) *raft_log.CompactionJobState { 127 p.scheduler.mu.Lock() 128 defer p.scheduler.mu.Unlock() 129 if limit := p.scheduler.config.MaxQueueSize; limit > 0 { 130 if size := uint64(p.added + p.scheduler.queue.size()); size >= limit { 131 return nil 132 } 133 } 134 state := &raft_log.CompactionJobState{ 135 Name: plan.Name, 136 CompactionLevel: plan.CompactionLevel, 137 Status: metastorev1.CompactionJobStatus_COMPACTION_STATUS_UNSPECIFIED, 138 AddedAt: p.now.UnixNano(), 139 Token: p.token, 140 } 141 p.updates[state.Name] = state 142 p.added++ 143 return state 144 } 145 146 func (p *schedule) nextAssignment() *raft_log.CompactionJobState { 147 // We don't need to check the job ownership here: the worker asks 148 // for a job assigment (new ownership). 149 for level := 0; level < len(p.scheduler.queue.levels); { 150 // We evict the job from our copy of the queue: each job is only 151 // accessible once. When we reach the bottom of the queue (the first 152 // failed job, or the last job in the queue), we move to the next 153 // level. Note that we check all in-progress jobs if there are not 154 // enough unassigned jobs in the queue. 155 pq := p.queueLevelCopy(level) 156 if pq.Len() == 0 { 157 level++ 158 continue 159 } 160 161 job := heap.Pop(pq).(*jobEntry) 162 if _, found := p.updates[job.Name]; found { 163 // We don't even consider own jobs: these are already 164 // assigned and are in-progress or have been completed. 165 // This, however, does not prevent from reassigning a 166 // job that the worker has abandoned in the past. 167 // Newly created jobs are not considered here as well. 168 continue 169 } 170 171 switch job.Status { 172 case metastorev1.CompactionJobStatus_COMPACTION_STATUS_UNSPECIFIED: 173 return p.assignJob(job) 174 175 case metastorev1.CompactionJobStatus_COMPACTION_STATUS_IN_PROGRESS: 176 if p.isFailed(job) { 177 // We reached the bottom of the queue: only failed jobs left. 
178 heap.Push(pq, job) 179 level++ 180 continue 181 } 182 if p.isAbandoned(job) { 183 state := p.assignJob(job) 184 state.Failures++ 185 return state 186 } 187 } 188 } 189 190 return nil 191 } 192 193 func (p *schedule) allocateLease() int64 { 194 return p.now.Add(p.scheduler.config.LeaseDuration).UnixNano() 195 } 196 197 func (p *schedule) assignJob(e *jobEntry) *raft_log.CompactionJobState { 198 job := e.CloneVT() 199 job.Status = metastorev1.CompactionJobStatus_COMPACTION_STATUS_IN_PROGRESS 200 job.LeaseExpiresAt = p.allocateLease() 201 job.Token = p.token 202 return job 203 } 204 205 func (p *schedule) isAbandoned(job *jobEntry) bool { 206 return !p.isFailed(job) && p.now.UnixNano() > job.LeaseExpiresAt 207 } 208 209 func (p *schedule) isFailed(job *jobEntry) bool { 210 limit := p.scheduler.config.MaxFailures 211 return limit > 0 && uint64(job.Failures) >= limit 212 } 213 214 // The queue must not be modified by the assigner. Therefore, we're copying the 215 // queue levels lazily. The queue is supposed to be small (hundreds of jobs 216 // running concurrently); in the worst case, we have a ~24b alloc per entry. 217 func (p *schedule) queueLevelCopy(i int) *priorityJobQueue { 218 s := i + 1 // Levels are 0-based. 219 if s > len(p.copied) { 220 p.copied = slices.Grow(p.copied, s)[:s] 221 if p.copied[i] == nil { 222 p.copied[i] = p.scheduler.queue.level(uint32(i)).clone() 223 } 224 } 225 return &p.copied[i] 226 }