package scheduler

import (
	"container/heap"
	"slices"
	"time"

	"go.etcd.io/bbolt"

	metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1"
	"github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1/raft_log"
)

// schedule should be used to prepare the compaction plan update.
// The implementation must have no side effects or alter the
// Scheduler in any way.
type schedule struct {
	tx  *bbolt.Tx
	now time.Time
	// Token stamped on jobs assigned or added by this schedule
	// (see assignJob and AddJob).
	token uint64
	// Read-only.
	scheduler *Scheduler
	// Uncommitted schedule updates, keyed by job name.
	updates map[string]*raft_log.CompactionJobState
	// Number of jobs added via AddJob in this schedule.
	added int
	// Number of jobs evicted via EvictJob in this schedule.
	evicted int
	// Modified copy of the job queue.
	copied []priorityJobQueue
}

// AssignJob picks the next job eligible for assignment and loads its plan
// from the store. It returns (nil, nil) when no job is available. The new
// state is recorded in p.updates so that subsequent assignment and update
// calls within this schedule see the job as taken.
func (p *schedule) AssignJob() (*raft_log.AssignedCompactionJob, error) {
	p.scheduler.mu.Lock()
	defer p.scheduler.mu.Unlock()
	state := p.nextAssignment()
	if state == nil {
		return nil, nil
	}
	plan, err := p.scheduler.store.GetJobPlan(p.tx, state.Name)
	if err != nil {
		return nil, err
	}
	p.updates[state.Name] = state
	assigned := &raft_log.AssignedCompactionJob{
		State: state,
		Plan:  plan,
	}
	return assigned, nil
}

// UpdateJob produces the new job state implied by a worker's status report.
// A nil return means the report is stale, unknown, or not allowed, and is
// interpreted as "no new lease, stop the work". The function does not
// modify the actual job queue.
func (p *schedule) UpdateJob(status *raft_log.CompactionJobStatusUpdate) *raft_log.CompactionJobState {
	p.scheduler.mu.Lock()
	defer p.scheduler.mu.Unlock()
	state := p.newStateForStatusReport(status)
	if state == nil {
		return nil
	}
	// State changes should be taken into account when we assign jobs.
	p.updates[status.Name] = state
	return state
}

// newStateForStatusReport returns the job state change caused by the status
// report from a compaction worker. The function does not modify the actual
// job queue.
64 func (p *schedule) newStateForStatusReport(status *raft_log.CompactionJobStatusUpdate) *raft_log.CompactionJobState { 65 state := p.scheduler.queue.jobs[status.Name] 66 if state == nil { 67 // This may happen if the job has been reassigned 68 // and completed by another worker; we respond in 69 // the same way. 70 return nil 71 } 72 73 if state.Token > status.Token { 74 // The job is not assigned to this worker. 75 return nil 76 } 77 78 switch newState := state.CloneVT(); status.Status { 79 case metastorev1.CompactionJobStatus_COMPACTION_STATUS_IN_PROGRESS: 80 // A regular lease renewal. 81 newState.LeaseExpiresAt = p.allocateLease() 82 return newState 83 84 case metastorev1.CompactionJobStatus_COMPACTION_STATUS_SUCCESS: 85 newState.Status = status.Status 86 return newState 87 88 default: 89 // Not allowed and unknown status updates can be safely ignored: 90 // eventually, the job will be reassigned. The same for status 91 // handlers: a nil state is returned, which is interpreted as 92 // "no new lease, stop the work". 93 } 94 95 return nil 96 } 97 98 func (p *schedule) EvictJob() *raft_log.CompactionJobState { 99 p.scheduler.mu.Lock() 100 defer p.scheduler.mu.Unlock() 101 limit := p.scheduler.config.MaxQueueSize 102 size := uint64(p.scheduler.queue.size() - p.evicted) 103 if limit == 0 || size <= limit { 104 return nil 105 } 106 for level := 0; level < len(p.scheduler.queue.levels); level++ { 107 // We evict the job from our copy of the queue: each job is only 108 // accessible once. 109 pq := p.queueLevelCopy(level) 110 if pq.Len() != 0 { 111 job := heap.Pop(pq).(*jobEntry) 112 if p.isFailed(job) { 113 p.evicted++ 114 return job.CompactionJobState 115 } 116 heap.Push(pq, job) 117 } 118 } 119 return nil 120 } 121 122 // AddJob creates a state for the newly planned job. 123 // 124 // The method must be called after the last AssignJob and UpdateJob calls. 125 // It returns an empty state if the queue size limit is reached. 
126 func (p *schedule) AddJob(plan *raft_log.CompactionJobPlan) *raft_log.CompactionJobState { 127 p.scheduler.mu.Lock() 128 defer p.scheduler.mu.Unlock() 129 if limit := p.scheduler.config.MaxQueueSize; limit > 0 { 130 if size := uint64(p.added + p.scheduler.queue.size()); size >= limit { 131 return nil 132 } 133 } 134 state := &raft_log.CompactionJobState{ 135 Name: plan.Name, 136 CompactionLevel: plan.CompactionLevel, 137 Status: metastorev1.CompactionJobStatus_COMPACTION_STATUS_UNSPECIFIED, 138 AddedAt: p.now.UnixNano(), 139 Token: p.token, 140 } 141 p.updates[state.Name] = state 142 p.added++ 143 return state 144 } 145 146 func (p *schedule) nextAssignment() *raft_log.CompactionJobState { 147 // We don't need to check the job ownership here: the worker asks 148 // for a job assigment (new ownership). 149 for level := 0; level < len(p.scheduler.queue.levels); { 150 // We evict the job from our copy of the queue: each job is only 151 // accessible once. When we reach the bottom of the queue (the first 152 // failed job, or the last job in the queue), we move to the next 153 // level. Note that we check all in-progress jobs if there are not 154 // enough unassigned jobs in the queue. 155 pq := p.queueLevelCopy(level) 156 if pq.Len() == 0 { 157 level++ 158 continue 159 } 160 161 job := heap.Pop(pq).(*jobEntry) 162 if _, found := p.updates[job.Name]; found { 163 // We don't even consider own jobs: these are already 164 // assigned and are in-progress or have been completed. 165 // This, however, does not prevent from reassigning a 166 // job that the worker has abandoned in the past. 167 // Newly created jobs are not considered here as well. 168 continue 169 } 170 171 switch job.Status { 172 case metastorev1.CompactionJobStatus_COMPACTION_STATUS_UNSPECIFIED: 173 return p.assignJob(job) 174 175 case metastorev1.CompactionJobStatus_COMPACTION_STATUS_IN_PROGRESS: 176 if p.isFailed(job) { 177 // We reached the bottom of the queue: only failed jobs left. 
178 heap.Push(pq, job) 179 level++ 180 continue 181 } 182 if p.isAbandoned(job) { 183 state := p.assignJob(job) 184 state.Failures++ 185 return state 186 } 187 } 188 } 189 190 return nil 191 } 192 193 func (p *schedule) allocateLease() int64 { 194 return p.now.Add(p.scheduler.config.LeaseDuration).UnixNano() 195 } 196 197 func (p *schedule) assignJob(e *jobEntry) *raft_log.CompactionJobState { 198 job := e.CloneVT() 199 job.Status = metastorev1.CompactionJobStatus_COMPACTION_STATUS_IN_PROGRESS 200 job.LeaseExpiresAt = p.allocateLease() 201 job.Token = p.token 202 return job 203 } 204 205 func (p *schedule) isAbandoned(job *jobEntry) bool { 206 return !p.isFailed(job) && p.now.UnixNano() > job.LeaseExpiresAt 207 } 208 209 func (p *schedule) isFailed(job *jobEntry) bool { 210 limit := p.scheduler.config.MaxFailures 211 return limit > 0 && uint64(job.Failures) >= limit 212 } 213 214 // The queue must not be modified by the assigner. Therefore, we're copying the 215 // queue levels lazily. The queue is supposed to be small (hundreds of jobs 216 // running concurrently); in the worst case, we have a ~24b alloc per entry. 217 func (p *schedule) queueLevelCopy(i int) *priorityJobQueue { 218 s := i + 1 // Levels are 0-based. 219 if s > len(p.copied) { 220 p.copied = slices.Grow(p.copied, s)[:s] 221 if p.copied[i] == nil { 222 p.copied[i] = p.scheduler.queue.level(uint32(i)).clone() 223 } 224 } 225 return &p.copied[i] 226 }