github.com/grafana/pyroscope@v1.18.0/pkg/metastore/compaction_service.go (about) 1 package metastore 2 3 import ( 4 "context" 5 "sync" 6 7 "github.com/go-kit/log" 8 "github.com/go-kit/log/level" 9 "github.com/opentracing/opentracing-go" 10 "github.com/opentracing/opentracing-go/ext" 11 "google.golang.org/grpc/codes" 12 "google.golang.org/grpc/status" 13 14 metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1" 15 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1/raft_log" 16 "github.com/grafana/pyroscope/pkg/metastore/fsm" 17 "github.com/grafana/pyroscope/pkg/metastore/raftnode" 18 ) 19 20 type CompactionService struct { 21 metastorev1.CompactionServiceServer 22 23 logger log.Logger 24 mu sync.Mutex 25 raft Raft 26 } 27 28 func NewCompactionService( 29 logger log.Logger, 30 raft Raft, 31 ) *CompactionService { 32 return &CompactionService{ 33 logger: logger, 34 raft: raft, 35 } 36 } 37 38 func (svc *CompactionService) PollCompactionJobs( 39 ctx context.Context, 40 req *metastorev1.PollCompactionJobsRequest, 41 ) (resp *metastorev1.PollCompactionJobsResponse, err error) { 42 span, ctx := opentracing.StartSpanFromContext(ctx, "CompactionService.PollCompactionJobs") 43 defer func() { 44 if err != nil { 45 ext.LogError(span, err) 46 } 47 span.Finish() 48 }() 49 50 span.SetTag("status_updates", len(req.GetStatusUpdates())) 51 span.SetTag("job_capacity", req.GetJobCapacity()) 52 53 // This is a two-step process. To commit changes to the compaction plan, 54 // we need to ensure that all replicas apply exactly the same changes. 55 // Instead of relying on identical behavior across replicas and a 56 // reproducible compaction plan, we explicitly replicate the change. 57 // 58 // NOTE(kolesnikovae): We can use Leader Read optimization here. However, 59 // we would need to ensure synchronization between the compactor and the 60 // index, and unsure isolation at the data level. For now, we're using 61 // the raft log to guarantee serializable isolation level. 62 // 63 // Make sure that only one compaction plan update is in progress at a time. 64 // This lock does not introduce contention, as the raft log is synchronous. 65 svc.mu.Lock() 66 defer svc.mu.Unlock() 67 68 // First, we ask the current leader to prepare the change. This is a read 69 // operation conducted through the raft log: at this stage, we only 70 // prepare changes; the command handler does not alter the state. 71 request := &raft_log.GetCompactionPlanUpdateRequest{ 72 StatusUpdates: make([]*raft_log.CompactionJobStatusUpdate, 0, len(req.StatusUpdates)), 73 AssignJobsMax: req.JobCapacity, 74 } 75 76 // We only send the status updates (without job results) to minimize the 77 // traffic, but we want to include the results of compaction in the final 78 // proposal. If the status update is accepted, we trust the worker and 79 // don't need to load our own copy of the job. 80 compacted := make(map[string]*metastorev1.CompactionJobStatusUpdate, len(req.StatusUpdates)) 81 for _, update := range req.StatusUpdates { 82 if update.CompactedBlocks != nil { 83 compacted[update.Name] = update 84 } 85 request.StatusUpdates = append(request.StatusUpdates, &raft_log.CompactionJobStatusUpdate{ 86 Name: update.Name, 87 Token: update.Token, 88 Status: update.Status, 89 }) 90 } 91 92 cmd := fsm.RaftLogEntryType(raft_log.RaftCommand_RAFT_COMMAND_GET_COMPACTION_PLAN_UPDATE) 93 proposeResp, err := svc.raft.Propose(ctx, cmd, req) 94 if err != nil { 95 if !raftnode.IsRaftLeadershipError(err) { 96 level.Error(svc.logger).Log("msg", "failed to prepare compaction plan", "err", err) 97 } 98 return nil, err 99 } 100 prepared := proposeResp.(*raft_log.GetCompactionPlanUpdateResponse) 101 planUpdate := prepared.GetPlanUpdate() 102 103 // Copy plan updates to the worker response. The job plan is only sent for 104 // newly assigned jobs. Lease renewals do not require the plan to be sent. 105 workerResp := &metastorev1.PollCompactionJobsResponse{ 106 CompactionJobs: make([]*metastorev1.CompactionJob, 0, len(planUpdate.AssignedJobs)), 107 Assignments: make([]*metastorev1.CompactionJobAssignment, 0, len(planUpdate.UpdatedJobs)), 108 } 109 for _, updated := range planUpdate.UpdatedJobs { 110 update := updated.State 111 workerResp.Assignments = append(workerResp.Assignments, &metastorev1.CompactionJobAssignment{ 112 Name: update.Name, 113 Token: update.Token, 114 LeaseExpiresAt: update.LeaseExpiresAt, 115 }) 116 } 117 for _, assigned := range planUpdate.AssignedJobs { 118 assignment := assigned.State 119 workerResp.Assignments = append(workerResp.Assignments, &metastorev1.CompactionJobAssignment{ 120 Name: assignment.Name, 121 Token: assignment.Token, 122 LeaseExpiresAt: assignment.LeaseExpiresAt, 123 }) 124 job := assigned.Plan 125 workerResp.CompactionJobs = append(workerResp.CompactionJobs, &metastorev1.CompactionJob{ 126 Name: job.Name, 127 Shard: job.Shard, 128 Tenant: job.Tenant, 129 CompactionLevel: job.CompactionLevel, 130 SourceBlocks: job.SourceBlocks, 131 Tombstones: job.Tombstones, 132 }) 133 // Assigned jobs are not written to the raft log (only the assignments): 134 // from our perspective (scheduler and planner) these are just job updates. 135 assigned.Plan = nil 136 } 137 138 // Include the compacted blocks in the final proposal. 139 for _, job := range planUpdate.CompletedJobs { 140 if update := compacted[job.State.Name]; update != nil { 141 job.CompactedBlocks = update.CompactedBlocks 142 } 143 } 144 145 // Now that we have the plan, we need to propagate it through the 146 // raft log to ensure it is applied consistently across all replicas, 147 // regardless of their individual state or view of the plan. 148 cmd = fsm.RaftLogEntryType(raft_log.RaftCommand_RAFT_COMMAND_UPDATE_COMPACTION_PLAN) 149 150 // We also include the current term of the planning step so that later 151 // we can verify that the leader has not changed, and the plan is still 152 // up-to-date. Otherwise, e.g., in the ABA case, when the current node 153 // loses leadership and gains is back in-between these two steps, we 154 // cannot guarantee that the proposed plan is still valid and up-to-date. 155 // The raft handler cannot return an error here (because this is a valid 156 // scenario, and we don't want to stop the node/cluster). Instead, an 157 // empty response would indicate that the plan is rejected. 158 proposal := &raft_log.UpdateCompactionPlanRequest{Term: prepared.Term, PlanUpdate: planUpdate} 159 if proposeResp, err = svc.raft.Propose(ctx, cmd, proposal); err != nil { 160 if !raftnode.IsRaftLeadershipError(err) { 161 level.Error(svc.logger).Log("msg", "failed to update compaction plan", "err", err) 162 } 163 return nil, err 164 } 165 accepted := proposeResp.(*raft_log.UpdateCompactionPlanResponse).GetPlanUpdate() 166 if accepted == nil { 167 level.Warn(svc.logger).Log("msg", "compaction plan update rejected") 168 return nil, status.Error(codes.FailedPrecondition, "failed to update compaction plan") 169 } 170 171 // As of now, accepted plan always matches the proposed one, 172 // so our prepared worker response is still valid. 173 174 span.SetTag("assigned_jobs", len(workerResp.GetCompactionJobs())) 175 span.SetTag("assignment_updates", len(workerResp.GetAssignments())) 176 return workerResp, nil 177 }