github.com/adityamillind98/nomad@v0.11.8/scheduler/system_sched.go (about)

package scheduler

import (
	"fmt"

	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for system
	// jobs.
	maxSystemScheduleAttempts = 5
)

// SystemScheduler is used for 'system' jobs. This scheduler is
// designed for services that should be run on every client.
// One for each job, containing an allocation for each node
type SystemScheduler struct {
	logger  log.Logger
	state   State
	planner Planner

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *SystemStack
	nodes      []*structs.Node
	nodesByDC  map[string]int

	limitReached bool
	nextEval     *structs.Evaluation

	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger log.Logger, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:  logger.Named("system_sched"),
		state:   state,
		planner: planner,
	}
}

// Process is used to handle a single evaluation.
func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Update our logger with the eval's information
	s.logger = s.logger.With("eval_id", eval.ID, "job_id", eval.JobID, "namespace", eval.Namespace)

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, structs.EvalTriggerFailedFollowUp,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate, structs.EvalTriggerPreemption,
		structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerNodeDrain, structs.EvalTriggerAllocStop,
		structs.EvalTriggerQueuedAllocs, structs.EvalTriggerScaling:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusFailed, desc,
			s.queuedAllocs, "")
	}

	// Retry up to the maxSystemScheduleAttempts and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs, "")
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusComplete, "",
		s.queuedAllocs, "")
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
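// The boolean result reports whether the evaluation is finished; returning
// false with a nil error lets retryMax invoke process again, up to
// maxSystemScheduleAttempts, with the attempt budget reset whenever the
// previous plan submission made progress.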
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	ws := memdb.NewWatchSet()
	s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}
	numTaskGroups := 0
	if !s.job.Stopped() {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)

	// Get the ready nodes in the required datacenters
	if !s.job.Stopped() {
		s.nodes, s.nodesByDC, err = readyNodesInDCs(s.state, s.job.Datacenters)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.ctx)
	if !s.job.Stopped() {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Error("failed to compute job allocations", "error", err)
		return false, err
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
	// plan anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Error("failed to make next eval for rolling update", "error", err)
			return false, err
		}
		s.logger.Debug("rolling update limit reached, next eval created", "next_eval_id", s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Debug("refresh forced")
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Debug("plan didn't fully commit", "attempted", expected, "placed", actual)
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
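// For a system job the desired state is one allocation per eligible node, so
// the reconciliation below is computed per node rather than per task group
// count.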
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	// Filter out the allocations in a terminal state
	allocs, terminalAllocs := structs.FilterTerminalAllocs(allocs)

	// Diff the required and existing allocations
	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs, terminalAllocs)
	s.logger.Debug("reconciled current state with desired state",
		"place", len(diff.place), "update", len(diff.update),
		"migrate", len(diff.migrate), "stop", len(diff.stop),
		"ignore", len(diff.ignore), "lost", len(diff.lost))

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendStoppedAlloc(e.Alloc, allocNotNeeded, "", "")
	}

	// Add all the allocs to migrate
	for _, e := range diff.migrate {
		s.plan.AppendStoppedAlloc(e.Alloc, allocNodeTainted, "", "")
	}

	// Lost allocations should be transitioned to desired status stop and client
	// status lost.
	for _, e := range diff.lost {
		s.plan.AppendStoppedAlloc(e.Alloc, allocLost, structs.AllocClientStatusLost, "")
	}

	// Attempt to do the upgrades in place
	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)
	diff.update = destructiveUpdates

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
		}
	}

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update)
	if !s.job.Stopped() && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat non in-place updates as an eviction and new placement.
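	// evictAndPlace stops up to 'limit' of these allocations and queues the
	// corresponding replacements in diff.place; if it exhausts the limit it
	// returns true, which causes process to create the follow-up rolling
	// evaluation.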
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		if !s.job.Stopped() {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per Task Group
	for _, allocTuple := range diff.place {
		s.queuedAllocs[allocTuple.TaskGroup.Name] += 1
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			s.logger.Debug("could not find node", "node_id", missing.Alloc.NodeID)
			continue
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option := s.stack.Select(missing.TaskGroup, nil)

		if option == nil {
			// If the task can't be placed on this node, update reporting data
			// and continue to short circuit the loop

			// If this node was filtered because of constraint mismatches and we
			// couldn't create an allocation, then decrement the queued count for
			// that task group
			if s.ctx.metrics.NodesFiltered > 0 {
				s.queuedAllocs[missing.TaskGroup.Name] -= 1

				// If we are annotating the plan, then decrement the desired
				// placements based on whether the node meets the constraints
				if s.eval.AnnotatePlan && s.plan.Annotations != nil &&
					s.plan.Annotations.DesiredTGUpdates != nil {
					desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup.Name]
					desired.Place -= 1
				}

				// Filtered nodes are not reported to users, just omitted from the job status
				continue
			}

			// Check if this task group has already failed, reported to the user as a count
			if metric, ok := s.failedTGAllocs[missing.TaskGroup.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}

			// Store the available nodes by datacenter
			s.ctx.Metrics().NodesAvailable = s.nodesByDC

			// Compute top K scoring node metadata
			s.ctx.Metrics().PopulateScoreMetaData()

			// Lazy initialize the failed map
			if s.failedTGAllocs == nil {
				s.failedTGAllocs = make(map[string]*structs.AllocMetric)
			}

			// Actual failure to start this task on this candidate node, report it individually
			s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
			s.addBlocked(node)

			continue
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = s.nodesByDC

		// Compute top K scoring node metadata
		s.ctx.Metrics().PopulateScoreMetaData()

		// Set fields based on if we found an allocation option
		resources := &structs.AllocatedResources{
			Tasks:          option.TaskResources,
			TaskLifecycles: option.TaskLifecycles,
			Shared: structs.AllocatedSharedResources{
				DiskMB: int64(missing.TaskGroup.EphemeralDisk.SizeMB),
			},
		}

		if option.AllocResources != nil {
			resources.Shared.Networks = option.AllocResources.Networks
		}
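
		// AllocatedResources carries the per-task reservations chosen by the
		// stack plus the group-level shared disk and networks; the older
		// TaskResources and SharedResources fields on the allocation below are
		// populated from it for backwards compatibility.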
		// Create an allocation for this
		alloc := &structs.Allocation{
			ID:                 uuid.Generate(),
			Namespace:          s.job.Namespace,
			EvalID:             s.eval.ID,
			Name:               missing.Name,
			JobID:              s.job.ID,
			TaskGroup:          missing.TaskGroup.Name,
			Metrics:            s.ctx.Metrics(),
			NodeID:             option.Node.ID,
			NodeName:           option.Node.Name,
			TaskResources:      resources.OldTaskResources(),
			AllocatedResources: resources,
			DesiredStatus:      structs.AllocDesiredStatusRun,
			ClientStatus:       structs.AllocClientStatusPending,
			// SharedResources is considered deprecated, will be removed in 0.11.
			// It is only set for compat reasons
			SharedResources: &structs.Resources{
				DiskMB:   missing.TaskGroup.EphemeralDisk.SizeMB,
				Networks: resources.Shared.Networks,
			},
		}

		// If the new allocation is replacing an older allocation then we record the
		// older allocation id so that they are chained
		if missing.Alloc != nil {
			alloc.PreviousAllocation = missing.Alloc.ID
		}

		// If this placement involves preemption, set DesiredState to evict for those allocations
		if option.PreemptedAllocs != nil {
			var preemptedAllocIDs []string
			for _, stop := range option.PreemptedAllocs {
				s.plan.AppendPreemptedAlloc(stop, alloc.ID)

				preemptedAllocIDs = append(preemptedAllocIDs, stop.ID)
				if s.eval.AnnotatePlan && s.plan.Annotations != nil {
					s.plan.Annotations.PreemptedAllocs = append(s.plan.Annotations.PreemptedAllocs, stop.Stub())
					if s.plan.Annotations.DesiredTGUpdates != nil {
						desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup.Name]
						desired.Preemptions += 1
					}
				}
			}
			alloc.PreemptedAllocations = preemptedAllocIDs
		}

		s.plan.AppendAlloc(alloc)
	}

	return nil
}

// addBlocked creates a new blocked eval for this job on this node and submits
// it to the planner (worker.go), which keeps the eval for later execution.
func (s *SystemScheduler) addBlocked(node *structs.Node) error {
	e := s.ctx.Eligibility()
	escaped := e.HasEscaped()

	// Only store the eligible classes if the eval hasn't escaped.
	var classEligibility map[string]bool
	if !escaped {
		classEligibility = e.GetClasses()
	}

	blocked := s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached())
	blocked.StatusDescription = blockedEvalFailedPlacements
	blocked.NodeID = node.ID

	return s.planner.CreateEval(blocked)
}