github.com/smithx10/nomad@v0.9.1-rc1/scheduler/system_sched.go

package scheduler

import (
	"fmt"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for system
	// jobs.
	maxSystemScheduleAttempts = 5
)

// SystemScheduler is used for 'system' jobs. This scheduler is
// designed for services that should be run on every client.
type SystemScheduler struct {
	logger  log.Logger
	state   State
	planner Planner

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *SystemStack
	nodes      []*structs.Node
	nodesByDC  map[string]int

	limitReached bool
	nextEval     *structs.Evaluation

	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger log.Logger, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:  logger.Named("system_sched"),
		state:   state,
		planner: planner,
	}
}

// Process is used to handle a single evaluation.
func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Update our logger with the eval's information
	s.logger = s.logger.With("eval_id", eval.ID, "job_id", eval.JobID, "namespace", eval.Namespace)

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, structs.EvalTriggerFailedFollowUp,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate, structs.EvalTriggerPreemption,
		structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerNodeDrain:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusFailed, desc,
			s.queuedAllocs, "")
	}

	// Retry up to the maxSystemScheduleAttempts and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs, "")
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusComplete, "",
		s.queuedAllocs, "")
}
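
// A rough sketch of the retry semantics wired up in Process above, assuming
// retryMax and progressMade behave as in this package's util.go (an
// assumption; consult that file for the authoritative version). retryMax
// re-invokes s.process until it reports done, and the progress callback
// resets the attempt counter whenever the last plan result actually placed
// or updated something, so conflicts only count against the limit when no
// forward progress is being made:
//
//	attempts := 0
//	for attempts < max {
//		done, err := cb() // s.process
//		if err != nil {
//			return err
//		} else if done {
//			return nil
//		}
//		if reset != nil && reset() { // progressMade(s.planResult)
//			attempts = 0
//		} else {
//			attempts++
//		}
//	}
//	return &SetStatusError{EvalStatus: structs.EvalStatusFailed}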

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	ws := memdb.NewWatchSet()
	s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}
	numTaskGroups := 0
	if !s.job.Stopped() {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)

	// Get the ready nodes in the required datacenters
	if !s.job.Stopped() {
		s.nodes, s.nodesByDC, err = readyNodesInDCs(s.state, s.job.Datacenters)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.ctx)
	if !s.job.Stopped() {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Error("failed to compute job allocations", "error", err)
		return false, err
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
	// plan anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If the limit of placements was reached, we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Error("failed to make next eval for rolling update", "error", err)
			return false, err
		}
		s.logger.Debug("rolling update limit reached, next eval created", "next_eval_id", s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Debug("refresh forced")
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed; potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Debug("plan didn't fully commit", "attempted", expected, "placed", actual)
		return false, nil
	}

	// Success!
	return true, nil
}
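
// Rough shape of the reconciliation input consumed below, assuming the
// diffResult type from this package's util.go (an assumption; field names
// may differ across versions). For a system job the diff is computed per
// ready node rather than per desired count: each ready node yields ignore
// (alloc up to date), update (job changed), or place (no alloc yet), while
// allocs on nodes that are gone or tainted land in stop, migrate, or lost:
//
//	type diffResult struct {
//		place, update, migrate, stop, ignore, lost []allocTuple
//	}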

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	// Filter out the allocations in a terminal state
	allocs, terminalAllocs := structs.FilterTerminalAllocs(allocs)

	// Diff the required and existing allocations
	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs, terminalAllocs)
	s.logger.Debug("reconciled current state with desired state",
		"place", len(diff.place), "update", len(diff.update),
		"migrate", len(diff.migrate), "stop", len(diff.stop),
		"ignore", len(diff.ignore), "lost", len(diff.lost))

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded, "")
	}

	// Add all the allocs to migrate
	for _, e := range diff.migrate {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNodeTainted, "")
	}

	// Lost allocations should be transitioned to desired status stop and client
	// status lost.
	for _, e := range diff.lost {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost)
	}

	// Attempt to do the upgrades in place
	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)
	diff.update = destructiveUpdates

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
		}
	}

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update)
	if !s.job.Stopped() && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}
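
	// Illustrative numbers, not from this file: with update{max_parallel = 2}
	// and ten destructive updates pending, evictAndPlace below evicts and
	// replaces two allocations, sets limitReached, and process() then creates
	// a follow-up eval after the stagger period for the remaining eight.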

	// Treat non-in-place updates as an eviction and new placement.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		if !s.job.Stopped() {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per task group
	for _, allocTuple := range diff.place {
		s.queuedAllocs[allocTuple.TaskGroup.Name] += 1
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option := s.stack.Select(missing.TaskGroup, nil)

		if option == nil {
			// If nodes were filtered because of constraint mismatches and we
			// couldn't create an allocation, then decrement queued for that
			// task group
			if s.ctx.metrics.NodesFiltered > 0 {
				s.queuedAllocs[missing.TaskGroup.Name] -= 1

				// If we are annotating the plan, then decrement the desired
				// placements based on whether the node meets the constraints
				if s.eval.AnnotatePlan && s.plan.Annotations != nil &&
					s.plan.Annotations.DesiredTGUpdates != nil {
					desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup.Name]
					desired.Place -= 1
				}
			}

			// Check if this task group has already failed
			if metric, ok := s.failedTGAllocs[missing.TaskGroup.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = s.nodesByDC

		// Compute top K scoring node metadata
		s.ctx.Metrics().PopulateScoreMetaData()

		// Set fields based on whether we found an allocation option
		if option != nil {
			resources := &structs.AllocatedResources{
				Tasks: option.TaskResources,
				Shared: structs.AllocatedSharedResources{
					DiskMB: int64(missing.TaskGroup.EphemeralDisk.SizeMB),
				},
			}

			// Create an allocation for this
			alloc := &structs.Allocation{
				ID:                 uuid.Generate(),
				Namespace:          s.job.Namespace,
				EvalID:             s.eval.ID,
				Name:               missing.Name,
				JobID:              s.job.ID,
				TaskGroup:          missing.TaskGroup.Name,
				Metrics:            s.ctx.Metrics(),
				NodeID:             option.Node.ID,
				TaskResources:      resources.OldTaskResources(),
				AllocatedResources: resources,
				DesiredStatus:      structs.AllocDesiredStatusRun,
				ClientStatus:       structs.AllocClientStatusPending,

				SharedResources: &structs.Resources{
					DiskMB: missing.TaskGroup.EphemeralDisk.SizeMB,
				},
			}

			// If the new allocation is replacing an older allocation, record the
			// older allocation's ID so that they are chained
			if missing.Alloc != nil {
				alloc.PreviousAllocation = missing.Alloc.ID
			}
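
			// Illustrative note: PreviousAllocation is the only linkage
			// recorded here; elsewhere in the codebase (an assumption, not
			// shown in this file) that chain is what lets a replacement be
			// correlated with the allocation it supersedes.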

			// If this placement involves preemption, set DesiredState to evict for those allocations
			if option.PreemptedAllocs != nil {
				var preemptedAllocIDs []string
				for _, stop := range option.PreemptedAllocs {
					s.plan.AppendPreemptedAlloc(stop, structs.AllocDesiredStatusEvict, alloc.ID)

					preemptedAllocIDs = append(preemptedAllocIDs, stop.ID)
					if s.eval.AnnotatePlan && s.plan.Annotations != nil {
						s.plan.Annotations.PreemptedAllocs = append(s.plan.Annotations.PreemptedAllocs, stop.Stub())
						if s.plan.Annotations.DesiredTGUpdates != nil {
							desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup.Name]
							desired.Preemptions += 1
						}
					}
				}
				alloc.PreemptedAllocations = preemptedAllocIDs
			}

			s.plan.AppendAlloc(alloc)
		} else {
			// Lazy initialize the failed map
			if s.failedTGAllocs == nil {
				s.failedTGAllocs = make(map[string]*structs.AllocMetric)
			}

			s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
		}
	}

	return nil
}
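
// A minimal end-to-end usage sketch, assuming the test harness and mocks
// used by this package's own tests (NewHarness, mock.Node, mock.SystemJob,
// noErr); treat it as an approximation of those tests, not a new API:
//
//	h := NewHarness(t)
//	noErr(t, h.State.UpsertNode(h.NextIndex(), mock.Node()))
//
//	job := mock.SystemJob()
//	noErr(t, h.State.UpsertJob(h.NextIndex(), job))
//
//	eval := &structs.Evaluation{
//		Namespace:   structs.DefaultNamespace,
//		ID:          uuid.Generate(),
//		Priority:    job.Priority,
//		TriggeredBy: structs.EvalTriggerJobRegister,
//		JobID:       job.ID,
//		Status:      structs.EvalStatusPending,
//	}
//	noErr(t, h.Process(NewSystemScheduler, eval))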