github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/scheduler/system_sched.go

package scheduler

import (
	"fmt"
	"log"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for system
	// jobs.
	maxSystemScheduleAttempts = 5

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "alloc not needed as node is tainted"
)

// SystemScheduler is used for 'system' jobs. This scheduler is
// designed for services that should be run on every client.
type SystemScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *SystemStack
	nodes      []*structs.Node
	nodesByDC  map[string]int

	limitReached bool
	nextEval     *structs.Evaluation

	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
	}
}

// Process is used to handle a single evaluation.
func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate,
		structs.EvalTriggerDeploymentWatcher:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusFailed, desc,
			s.queuedAllocs, "")
	}

	// Retry up to the maxSystemScheduleAttempts and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs, "")
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusComplete, "",
		s.queuedAllocs, "")
}

// process is wrapped in retryMax to iteratively run the handler until we have
// no further work or we've made the maximum number of attempts.
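// A (false, nil) return tells retryMax to run process again, for example after
// a forced state refresh or a partially committed plan, while (true, nil)
// finishes the evaluation.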
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	ws := memdb.NewWatchSet()
	s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}
	numTaskGroups := 0
	if !s.job.Stopped() {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)

	// Get the ready nodes in the required datacenters
	if !s.job.Stopped() {
		s.nodes, s.nodesByDC, err = readyNodesInDCs(s.state, s.job.Datacenters)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.ctx)
	if !s.job.Stopped() {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
	// plan anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed; this indicates a
	// potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
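// The diff computed below drives four actions: stopping allocs that are no
// longer needed, marking allocs on tainted nodes as lost, updating existing
// allocs in place where possible, and queueing new placements for the rest.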
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	// Filter out the allocations in a terminal state
	allocs, terminalAllocs := structs.FilterTerminalAllocs(allocs)

	// Diff the required and existing allocations
	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs, terminalAllocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded, "")
	}

	// Lost allocations should be transitioned to desired status stop and
	// client status lost.
	for _, e := range diff.lost {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost)
	}

	// Attempt to do the upgrades in place
	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)
	diff.update = destructiveUpdates

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
		}
	}

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update)
	if !s.job.Stopped() && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat non in-place updates as an eviction and new placement.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		if !s.job.Stopped() {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per task group
	for _, allocTuple := range diff.place {
		s.queuedAllocs[allocTuple.TaskGroup.Name] += 1
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option, _ := s.stack.Select(missing.TaskGroup, nil)

		if option == nil {
			// If nodes were filtered because of constraint mismatches and we
			// couldn't create an allocation, then decrement queued for that
			// task group
			if s.ctx.metrics.NodesFiltered > 0 {
				s.queuedAllocs[missing.TaskGroup.Name] -= 1

				// If we are annotating the plan, then decrement the desired
				// placements based on whether the node meets the constraints
				if s.eval.AnnotatePlan && s.plan.Annotations != nil &&
					s.plan.Annotations.DesiredTGUpdates != nil {
					desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup.Name]
					desired.Place -= 1
				}
			}

			// Check if this task group has already failed
			if metric, ok := s.failedTGAllocs[missing.TaskGroup.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = s.nodesByDC

		// Set fields based on whether we found an allocation option
		if option != nil {
			// Create an allocation for this
			alloc := &structs.Allocation{
				ID:            uuid.Generate(),
				Namespace:     s.job.Namespace,
				EvalID:        s.eval.ID,
				Name:          missing.Name,
				JobID:         s.job.ID,
				TaskGroup:     missing.TaskGroup.Name,
				Metrics:       s.ctx.Metrics(),
				NodeID:        option.Node.ID,
				TaskResources: option.TaskResources,
				DesiredStatus: structs.AllocDesiredStatusRun,
				ClientStatus:  structs.AllocClientStatusPending,

				SharedResources: &structs.Resources{
					DiskMB: missing.TaskGroup.EphemeralDisk.SizeMB,
				},
			}

			// If the new allocation is replacing an older allocation then we
			// record the older allocation's ID so that they are chained
			if missing.Alloc != nil {
				alloc.PreviousAllocation = missing.Alloc.ID
			}

			s.plan.AppendAlloc(alloc)
		} else {
			// Lazy initialize the failed map
			if s.failedTGAllocs == nil {
				s.failedTGAllocs = make(map[string]*structs.AllocMetric)
			}

			s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
		}
	}

	return nil
}
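
For orientation only, here is a minimal, hypothetical sketch (not part of the original file) of how this scheduler is driven from the outside: a caller holding implementations of the package's State and Planner interfaces builds a scheduler through the factory and hands it a single evaluation. The stateStore and planner parameters and the processSystemEval wrapper are placeholders introduced for illustration, not real Nomad components.

package schedexample

import (
	"log"
	"os"

	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler"
)

// processSystemEval is a hypothetical helper: it constructs a system scheduler
// via NewSystemScheduler and processes one evaluation. stateStore and planner
// are assumed to satisfy scheduler.State and scheduler.Planner.
func processSystemEval(stateStore scheduler.State, planner scheduler.Planner, eval *structs.Evaluation) error {
	logger := log.New(os.Stderr, "sched: ", log.LstdFlags)
	sched := scheduler.NewSystemScheduler(logger, stateStore, planner)
	return sched.Process(eval)
}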