github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/scheduler/system_sched.go

package scheduler

import (
	"fmt"
	"log"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for system
	// jobs.
	maxSystemScheduleAttempts = 5

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "alloc not needed as node is tainted"
)

// SystemScheduler is used for 'system' jobs. This scheduler is
// designed for services that should be run on every client.
type SystemScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *SystemStack
	nodes      []*structs.Node
	nodesByDC  map[string]int

	limitReached bool
	nextEval     *structs.Evaluation

	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
	}
}

// Process is used to handle a single evaluation.
func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate,
		structs.EvalTriggerDeploymentWatcher:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusFailed, desc,
			s.queuedAllocs, "")
	}

	// Retry up to maxSystemScheduleAttempts and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs, "")
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusComplete, "",
		s.queuedAllocs, "")
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	ws := memdb.NewWatchSet()
	s.job, err = s.state.JobByID(ws, s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}
	numTaskGroups := 0
	if !s.job.Stopped() {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)

	// Get the ready nodes in the required datacenters
	if !s.job.Stopped() {
		s.nodes, s.nodesByDC, err = readyNodesInDCs(s.state, s.job.Datacenters)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.ctx)
	if !s.job.Stopped() {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
	// plan anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	allocs, err := s.state.AllocsByJob(ws, s.eval.JobID, true)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	// Filter out the allocations in a terminal state
	allocs, terminalAllocs := structs.FilterTerminalAllocs(allocs)

	// Diff the required and existing allocations
	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs, terminalAllocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded, "")
	}

	// Lost allocations should be transitioned to desired status stop and client
	// status lost.
	for _, e := range diff.lost {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost)
	}

	// Attempt to do the upgrades in place
	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)
	diff.update = destructiveUpdates

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
		}
	}

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update)
	if !s.job.Stopped() && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat non-in-place updates as an eviction and new placement.
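	// evictAndPlace marks each of these updates for eviction and re-queues it
	// for placement, consuming the rolling-update limit as it goes; it reports
	// true once the limit is exhausted, which is recorded here so the follow-up
	// rolling evaluation is created in process above.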
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		if !s.job.Stopped() {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per task group
	for _, allocTuple := range diff.place {
		s.queuedAllocs[allocTuple.TaskGroup.Name] += 1
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option, _ := s.stack.Select(missing.TaskGroup)

		if option == nil {
			// If nodes were filtered because of constraint mismatches and we
			// couldn't create an allocation, decrement queued for that
			// task group
			if s.ctx.metrics.NodesFiltered > 0 {
				s.queuedAllocs[missing.TaskGroup.Name] -= 1

				// If we are annotating the plan, then decrement the desired
				// placements based on whether the node meets the constraints
				if s.eval.AnnotatePlan && s.plan.Annotations != nil &&
					s.plan.Annotations.DesiredTGUpdates != nil {
					desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup.Name]
					desired.Place -= 1
				}
			}

			// Check if this task group has already failed
			if metric, ok := s.failedTGAllocs[missing.TaskGroup.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = s.nodesByDC

		// Set fields based on whether we found an allocation option
		if option != nil {
			// Create an allocation for this
			alloc := &structs.Allocation{
				ID:            structs.GenerateUUID(),
				EvalID:        s.eval.ID,
				Name:          missing.Name,
				JobID:         s.job.ID,
				TaskGroup:     missing.TaskGroup.Name,
				Metrics:       s.ctx.Metrics(),
				NodeID:        option.Node.ID,
				TaskResources: option.TaskResources,
				DesiredStatus: structs.AllocDesiredStatusRun,
				ClientStatus:  structs.AllocClientStatusPending,

				SharedResources: &structs.Resources{
					DiskMB: missing.TaskGroup.EphemeralDisk.SizeMB,
				},
			}

			// If the new allocation is replacing an older allocation then we
			// record the older allocation ID so that they are chained
			if missing.Alloc != nil {
				alloc.PreviousAllocation = missing.Alloc.ID
			}

			s.plan.AppendAlloc(alloc)
		} else {
			// Lazily initialize the failed map
			if s.failedTGAllocs == nil {
				s.failedTGAllocs = make(map[string]*structs.AllocMetric)
			}

			s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
		}
	}

	return nil
}