github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/scheduler/system_sched.go

package scheduler

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for system
	// jobs.
	maxSystemScheduleAttempts = 5

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "system alloc not needed as node is tainted"
)

// SystemScheduler is used for 'system' jobs. This scheduler is
// designed for services that should be run on every client.
type SystemScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *SystemStack
	nodes      []*structs.Node
	nodesByDC  map[string]int

	limitReached bool
	nextEval     *structs.Evaluation

	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
	}
}

// Process is used to handle a single evaluation.
func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusFailed, desc,
			s.queuedAllocs)
	}

	// Retry up to maxSystemScheduleAttempts, resetting the count if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs)
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusComplete, "",
		s.queuedAllocs)
}

// process is wrapped in retryMax to iteratively run the handler until we have
// no further work or we've made the maximum number of attempts.
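// It returns true when no further work remains (the plan was a no-op or was
// fully committed) and false when scheduling should be retried, for example
// after a forced state refresh or a partially committed plan.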
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	s.job, err = s.state.JobByID(s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}
	numTaskGroups := 0
	if s.job != nil {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)

	// Get the ready nodes in the required datacenters
	if s.job != nil {
		s.nodes, s.nodesByDC, err = readyNodesInDCs(s.state, s.job.Datacenters)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.ctx)
	if s.job != nil {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
	// plan anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
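// It stops allocations that are no longer needed, marks allocations on
// tainted nodes as lost, attempts in-place updates, applies the rolling
// update limit to destructive updates, and queues any remaining placements
// via computePlacements.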
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	allocs, err := s.state.AllocsByJob(s.eval.JobID)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	// Filter out the allocations in a terminal state
	allocs, terminalAllocs := structs.FilterTerminalAllocs(allocs)

	// Diff the required and existing allocations
	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs, terminalAllocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded, "")
	}

	// Lost allocations should be transitioned to desired status stop and
	// client status lost.
	for _, e := range diff.lost {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost)
	}

	// Attempt to do the upgrades in place
	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)
	diff.update = destructiveUpdates

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
		}
	}

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update)
	if s.job != nil && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat non in-place updates as an eviction and new placement.
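	// evictAndPlace is a scheduler helper defined outside this file; it is
	// expected to append stop updates for these destructive updates to the
	// plan, move them into diff.place, and decrement limit, returning true
	// once the rolling-update limit is exhausted. limitReached is later used
	// by process() to create the follow-up evaluation after the stagger
	// period.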
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		if s.job != nil {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per task group
	for _, allocTuple := range diff.place {
		s.queuedAllocs[allocTuple.TaskGroup.Name] += 1
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option, _ := s.stack.Select(missing.TaskGroup)

		if option == nil {
			// If nodes were filtered because of constraint mismatches and we
			// couldn't create an allocation, then decrement queued for that
			// task group.
			if s.ctx.metrics.NodesFiltered > 0 {
				s.queuedAllocs[missing.TaskGroup.Name] -= 1

				// If we are annotating the plan, then decrement the desired
				// placements based on whether the node meets the constraints
				if s.eval.AnnotatePlan && s.plan.Annotations != nil &&
					s.plan.Annotations.DesiredTGUpdates != nil {
					desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup.Name]
					desired.Place -= 1
				}
			}

			// Check if this task group has already failed
			if metric, ok := s.failedTGAllocs[missing.TaskGroup.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = s.nodesByDC

		// Set fields based on whether we found an allocation option
		if option != nil {
			// Create an allocation for this
			alloc := &structs.Allocation{
				ID:            structs.GenerateUUID(),
				EvalID:        s.eval.ID,
				Name:          missing.Name,
				JobID:         s.job.ID,
				TaskGroup:     missing.TaskGroup.Name,
				Metrics:       s.ctx.Metrics(),
				NodeID:        option.Node.ID,
				TaskResources: option.TaskResources,
				DesiredStatus: structs.AllocDesiredStatusRun,
				ClientStatus:  structs.AllocClientStatusPending,

				SharedResources: &structs.Resources{
					DiskMB: missing.TaskGroup.EphemeralDisk.SizeMB,
				},
			}

			// If the new allocation is replacing an older allocation then we
			// record the older allocation id so that they are chained
			if missing.Alloc != nil {
				alloc.PreviousAllocation = missing.Alloc.ID
			}

			s.plan.AppendAlloc(alloc)
		} else {
			// Lazy initialize the failed map
			if s.failedTGAllocs == nil {
				s.failedTGAllocs = make(map[string]*structs.AllocMetric)
			}

			s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
		}
	}

	return nil
}