github.com/ncodes/nomad@v0.5.7-0.20170403112158-97adf4a74fb3/scheduler/system_sched.go (about)

package scheduler

import (
	"fmt"
	"log"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/ncodes/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for system
	// jobs.
	maxSystemScheduleAttempts = 5

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "system alloc not needed as node is tainted"
)

// SystemScheduler is used for 'system' jobs. This scheduler is
// designed for services that should be run on every client.
type SystemScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *SystemStack
	nodes      []*structs.Node
	nodesByDC  map[string]int

	limitReached bool
	nextEval     *structs.Evaluation

	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
	}
}

// Process is used to handle a single evaluation.
func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusFailed, desc,
			s.queuedAllocs)
	}

	// Retry up to maxSystemScheduleAttempts times and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs)
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusComplete, "",
		s.queuedAllocs)
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	ws := memdb.NewWatchSet()
	s.job, err = s.state.JobByID(ws, s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}
	numTaskGroups := 0
	if s.job != nil {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)

	// Get the ready nodes in the required datacenters
	if s.job != nil {
		s.nodes, s.nodesByDC, err = readyNodesInDCs(s.state, s.job.Datacenters)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.ctx)
	if s.job != nil {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
	// plan anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	allocs, err := s.state.AllocsByJob(ws, s.eval.JobID, true)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	// Filter out the allocations in a terminal state
	allocs, terminalAllocs := structs.FilterTerminalAllocs(allocs)

	// Diff the required and existing allocations
	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs, terminalAllocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded, "")
	}

	// Lost allocations should be transitioned to desired status stop and client
	// status lost.
	for _, e := range diff.lost {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost)
	}

	// Attempt to do the upgrades in place
	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)
	diff.update = destructiveUpdates

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
		}
	}

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update)
	if s.job != nil && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat non in-place updates as an eviction and new placement.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		if s.job != nil {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per task group
	for _, allocTuple := range diff.place {
		s.queuedAllocs[allocTuple.TaskGroup.Name] += 1
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option, _ := s.stack.Select(missing.TaskGroup)

		if option == nil {
			// If nodes were filtered because of constraint mismatches and we
			// couldn't create an allocation, then decrement queued for that
			// task group.
			if s.ctx.metrics.NodesFiltered > 0 {
				s.queuedAllocs[missing.TaskGroup.Name] -= 1

				// If we are annotating the plan, then decrement the desired
				// placements based on whether the node meets the constraints
				if s.eval.AnnotatePlan && s.plan.Annotations != nil &&
					s.plan.Annotations.DesiredTGUpdates != nil {
					desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup.Name]
					desired.Place -= 1
				}
			}

			// Check if this task group has already failed
			if metric, ok := s.failedTGAllocs[missing.TaskGroup.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = s.nodesByDC

		// Set fields based on if we found an allocation option
		if option != nil {
			// Create an allocation for this
			alloc := &structs.Allocation{
				ID:            structs.GenerateUUID(),
				EvalID:        s.eval.ID,
				Name:          missing.Name,
				JobID:         s.job.ID,
				TaskGroup:     missing.TaskGroup.Name,
				Metrics:       s.ctx.Metrics(),
				NodeID:        option.Node.ID,
				TaskResources: option.TaskResources,
				DesiredStatus: structs.AllocDesiredStatusRun,
				ClientStatus:  structs.AllocClientStatusPending,

				SharedResources: &structs.Resources{
					DiskMB: missing.TaskGroup.EphemeralDisk.SizeMB,
				},
			}

			// If the new allocation is replacing an older allocation then we
			// record the older allocation ID so that they are chained
			if missing.Alloc != nil {
				alloc.PreviousAllocation = missing.Alloc.ID
			}

			s.plan.AppendAlloc(alloc)
		} else {
			// Lazily initialize the failed map
			if s.failedTGAllocs == nil {
				s.failedTGAllocs = make(map[string]*structs.AllocMetric)
			}

			s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
		}
	}

	return nil
}
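
// Illustrative usage sketch (not part of the original file): NewSystemScheduler
// is normally not called directly. Elsewhere in this package, scheduler.go is
// expected to expose a Factory type and a registry (assumed here to be named
// BuiltinSchedulers) that maps the job type "system" to this constructor, so a
// worker can instantiate the scheduler by name and drive it with Process. A
// minimal sketch under those assumptions:
//
//	factory := BuiltinSchedulers["system"]       // assumed registry in scheduler.go
//	sched := factory(logger, state, planner)     // returns a *SystemScheduler as a Scheduler
//	if err := sched.Process(eval); err != nil {  // eval is the *structs.Evaluation to handle
//		logger.Printf("[ERR] sched: failed to process eval: %v", err)
//	}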