github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/scheduler/system_sched.go

package scheduler

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for system
	// jobs.
	maxSystemScheduleAttempts = 5

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "system alloc not needed as node is tainted"
)

// SystemScheduler is used for 'system' jobs. This scheduler is
// designed for services that should be run on every client.
type SystemScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *SystemStack
	nodes      []*structs.Node
	nodesByDC  map[string]int

	limitReached bool
	nextEval     *structs.Evaluation
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
	}
}

// Process is used to handle a single evaluation.
func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, structs.EvalStatusFailed, desc)
	}

	// Retry up to maxSystemScheduleAttempts, resetting the attempt count whenever progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, statusErr.EvalStatus, err.Error())
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, structs.EvalStatusComplete, "")
}
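
// A rough sketch of the retry loop Process wires up above. retryMax and
// progressMade are helpers defined elsewhere in this package; the loop below
// only illustrates the contract this scheduler relies on, it is not their
// actual implementation:
//
//	// illustrative only: cb is s.process, reset is the progress closure
//	for attempts := 0; attempts < maxSystemScheduleAttempts; {
//		done, err := cb()
//		if done || err != nil {
//			break // finished, or a hard failure such as *SetStatusError
//		}
//		if reset() {
//			attempts = 0 // progress on the last plan resets the attempt budget
//		} else {
//			attempts++
//		}
//	}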

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	s.job, err = s.state.JobByID(s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}

	// Get the ready nodes in the required datacenters
	if s.job != nil {
		s.nodes, s.nodesByDC, err = readyNodesInDCs(s.state, s.job.Datacenters)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Reset the failed allocations
	s.eval.FailedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.ctx)
	if s.job != nil {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the plan
	// anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If the limit of placements was reached, we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed; potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		return false, nil
	}

	// Success!
	return true, nil
}
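
// For orientation before computeJobAllocs below: the result of diffSystemAllocs
// is consumed in three buckets. This summary reflects how the code in this file
// uses the diff; it is not a definition of the diff type itself:
//
//	diff.stop   // appended to the plan as AllocDesiredStatusStop updates
//	diff.update // tried in place first; the destructive remainder is evicted and re-placed
//	diff.place  // handed to computePlacements, one target node per allocTuple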

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	allocs, err := s.state.AllocsByJob(s.eval.JobID)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Filter out the allocations in a terminal state
	allocs = structs.FilterTerminalAllocs(allocs)

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Diff the required and existing allocations
	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded)
	}

	// Attempt to do the upgrades in place
	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)
	diff.update = destructiveUpdates

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
		}
	}

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update)
	if s.job != nil && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat non-in-place updates as an eviction and new placement.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		return nil
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option, _ := s.stack.Select(missing.TaskGroup)

		if option == nil {
			// Check if this task group has already failed
			if metric, ok := s.eval.FailedTGAllocs[missing.TaskGroup.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = s.nodesByDC

		// Set fields based on whether we found an allocation option
		if option != nil {
			// Create an allocation for this placement
			alloc := &structs.Allocation{
				ID:            structs.GenerateUUID(),
				EvalID:        s.eval.ID,
				Name:          missing.Name,
				JobID:         s.job.ID,
				TaskGroup:     missing.TaskGroup.Name,
				Metrics:       s.ctx.Metrics(),
				NodeID:        option.Node.ID,
				TaskResources: option.TaskResources,
				DesiredStatus: structs.AllocDesiredStatusRun,
				ClientStatus:  structs.AllocClientStatusPending,
			}

			// Generate service IDs for the tasks in this allocation
			// COMPAT - This is no longer required and would be removed in v0.4
			alloc.PopulateServiceIDs(missing.TaskGroup)

			s.plan.AppendAlloc(alloc)
		} else {
			// Lazy initialize the failed map
			if s.eval.FailedTGAllocs == nil {
				s.eval.FailedTGAllocs = make(map[string]*structs.AllocMetric)
			}

			s.eval.FailedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
		}
	}

	return nil
}
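
// A minimal usage sketch, assuming a State snapshot, a Planner implementation and
// a system-job Evaluation are already available; the variable names below are
// placeholders, not values defined in this file:
//
//	sched := NewSystemScheduler(logger, state, planner)
//	if err := sched.Process(eval); err != nil {
//		logger.Printf("[ERR] sched: eval %q failed: %v", eval.ID, err)
//	}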