package scheduler

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for system
	// jobs.
	maxSystemScheduleAttempts = 5

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "system alloc not needed as node is tainted"
)

// SystemScheduler is used for 'system' jobs. This scheduler is
// designed for services that should be run on every client.
type SystemScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner

	// Per-evaluation scheduling state, populated by process() on each
	// attempt.
	eval  *structs.Evaluation
	job   *structs.Job
	plan  *structs.Plan
	ctx   *EvalContext
	stack *SystemStack
	nodes []*structs.Node

	// limitReached is set by computeJobAllocs when a rolling-update limit
	// stopped further destructive updates; nextEval is the follow-up
	// evaluation created by process() in that case.
	limitReached bool
	nextEval     *structs.Evaluation
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
	}
}

// Process is used to handle a single evaluation.
50 func (s *SystemScheduler) Process(eval *structs.Evaluation) error { 51 // Store the evaluation 52 s.eval = eval 53 54 // Verify the evaluation trigger reason is understood 55 switch eval.TriggeredBy { 56 case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, 57 structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate: 58 default: 59 desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", 60 eval.TriggeredBy) 61 return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusFailed, desc) 62 } 63 64 // Retry up to the maxSystemScheduleAttempts 65 if err := retryMax(maxSystemScheduleAttempts, s.process); err != nil { 66 if statusErr, ok := err.(*SetStatusError); ok { 67 return setStatus(s.logger, s.planner, s.eval, s.nextEval, statusErr.EvalStatus, err.Error()) 68 } 69 return err 70 } 71 72 // Update the status to complete 73 return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusComplete, "") 74 } 75 76 // process is wrapped in retryMax to iteratively run the handler until we have no 77 // further work or we've made the maximum number of attempts. 
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID. A nil job is tolerated below (e.g. the job was
	// deregistered); the plan is still built so existing allocs are handled.
	var err error
	s.job, err = s.state.JobByID(s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}

	// Get the ready nodes in the required datacenters
	if s.job != nil {
		s.nodes, err = readyNodesInDCs(s.state, s.job.Datacenters)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.ctx)
	if s.job != nil {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail. Returning true tells retryMax
	// that no further attempts are needed.
	if s.plan.IsNoOp() {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pickup from here after the stagger period. nextEval is only created
	// once per Process call (guarded by the nil check).
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	if err != nil {
		return false, err
	}

	// If we got a state refresh, try again since we have stale data.
	// Returning (false, nil) asks retryMax for another attempt against the
	// refreshed state snapshot.
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	allocs, err := s.state.AllocsByJob(s.eval.JobID)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Filter out the allocations in a terminal state
	allocs = structs.FilterTerminalAllocs(allocs)

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Diff the required and existing allocations
	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded)
	}

	// Attempt to do the upgrades in place
	diff.update = inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)

	// Check if a rolling upgrade strategy is being used; if so, cap the
	// number of destructive updates handled in this pass at MaxParallel.
	limit := len(diff.update)
	if s.job != nil && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat non in-place updates as an eviction and new placement.
	// limitReached signals process() to create a follow-up rolling eval.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		return nil
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	// Index the ready nodes by ID so each placement can be matched to the
	// node the diff assigned it.
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

	// Track the failed task groups so that we can coalesce
	// the failures together to avoid creating many failed allocs.
	failedTG := make(map[*structs.TaskGroup]*structs.Allocation)

	// Reusable single-element slice: each system placement is constrained to
	// exactly one node.
	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option, size := s.stack.Select(missing.TaskGroup)

		if option == nil {
			// Check if this task group has already failed; if so, just bump
			// the coalesced counter on the existing failed alloc rather than
			// creating another one.
			if alloc, ok := failedTG[missing.TaskGroup]; ok {
				alloc.Metrics.CoalescedFailures += 1
				continue
			}
		}

		// Create an allocation for this
		alloc := &structs.Allocation{
			ID:        structs.GenerateUUID(),
			EvalID:    s.eval.ID,
			Name:      missing.Name,
			JobID:     s.job.ID,
			Job:       s.job,
			TaskGroup: missing.TaskGroup.Name,
			Resources: size,
			Metrics:   s.ctx.Metrics(),
		}

		// Set fields based on if we found an allocation option
		if option != nil {
			alloc.NodeID = option.Node.ID
			alloc.TaskResources = option.TaskResources
			alloc.DesiredStatus = structs.AllocDesiredStatusRun
			alloc.ClientStatus = structs.AllocClientStatusPending
			s.plan.AppendAlloc(alloc)
		} else {
			// First failure for this task group: record one failed alloc and
			// remember it so later failures coalesce into its metrics.
			alloc.DesiredStatus = structs.AllocDesiredStatusFailed
			alloc.DesiredDescription = "failed to find a node for placement"
			alloc.ClientStatus = structs.AllocClientStatusFailed
			s.plan.AppendFailed(alloc)
			failedTG[missing.TaskGroup] = alloc
		}
	}
	return nil
}