github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/scheduler/system_sched.go

package scheduler

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for system
	// jobs.
	maxSystemScheduleAttempts = 5

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "system alloc not needed as node is tainted"
)

// SystemScheduler is used for 'system' jobs. This scheduler is
// designed for services that should be run on every client.
type SystemScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner

	eval      *structs.Evaluation
	job       *structs.Job
	plan      *structs.Plan
	ctx       *EvalContext
	stack     *SystemStack
	nodes     []*structs.Node
	nodesByDC map[string]int

	limitReached bool
	nextEval     *structs.Evaluation
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
	}
}

// Process is used to handle a single evaluation.
func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusFailed, desc)
	}

	// Retry up to the maxSystemScheduleAttempts
	if err := retryMax(maxSystemScheduleAttempts, s.process); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, statusErr.EvalStatus, err.Error())
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusComplete, "")
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	s.job, err = s.state.JobByID(s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}

	// Get the ready nodes in the required datacenters
	if s.job != nil {
		s.nodes, s.nodesByDC, err = readyNodesInDCs(s.state, s.job.Datacenters)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.ctx)
	if s.job != nil {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail
	if s.plan.IsNoOp() {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	if err != nil {
		return false, err
	}

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		return false, nil
	}

	// Success!
	return true, nil
}
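// For context, process is invoked through retryMax in Process above. A rough sketch of
// the contract it is written against (illustrative only; the real retryMax helper lives
// elsewhere in this package and may differ in detail):
//
//	// retryMax calls cb until it reports done, returns an error, or max attempts are used.
//	func retryMax(max int, cb func() (bool, error)) error {
//		for i := 0; i < max; i++ {
//			done, err := cb()
//			if err != nil {
//				return err
//			}
//			if done {
//				return nil
//			}
//		}
//		// Exhausting the attempts is itself a failure.
//		return fmt.Errorf("maximum attempts reached (%d)", max)
//	}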
// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	allocs, err := s.state.AllocsByJob(s.eval.JobID)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Filter out the allocations in a terminal state
	allocs = structs.FilterTerminalAllocs(allocs)

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Diff the required and existing allocations
	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded)
	}

	// Attempt to do the upgrades in place
	diff.update = inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update)
	if s.job != nil && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat non in-place updates as an eviction and new placement.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		return nil
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

	// Track the failed task groups so that we can coalesce
	// the failures together to avoid creating many failed allocs.
	failedTG := make(map[*structs.TaskGroup]*structs.Allocation)

	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option, size := s.stack.Select(missing.TaskGroup)

		if option == nil {
			// Check if this task group has already failed
			if alloc, ok := failedTG[missing.TaskGroup]; ok {
				alloc.Metrics.CoalescedFailures += 1
				continue
			}
		}

		// Create an allocation for this
		alloc := &structs.Allocation{
			ID:        structs.GenerateUUID(),
			EvalID:    s.eval.ID,
			Name:      missing.Name,
			JobID:     s.job.ID,
			Job:       s.job,
			TaskGroup: missing.TaskGroup.Name,
			Resources: size,
			Metrics:   s.ctx.Metrics(),
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = s.nodesByDC

		// Set fields based on if we found an allocation option
		if option != nil {
			// Generate the service ids for the tasks that this allocation is going
			// to run
			alloc.PopulateServiceIDs()

			alloc.NodeID = option.Node.ID
			alloc.TaskResources = option.TaskResources
			alloc.DesiredStatus = structs.AllocDesiredStatusRun
			alloc.ClientStatus = structs.AllocClientStatusPending
			alloc.TaskStates = initTaskState(missing.TaskGroup, structs.TaskStatePending)
			s.plan.AppendAlloc(alloc)
		} else {
			alloc.DesiredStatus = structs.AllocDesiredStatusFailed
			alloc.DesiredDescription = "failed to find a node for placement"
			alloc.ClientStatus = structs.AllocClientStatusFailed
			alloc.TaskStates = initTaskState(missing.TaskGroup, structs.TaskStateDead)
			s.plan.AppendFailed(alloc)
			failedTG[missing.TaskGroup] = alloc
		}
	}
	return nil
}
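
// Illustrative usage (not part of the original file): a minimal sketch of how a caller
// might drive the system scheduler for one evaluation. The state and planner arguments
// stand in for concrete State and Planner implementations supplied by the caller, and
// runSystemEval is a hypothetical helper name used only for this example.
func runSystemEval(logger *log.Logger, state State, planner Planner, eval *structs.Evaluation) error {
	// NewSystemScheduler only wires up dependencies; all of the work happens in
	// Process, which retries internally and records the final evaluation status.
	sched := NewSystemScheduler(logger, state, planner)
	return sched.Process(eval)
}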