github.com/ranjib/nomad@v0.1.1-0.20160225204057-97751b02f70b/scheduler/system_sched.go

package scheduler

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for system
	// jobs.
	maxSystemScheduleAttempts = 5

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "system alloc not needed as node is tainted"
)

// SystemScheduler is used for 'system' jobs. This scheduler is
// designed for services that should be run on every client.
type SystemScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *SystemStack
	nodes      []*structs.Node
	nodesByDC  map[string]int

	limitReached bool
	nextEval     *structs.Evaluation
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
	}
}

// Process is used to handle a single evaluation.
func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusFailed, desc)
	}

	// Retry up to maxSystemScheduleAttempts, resetting the count whenever
	// progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, statusErr.EvalStatus, err.Error())
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusComplete, "")
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
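// It returns true when there is nothing left to do (the plan was a no-op or
// was fully committed), and false when another attempt is needed, e.g. after
// a forced state refresh or a partially committed plan.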
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	s.job, err = s.state.JobByID(s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}

	// Get the ready nodes in the required datacenters
	if s.job != nil {
		s.nodes, s.nodesByDC, err = readyNodesInDCs(s.state, s.job.Datacenters)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.ctx)
	if s.job != nil {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail
	if s.plan.IsNoOp() {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
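// The diff produced here is split into allocations to place, to update
// (in place or via evict-and-replace), and to stop; placements are then
// delegated to computePlacements.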
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	allocs, err := s.state.AllocsByJob(s.eval.JobID)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Filter out the allocations in a terminal state
	allocs = structs.FilterTerminalAllocs(allocs)

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Diff the required and existing allocations
	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded)
	}

	// Attempt to do the upgrades in place
	diff.update = inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update)
	if s.job != nil && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat non in-place updates as an eviction and new placement.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		return nil
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

	// Track the failed task groups so that we can coalesce
	// the failures together to avoid creating many failed allocs.
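	// Only the first failure for a task group produces a failed allocation;
	// subsequent failures for the same group increment its CoalescedFailures
	// counter instead of creating additional allocations.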
	failedTG := make(map[*structs.TaskGroup]*structs.Allocation)

	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option, size := s.stack.Select(missing.TaskGroup)

		if option == nil {
			// Check if this task group has already failed
			if alloc, ok := failedTG[missing.TaskGroup]; ok {
				alloc.Metrics.CoalescedFailures += 1
				continue
			}
		}

		// Create an allocation for this
		alloc := &structs.Allocation{
			ID:        structs.GenerateUUID(),
			EvalID:    s.eval.ID,
			Name:      missing.Name,
			JobID:     s.job.ID,
			TaskGroup: missing.TaskGroup.Name,
			Resources: size,
			Metrics:   s.ctx.Metrics(),
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = s.nodesByDC

		// Set fields based on if we found an allocation option
		if option != nil {
			// Generate service IDs for the tasks in this allocation
			alloc.PopulateServiceIDs(missing.TaskGroup)

			alloc.NodeID = option.Node.ID
			alloc.TaskResources = option.TaskResources
			alloc.DesiredStatus = structs.AllocDesiredStatusRun
			alloc.ClientStatus = structs.AllocClientStatusPending
			alloc.TaskStates = initTaskState(missing.TaskGroup, structs.TaskStatePending)
			s.plan.AppendAlloc(alloc)
		} else {
			alloc.DesiredStatus = structs.AllocDesiredStatusFailed
			alloc.DesiredDescription = "failed to find a node for placement"
			alloc.ClientStatus = structs.AllocClientStatusFailed
			alloc.TaskStates = initTaskState(missing.TaskGroup, structs.TaskStateDead)
			s.plan.AppendFailed(alloc)
			failedTG[missing.TaskGroup] = alloc
		}
	}
	return nil
}
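// Illustrative sketch (not part of the original source): how a caller might
// drive this scheduler for a single evaluation. The logger, state store,
// planner, and eval values are assumed to be supplied by the surrounding
// scheduling worker; only NewSystemScheduler and Process are defined in this
// file.
//
//	sched := NewSystemScheduler(logger, stateStore, planner)
//	if err := sched.Process(eval); err != nil {
//		logger.Printf("[ERR] sched: eval %q failed: %v", eval.ID, err)
//	}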