github.com/ranjib/nomad@v0.1.1-0.20160225204057-97751b02f70b/scheduler/generic_sched.go

package scheduler

import (
	"fmt"
	"log"

	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxServiceScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for services.
	maxServiceScheduleAttempts = 5

	// maxBatchScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for batch.
	maxBatchScheduleAttempts = 2

	// allocNotNeeded is the status used when a job no longer requires an allocation
	allocNotNeeded = "alloc not needed due to job update"

	// allocMigrating is the status used when we must migrate an allocation
	allocMigrating = "alloc is being migrated"

	// allocUpdating is the status used when a job requires an update
	allocUpdating = "alloc is being updated due to job update"

	// allocInPlace is the status used when speculating on an in-place update
	allocInPlace = "alloc updating in-place"
)

// SetStatusError is used to set the status of the evaluation to the given error
type SetStatusError struct {
	Err        error
	EvalStatus string
}

func (s *SetStatusError) Error() string {
	return s.Err.Error()
}

// GenericScheduler is used for 'service' and 'batch' type jobs. This scheduler is
// designed for long-lived services, and as such spends more time attempting
// to make a high quality placement. This is the primary scheduler for
// most workloads. It also supports a 'batch' mode to optimize for fast decision
// making at the cost of quality.
type GenericScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner
	batch   bool

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *GenericStack

	limitReached bool
	nextEval     *structs.Evaluation

	blocked *structs.Evaluation
}

// NewServiceScheduler is a factory function to instantiate a new service scheduler
func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   false,
	}
	return s
}

// NewBatchScheduler is a factory function to instantiate a new batch scheduler
func NewBatchScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   true,
	}
	return s
}

// Process is used to handle a single evaluation
func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate,
		structs.EvalTriggerPeriodicJob:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusFailed, desc)
	}

	// Retry up to the maxScheduleAttempts and reset if progress is made.
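	// Only back-to-back attempts that make no forward progress count against
	// the limit: the progress callback checks the last plan result, and an
	// attempt that actually placed or updated allocations resets the counter.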
	progress := func() bool { return progressMade(s.planResult) }
	limit := maxServiceScheduleAttempts
	if s.batch {
		limit = maxBatchScheduleAttempts
	}
	if err := retryMax(limit, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			// Scheduling was tried but made no forward progress so create a
			// blocked eval to retry once resources become available.
			var mErr multierror.Error
			if err := s.createBlockedEval(); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			if err := setStatus(s.logger, s.planner, s.eval, s.nextEval, statusErr.EvalStatus, err.Error()); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			return mErr.ErrorOrNil()
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusComplete, "")
}

// createBlockedEval creates a blocked eval and stores it.
func (s *GenericScheduler) createBlockedEval() error {
	e := s.ctx.Eligibility()
	escaped := e.HasEscaped()

	// Only store the eligible classes if the eval hasn't escaped.
	var classEligibility map[string]bool
	if !escaped {
		classEligibility = e.GetClasses()
	}

	s.blocked = s.eval.BlockedEval(classEligibility, escaped)
	return s.planner.CreateEval(s.blocked)
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *GenericScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	s.job, err = s.state.JobByID(s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewGenericStack(s.batch, s.ctx)
	if s.job != nil {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail
	if s.plan.IsNoOp() {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// If there are failed allocations, we need to create a blocked evaluation
	// to place the failed allocations when resources become available.
	if len(s.plan.FailedAllocs) != 0 && s.blocked == nil {
		if err := s.createBlockedEval(); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make blocked eval: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: failed to place all allocations, blocked eval '%s' created", s.eval, s.blocked.ID)
	}

	// Submit the plan and store the results.
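	// The planner returns the portion of the plan that was applied and, when
	// the plan was evaluated against newer state than ours, a refreshed state
	// snapshot; a non-nil snapshot means our data was stale and the pass is
	// retried below.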
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		if newState == nil {
			return false, fmt.Errorf("missing state refresh after partial commit")
		}
		return false, nil
	}

	// Success!
	return true, nil
}

// filterCompleteAllocs filters allocations that are terminal and should be
// re-placed.
func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) []*structs.Allocation {
	filter := func(a *structs.Allocation) bool {
		// Allocs from batch jobs should be filtered when their status is failed so that
		// they will be replaced. If they are dead but not failed, they
		// shouldn't be replaced.
		if s.batch {
			return a.ClientStatus == structs.AllocClientStatusFailed
		}

		// Filter terminal, non batch allocations
		return a.TerminalStatus()
	}

	n := len(allocs)
	for i := 0; i < n; i++ {
		if filter(allocs[i]) {
			allocs[i], allocs[n-1] = allocs[n-1], nil
			i--
			n--
		}
	}
	return allocs[:n]
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *GenericScheduler) computeJobAllocs() error {
	// Materialize all the task groups, job could be missing if deregistered
	var groups map[string]*structs.TaskGroup
	if s.job != nil {
		groups = materializeTaskGroups(s.job)
	}

	// Lookup the allocations by JobID
	allocs, err := s.state.AllocsByJob(s.eval.JobID)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Filter out the allocations in a terminal state
	allocs = s.filterCompleteAllocs(allocs)

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Diff the required and existing allocations
	diff := diffAllocs(s.job, tainted, groups, allocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded)
	}

	// Attempt to do the upgrades in place
	diff.update = inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update) + len(diff.migrate)
	if s.job != nil && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat migrations as an eviction and a new placement.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.migrate, allocMigrating, &limit)

	// Treat non in-place updates as an eviction and new placement.
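	// The limit counter is shared with the migration pass above, and the OR
	// preserves a limit that was already reached there, so process() still
	// creates the follow-up rolling-update eval.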
	s.limitReached = s.limitReached || evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		return nil
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *GenericScheduler) computePlacements(place []allocTuple) error {
	// Get the base nodes
	nodes, byDC, err := readyNodesInDCs(s.state, s.job.Datacenters)
	if err != nil {
		return err
	}

	// Update the set of placement nodes
	s.stack.SetNodes(nodes)

	// Track the failed task groups so that we can coalesce
	// the failures together to avoid creating many failed allocs.
	failedTG := make(map[*structs.TaskGroup]*structs.Allocation)

	for _, missing := range place {
		// Check if this task group has already failed
		if alloc, ok := failedTG[missing.TaskGroup]; ok {
			alloc.Metrics.CoalescedFailures += 1
			continue
		}

		// Attempt to match the task group
		option, size := s.stack.Select(missing.TaskGroup)

		// Create an allocation for this
		alloc := &structs.Allocation{
			ID:        structs.GenerateUUID(),
			EvalID:    s.eval.ID,
			Name:      missing.Name,
			JobID:     s.job.ID,
			TaskGroup: missing.TaskGroup.Name,
			Resources: size,
			Metrics:   s.ctx.Metrics(),
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = byDC

		// Set fields based on if we found an allocation option
		if option != nil {
			// Generate service IDs for tasks in this allocation
			alloc.PopulateServiceIDs(missing.TaskGroup)

			alloc.NodeID = option.Node.ID
			alloc.TaskResources = option.TaskResources
			alloc.DesiredStatus = structs.AllocDesiredStatusRun
			alloc.ClientStatus = structs.AllocClientStatusPending
			alloc.TaskStates = initTaskState(missing.TaskGroup, structs.TaskStatePending)
			s.plan.AppendAlloc(alloc)
		} else {
			alloc.DesiredStatus = structs.AllocDesiredStatusFailed
			alloc.DesiredDescription = "failed to find a node for placement"
			alloc.ClientStatus = structs.AllocClientStatusFailed
			alloc.TaskStates = initTaskState(missing.TaskGroup, structs.TaskStateDead)
			s.plan.AppendFailed(alloc)
			failedTG[missing.TaskGroup] = alloc
		}
	}

	return nil
}
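The coalescing behaviour in computePlacements is easy to miss: only the first placement failure for a task group produces a failed allocation, and later failures for the same group just bump a counter on that allocation. A minimal, self-contained sketch of that bookkeeping, using hypothetical taskGroup/failedAlloc types rather than the structs package, might look like:

	package main

	import "fmt"

	// Hypothetical stand-ins for structs.TaskGroup / structs.Allocation,
	// only to illustrate the coalescing bookkeeping in computePlacements.
	type taskGroup struct{ Name string }

	type failedAlloc struct {
		TaskGroup         string
		CoalescedFailures int
	}

	func main() {
		web := &taskGroup{Name: "web"}
		cache := &taskGroup{Name: "cache"}

		// Placements that could not be satisfied, in the order they were tried.
		misses := []*taskGroup{web, web, cache, web}

		// Like failedTG above: one failed alloc per task group, later misses
		// only increment its counter instead of creating more failed allocs.
		failedTG := make(map[*taskGroup]*failedAlloc)
		for _, tg := range misses {
			if alloc, ok := failedTG[tg]; ok {
				alloc.CoalescedFailures++
				continue
			}
			failedTG[tg] = &failedAlloc{TaskGroup: tg.Name}
		}

		for _, alloc := range failedTG {
			fmt.Printf("%s: 1 failed alloc, %d coalesced failures\n",
				alloc.TaskGroup, alloc.CoalescedFailures)
		}
	}

The sketch prints one line per task group ("web" with two coalesced failures, "cache" with none), mirroring how the scheduler keeps the plan's failed-allocation list small even when many placements for the same group cannot be satisfied.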