github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/scheduler/generic_sched.go

package scheduler

import (
	"fmt"
	"log"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxServiceScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for services.
	maxServiceScheduleAttempts = 5

	// maxBatchScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for batch.
	maxBatchScheduleAttempts = 2

	// allocNotNeeded is the status used when a job no longer requires an allocation
	allocNotNeeded = "alloc not needed due to job update"

	// allocMigrating is the status used when we must migrate an allocation
	allocMigrating = "alloc is being migrated"

	// allocUpdating is the status used when a job requires an update
	allocUpdating = "alloc is being updated due to job update"

	// allocLost is the status used when an allocation is lost
	allocLost = "alloc is lost since its node is down"

	// allocInPlace is the status used when speculating on an in-place update
	allocInPlace = "alloc updating in-place"

	// blockedEvalMaxPlanDesc is the description used for blocked evals that are
	// a result of hitting the max number of plan attempts
	blockedEvalMaxPlanDesc = "created due to placement conflicts"

	// blockedEvalFailedPlacements is the description used for blocked evals
	// that are a result of failing to place all allocations.
	blockedEvalFailedPlacements = "created to place remaining allocations"
)

// SetStatusError is used to set the status of the evaluation to the given error
type SetStatusError struct {
	Err        error
	EvalStatus string
}

func (s *SetStatusError) Error() string {
	return s.Err.Error()
}

// GenericScheduler is used for 'service' and 'batch' type jobs. This scheduler is
// designed for long-lived services, and as such spends more time attempting
// to make a high quality placement. This is the primary scheduler for
// most workloads. It also supports a 'batch' mode to optimize for fast decision
// making at the cost of quality.
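//
// A minimal usage sketch (assuming a State implementation, a Planner
// implementation, and an evaluation to process are already in hand):
//
//	sched := NewServiceScheduler(logger, state, planner)
//	if err := sched.Process(eval); err != nil {
//		// handle the scheduling failure
//	}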
type GenericScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner
	batch   bool

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *GenericStack

	followupEvalWait time.Duration
	nextEval         *structs.Evaluation

	deployment *structs.Deployment

	blocked        *structs.Evaluation
	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewServiceScheduler is a factory function to instantiate a new service scheduler
func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   false,
	}
	return s
}

// NewBatchScheduler is a factory function to instantiate a new batch scheduler
func NewBatchScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   true,
	}
	return s
}

// Process is used to handle a single evaluation
func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate,
		structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans,
		structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerRetryFailedAlloc:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked,
			s.failedTGAllocs, structs.EvalStatusFailed, desc, s.queuedAllocs,
			s.deployment.GetID())
	}

	// Retry up to the schedule attempt limit and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	limit := maxServiceScheduleAttempts
	if s.batch {
		limit = maxBatchScheduleAttempts
	}
	if err := retryMax(limit, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			// Scheduling was tried but made no forward progress so create a
			// blocked eval to retry once resources become available.
			var mErr multierror.Error
			if err := s.createBlockedEval(true); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			if err := setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked,
				s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs, s.deployment.GetID()); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			return mErr.ErrorOrNil()
		}
		return err
	}

	// If the current evaluation is a blocked evaluation and we didn't place
	// everything, do not update the status to complete.
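	// Instead, re-block the eval, carrying forward the latest class
	// eligibility and quota information.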
	if s.eval.Status == structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 {
		e := s.ctx.Eligibility()
		newEval := s.eval.Copy()
		newEval.EscapedComputedClass = e.HasEscaped()
		newEval.ClassEligibility = e.GetClasses()
		newEval.QuotaLimitReached = e.QuotaLimitReached()
		return s.planner.ReblockEval(newEval)
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked,
		s.failedTGAllocs, structs.EvalStatusComplete, "", s.queuedAllocs,
		s.deployment.GetID())
}

// createBlockedEval creates a blocked eval and submits it to the planner. If
// failure is set to true, the eval's trigger reason reflects that.
func (s *GenericScheduler) createBlockedEval(planFailure bool) error {
	e := s.ctx.Eligibility()
	escaped := e.HasEscaped()

	// Only store the eligible classes if the eval hasn't escaped.
	var classEligibility map[string]bool
	if !escaped {
		classEligibility = e.GetClasses()
	}

	s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached())
	if planFailure {
		s.blocked.TriggeredBy = structs.EvalTriggerMaxPlans
		s.blocked.StatusDescription = blockedEvalMaxPlanDesc
	} else {
		s.blocked.StatusDescription = blockedEvalFailedPlacements
	}

	return s.planner.CreateEval(s.blocked)
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *GenericScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	ws := memdb.NewWatchSet()
	s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job %q: %v", s.eval.JobID, err)
	}

	numTaskGroups := 0
	stopped := s.job.Stopped()
	if !stopped {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	if !s.batch {
		// Get any existing deployment
		s.deployment, err = s.state.LatestDeploymentByJobID(ws, s.eval.Namespace, s.eval.JobID)
		if err != nil {
			return false, fmt.Errorf("failed to get job deployment %q: %v", s.eval.JobID, err)
		}
	}

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewGenericStack(s.batch, s.ctx)
	if !s.job.Stopped() {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If there are failed allocations, we need to create a blocked evaluation
	// to place the failed allocations when resources become available. If the
	// current evaluation is already a blocked eval, we reuse it.
	if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil {
		if err := s.createBlockedEval(false); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make blocked eval: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: failed to place all allocations, blocked eval '%s' created", s.eval, s.blocked.ID)
	}
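	// When the eval being processed is itself blocked, the check above creates
	// no new blocked eval; Process re-blocks it with fresh eligibility data.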

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
	// plan anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If we need a followup eval and we haven't created one, do so.
	if s.followupEvalWait != 0 && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.followupEvalWait)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling migration: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling migration limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan and store the results.
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		if newState == nil {
			return false, fmt.Errorf("missing state refresh after partial commit")
		}
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *GenericScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	reconciler := NewAllocReconciler(s.ctx.Logger(),
		genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID),
		s.batch, s.eval.JobID, s.job, s.deployment, allocs, tainted)
	results := reconciler.Compute()
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, results)

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: results.desiredTGUpdates,
		}
	}

	// Add the deployment changes to the plan
	s.plan.Deployment = results.deployment
	s.plan.DeploymentUpdates = results.deploymentUpdates

	// Store the follow up eval wait duration. If set, this will trigger a
	// follow up eval to handle node draining.
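	// (The followup eval itself is created later in process(), just before the
	// plan is submitted.)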
	s.followupEvalWait = results.followupEvalWait

	// Update the stored deployment
	if results.deployment != nil {
		s.deployment = results.deployment
	}

	// Handle the stop
	for _, stop := range results.stop {
		s.plan.AppendUpdate(stop.alloc, structs.AllocDesiredStatusStop, stop.statusDescription, stop.clientStatus)
	}

	// Handle the in-place updates
	for _, update := range results.inplaceUpdate {
		if update.DeploymentID != s.deployment.GetID() {
			update.DeploymentID = s.deployment.GetID()
			update.DeploymentStatus = nil
		}
		s.ctx.Plan().AppendAlloc(update)
	}

	// Nothing remaining to do if placement is not required
	if len(results.place)+len(results.destructiveUpdate) == 0 {
		if !s.job.Stopped() {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per task group
	for _, place := range results.place {
		s.queuedAllocs[place.taskGroup.Name] += 1
	}
	for _, destructive := range results.destructiveUpdate {
		s.queuedAllocs[destructive.placeTaskGroup.Name] += 1
	}

	// Compute the placements
	place := make([]placementResult, 0, len(results.place))
	for _, p := range results.place {
		place = append(place, p)
	}

	destructive := make([]placementResult, 0, len(results.destructiveUpdate))
	for _, p := range results.destructiveUpdate {
		destructive = append(destructive, p)
	}
	return s.computePlacements(destructive, place)
}

// computePlacements computes placements for allocations. It is given the set of
// destructive updates to place and the set of new placements to place.
func (s *GenericScheduler) computePlacements(destructive, place []placementResult) error {
	// Get the base nodes
	nodes, byDC, err := readyNodesInDCs(s.state, s.job.Datacenters)
	if err != nil {
		return err
	}

	var deploymentID string
	if s.deployment != nil {
		deploymentID = s.deployment.ID
	}

	// Update the set of placement nodes
	s.stack.SetNodes(nodes)

	// Have to handle destructive changes first as we need to discount their
	// resources. To understand this, imagine the resources were reduced and the
	// count was scaled up.
	for _, results := range [][]placementResult{destructive, place} {
		for _, missing := range results {
			// Get the task group
			tg := missing.TaskGroup()

			// Check if this task group has already failed
			if metric, ok := s.failedTGAllocs[tg.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}

			// Find the preferred node
			preferredNode, err := s.findPreferredNode(missing)
			if err != nil {
				return err
			}

			// Check if we should stop the previous allocation upon successful
			// placement of its replacement. This allows atomic placements/stops. We
			// stop the allocation before trying to find a replacement because this
			// frees the resources currently used by the previous allocation.
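			// If no replacement is ultimately found, the stop is backed out
			// again further below via PopUpdate.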
			stopPrevAlloc, stopPrevAllocDesc := missing.StopPreviousAlloc()
			prevAllocation := missing.PreviousAllocation()
			if stopPrevAlloc {
				s.plan.AppendUpdate(prevAllocation, structs.AllocDesiredStatusStop, stopPrevAllocDesc, "")
			}

			// Compute penalty nodes for rescheduled allocs
			selectOptions := getSelectOptions(prevAllocation, preferredNode)
			option, _ := s.stack.Select(tg, selectOptions)

			// Store the available nodes by datacenter
			s.ctx.Metrics().NodesAvailable = byDC

			// Set fields based on whether we found an allocation option
			if option != nil {
				// Create an allocation for this
				alloc := &structs.Allocation{
					ID:            uuid.Generate(),
					Namespace:     s.job.Namespace,
					EvalID:        s.eval.ID,
					Name:          missing.Name(),
					JobID:         s.job.ID,
					TaskGroup:     tg.Name,
					Metrics:       s.ctx.Metrics(),
					NodeID:        option.Node.ID,
					DeploymentID:  deploymentID,
					TaskResources: option.TaskResources,
					DesiredStatus: structs.AllocDesiredStatusRun,
					ClientStatus:  structs.AllocClientStatusPending,

					SharedResources: &structs.Resources{
						DiskMB: tg.EphemeralDisk.SizeMB,
					},
				}

				// If the new allocation is replacing an older allocation then we
				// record the older allocation id so that they are chained
				if prevAllocation != nil {
					alloc.PreviousAllocation = prevAllocation.ID
					if missing.IsRescheduling() {
						updateRescheduleTracker(alloc, prevAllocation)
					}
				}

				// If we are placing a canary and we found a match, add the canary
				// to the deployment state object.
				if missing.Canary() {
					if state, ok := s.deployment.TaskGroups[tg.Name]; ok {
						state.PlacedCanaries = append(state.PlacedCanaries, alloc.ID)
					}
				}

				// Track the placement
				s.plan.AppendAlloc(alloc)

			} else {
				// Lazy initialize the failed map
				if s.failedTGAllocs == nil {
					s.failedTGAllocs = make(map[string]*structs.AllocMetric)
				}

				// Track the fact that we didn't find a placement
				s.failedTGAllocs[tg.Name] = s.ctx.Metrics()

				// If we weren't able to find a replacement for the allocation, back
				// out the fact that we asked to stop the allocation.
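				// This leaves the previous allocation running until a
				// replacement can actually be placed.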
				if stopPrevAlloc {
					s.plan.PopUpdate(prevAllocation)
				}
			}

		}
	}

	return nil
}

// getSelectOptions sets up preferred nodes and penalty nodes
func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs.Node) *SelectOptions {
	selectOptions := &SelectOptions{}
	if prevAllocation != nil {
		penaltyNodes := make(map[string]struct{})
		penaltyNodes[prevAllocation.NodeID] = struct{}{}
		if prevAllocation.RescheduleTracker != nil {
			for _, reschedEvent := range prevAllocation.RescheduleTracker.Events {
				penaltyNodes[reschedEvent.PrevNodeID] = struct{}{}
			}
		}
		selectOptions.PenaltyNodeIDs = penaltyNodes
	}
	if preferredNode != nil {
		selectOptions.PreferredNodes = []*structs.Node{preferredNode}
	}
	return selectOptions
}

// updateRescheduleTracker carries over previous restart attempts and adds the most recent restart
func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation) {
	var rescheduleEvents []*structs.RescheduleEvent
	if prev.RescheduleTracker != nil {
		for _, reschedEvent := range prev.RescheduleTracker.Events {
			rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy())
		}
	}
	rescheduleEvent := structs.NewRescheduleEvent(time.Now().UTC().UnixNano(), prev.ID, prev.NodeID)
	rescheduleEvents = append(rescheduleEvents, rescheduleEvent)
	alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents}
}

// findPreferredNode finds the preferred node for an allocation
func (s *GenericScheduler) findPreferredNode(place placementResult) (node *structs.Node, err error) {
	if prev := place.PreviousAllocation(); prev != nil && place.TaskGroup().EphemeralDisk.Sticky {
		var preferredNode *structs.Node
		ws := memdb.NewWatchSet()
		preferredNode, err = s.state.NodeByID(ws, prev.NodeID)
		if err != nil {
			return nil, err
		}
		if preferredNode != nil && preferredNode.Ready() {
			node = preferredNode
		}
	}
	return
}
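
// An illustrative sketch of how the reschedule chain grows (t0, olderAllocID,
// alloc, and the node ID "n0" are assumed placeholder values): if the previous
// allocation already carries one reschedule event, the replacement ends up
// with two, the newest recording prev.ID and prev.NodeID:
//
//	prev.RescheduleTracker = &structs.RescheduleTracker{
//		Events: []*structs.RescheduleEvent{
//			structs.NewRescheduleEvent(t0, olderAllocID, "n0"),
//		},
//	}
//	updateRescheduleTracker(alloc, prev)
//	// alloc.RescheduleTracker now holds two events.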