github.com/quite/nomad@v0.8.6/scheduler/generic_sched.go

package scheduler

import (
	"fmt"
	"log"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxServiceScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for services.
	maxServiceScheduleAttempts = 5

	// maxBatchScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for batch.
	maxBatchScheduleAttempts = 2

	// allocNotNeeded is the status used when a job no longer requires an allocation
	allocNotNeeded = "alloc not needed due to job update"

	// allocMigrating is the status used when we must migrate an allocation
	allocMigrating = "alloc is being migrated"

	// allocUpdating is the status used when a job requires an update
	allocUpdating = "alloc is being updated due to job update"

	// allocLost is the status used when an allocation is lost
	allocLost = "alloc is lost since its node is down"

	// allocInPlace is the status used when speculating on an in-place update
	allocInPlace = "alloc updating in-place"

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "alloc not needed as node is tainted"

	// blockedEvalMaxPlanDesc is the description used for blocked evals that are
	// a result of hitting the max number of plan attempts
	blockedEvalMaxPlanDesc = "created due to placement conflicts"

	// blockedEvalFailedPlacements is the description used for blocked evals
	// that are a result of failing to place all allocations.
	blockedEvalFailedPlacements = "created to place remaining allocations"

	// reschedulingFollowupEvalDesc is the description used when creating
	// follow-up evals for delayed rescheduling
	reschedulingFollowupEvalDesc = "created for delayed rescheduling"

	// maxPastRescheduleEvents is the maximum number of past reschedule events
	// that we track when unlimited rescheduling is enabled
	maxPastRescheduleEvents = 5
)

// SetStatusError is used to set the status of the evaluation to the given error
type SetStatusError struct {
	Err        error
	EvalStatus string
}

func (s *SetStatusError) Error() string {
	return s.Err.Error()
}

// GenericScheduler is used for 'service' and 'batch' type jobs. This scheduler is
// designed for long-lived services, and as such spends more time attempting
// to make a high quality placement. This is the primary scheduler for
// most workloads. It also supports a 'batch' mode to optimize for fast decision
// making at the cost of quality.
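// In batch mode the scheduler retries fewer times on plan conflicts
// (maxBatchScheduleAttempts instead of maxServiceScheduleAttempts) and does
// not look up the job's latest deployment while processing an evaluation.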
type GenericScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner
	batch   bool

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *GenericStack

	followUpEvals []*structs.Evaluation

	deployment *structs.Deployment

	blocked        *structs.Evaluation
	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewServiceScheduler is a factory function to instantiate a new service scheduler
func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   false,
	}
	return s
}

// NewBatchScheduler is a factory function to instantiate a new batch scheduler
func NewBatchScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   true,
	}
	return s
}

// Process is used to handle a single evaluation
func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerJobDeregister,
		structs.EvalTriggerNodeDrain, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerRollingUpdate, structs.EvalTriggerQueuedAllocs,
		structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans,
		structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerRetryFailedAlloc,
		structs.EvalTriggerFailedFollowUp:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
			s.failedTGAllocs, structs.EvalStatusFailed, desc, s.queuedAllocs,
			s.deployment.GetID())
	}

	// Retry up to the maxScheduleAttempts and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	limit := maxServiceScheduleAttempts
	if s.batch {
		limit = maxBatchScheduleAttempts
	}
	if err := retryMax(limit, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			// Scheduling was tried but made no forward progress so create a
			// blocked eval to retry once resources become available.
			var mErr multierror.Error
			if err := s.createBlockedEval(true); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			if err := setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
				s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs, s.deployment.GetID()); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			return mErr.ErrorOrNil()
		}
		return err
	}

	// If the current evaluation is a blocked evaluation and we didn't place
	// everything, do not update the status to complete.
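	// Instead, the eval is reblocked with refreshed eligibility information
	// (escaped computed classes, class eligibility, quota state) so it can be
	// retried once cluster capacity or quota usage changes.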
	if s.eval.Status == structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 {
		e := s.ctx.Eligibility()
		newEval := s.eval.Copy()
		newEval.EscapedComputedClass = e.HasEscaped()
		newEval.ClassEligibility = e.GetClasses()
		newEval.QuotaLimitReached = e.QuotaLimitReached()
		return s.planner.ReblockEval(newEval)
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
		s.failedTGAllocs, structs.EvalStatusComplete, "", s.queuedAllocs,
		s.deployment.GetID())
}

// createBlockedEval creates a blocked eval and submits it to the planner. If
// failure is set to true, the eval's trigger reason reflects that.
func (s *GenericScheduler) createBlockedEval(planFailure bool) error {
	e := s.ctx.Eligibility()
	escaped := e.HasEscaped()

	// Only store the eligible classes if the eval hasn't escaped.
	var classEligibility map[string]bool
	if !escaped {
		classEligibility = e.GetClasses()
	}

	s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached())
	if planFailure {
		s.blocked.TriggeredBy = structs.EvalTriggerMaxPlans
		s.blocked.StatusDescription = blockedEvalMaxPlanDesc
	} else {
		s.blocked.StatusDescription = blockedEvalFailedPlacements
	}

	return s.planner.CreateEval(s.blocked)
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *GenericScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	ws := memdb.NewWatchSet()
	s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job %q: %v", s.eval.JobID, err)
	}

	numTaskGroups := 0
	stopped := s.job.Stopped()
	if !stopped {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)
	s.followUpEvals = nil

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	if !s.batch {
		// Get any existing deployment
		s.deployment, err = s.state.LatestDeploymentByJobID(ws, s.eval.Namespace, s.eval.JobID)
		if err != nil {
			return false, fmt.Errorf("failed to get job deployment %q: %v", s.eval.JobID, err)
		}
	}

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewGenericStack(s.batch, s.ctx)
	if !s.job.Stopped() {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If there are failed allocations, we need to create a blocked evaluation
	// to place the failed allocations when resources become available. If the
	// current evaluation is already a blocked eval, we reuse it.
	if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil {
		if err := s.createBlockedEval(false); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make blocked eval: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: failed to place all allocations, blocked eval '%s' created", s.eval, s.blocked.ID)
	}

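	// At this point the plan reflects the stops, updates, and placements the
	// reconciler produced for this pass; task groups that could not be fully
	// placed are tracked in s.failedTGAllocs.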
	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
	// plan anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// Create follow-up evals for any allocations eligible for delayed rescheduling
	if len(s.followUpEvals) > 0 {
		for _, eval := range s.followUpEvals {
			eval.PreviousEval = s.eval.ID
			// TODO(preetha) this should be batching evals before inserting them
			if err := s.planner.CreateEval(eval); err != nil {
				s.logger.Printf("[ERR] sched: %#v failed to make next eval for rescheduling: %v", s.eval, err)
				return false, err
			}
			s.logger.Printf("[DEBUG] sched: %#v: found reschedulable allocs, next eval '%s' created", s.eval, eval.ID)
		}
	}

	// Submit the plan and store the results.
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		if newState == nil {
			return false, fmt.Errorf("missing state refresh after partial commit")
		}
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *GenericScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	reconciler := NewAllocReconciler(s.ctx.Logger(),
		genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID),
		s.batch, s.eval.JobID, s.job, s.deployment, allocs, tainted, s.eval.ID)
	results := reconciler.Compute()
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, results)

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: results.desiredTGUpdates,
		}
	}

	// Add the deployment changes to the plan
	s.plan.Deployment = results.deployment
	s.plan.DeploymentUpdates = results.deploymentUpdates

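	// The reconciler emits follow-up evaluations for allocations whose
	// reschedule policy specifies a delay; process creates them later, once
	// the plan is known not to be a no-op.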
	// Store all the follow-up evaluations from rescheduled allocations
	if len(results.desiredFollowupEvals) > 0 {
		for _, evals := range results.desiredFollowupEvals {
			s.followUpEvals = append(s.followUpEvals, evals...)
		}
	}

	// Update the stored deployment
	if results.deployment != nil {
		s.deployment = results.deployment
	}

	// Handle the stop
	for _, stop := range results.stop {
		s.plan.AppendUpdate(stop.alloc, structs.AllocDesiredStatusStop, stop.statusDescription, stop.clientStatus)
	}

	// Handle the in-place updates
	for _, update := range results.inplaceUpdate {
		if update.DeploymentID != s.deployment.GetID() {
			update.DeploymentID = s.deployment.GetID()
			update.DeploymentStatus = nil
		}
		s.ctx.Plan().AppendAlloc(update)
	}

	// Handle the attribute updates
	for _, update := range results.attributeUpdates {
		s.ctx.Plan().AppendAlloc(update)
	}

	// Nothing remaining to do if placement is not required
	if len(results.place)+len(results.destructiveUpdate) == 0 {
		// If the job has been purged we don't have access to the job. Otherwise
		// set the queued allocs to zero. This is true if the job is being
		// stopped as well.
		if s.job != nil {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per task group
	for _, place := range results.place {
		s.queuedAllocs[place.taskGroup.Name] += 1
	}
	for _, destructive := range results.destructiveUpdate {
		s.queuedAllocs[destructive.placeTaskGroup.Name] += 1
	}

	// Compute the placements
	place := make([]placementResult, 0, len(results.place))
	for _, p := range results.place {
		place = append(place, p)
	}

	destructive := make([]placementResult, 0, len(results.destructiveUpdate))
	for _, p := range results.destructiveUpdate {
		destructive = append(destructive, p)
	}
	return s.computePlacements(destructive, place)
}

// computePlacements computes placements for allocations. It is given the set of
// destructive updates to place and the set of new placements to place.
func (s *GenericScheduler) computePlacements(destructive, place []placementResult) error {
	// Get the base nodes
	nodes, byDC, err := readyNodesInDCs(s.state, s.job.Datacenters)
	if err != nil {
		return err
	}

	var deploymentID string
	if s.deployment != nil && s.deployment.Active() {
		deploymentID = s.deployment.ID
	}

	// Update the set of placement nodes
	s.stack.SetNodes(nodes)

	// Capture current time to use as the start time for any rescheduled allocations
	now := time.Now()

	// Have to handle destructive changes first as we need to discount their
	// resources. To understand this imagine the resources were reduced and the
	// count was scaled up.
	for _, results := range [][]placementResult{destructive, place} {
		for _, missing := range results {
			// Get the task group
			tg := missing.TaskGroup()

			// Check if this task group has already failed
			if metric, ok := s.failedTGAllocs[tg.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}

			// Find the preferred node
			preferredNode, err := s.findPreferredNode(missing)
			if err != nil {
				return err
			}

			// Check if we should stop the previous allocation upon successful
			// placement of its replacement. This allows atomic placements/stops. We
			// stop the allocation before trying to find a replacement because this
			// frees the resources currently used by the previous allocation.
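			// The stop recorded here is speculative: if no placement option is
			// found below, the corresponding update is popped back off the plan.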
			stopPrevAlloc, stopPrevAllocDesc := missing.StopPreviousAlloc()
			prevAllocation := missing.PreviousAllocation()
			if stopPrevAlloc {
				s.plan.AppendUpdate(prevAllocation, structs.AllocDesiredStatusStop, stopPrevAllocDesc, "")
			}

			// Compute penalty nodes for rescheduled allocs
			selectOptions := getSelectOptions(prevAllocation, preferredNode)
			option, _ := s.stack.Select(tg, selectOptions)

			// Store the available nodes by datacenter
			s.ctx.Metrics().NodesAvailable = byDC

			// Set fields based on if we found an allocation option
			if option != nil {
				// Create an allocation for this
				alloc := &structs.Allocation{
					ID:            uuid.Generate(),
					Namespace:     s.job.Namespace,
					EvalID:        s.eval.ID,
					Name:          missing.Name(),
					JobID:         s.job.ID,
					TaskGroup:     tg.Name,
					Metrics:       s.ctx.Metrics(),
					NodeID:        option.Node.ID,
					DeploymentID:  deploymentID,
					TaskResources: option.TaskResources,
					DesiredStatus: structs.AllocDesiredStatusRun,
					ClientStatus:  structs.AllocClientStatusPending,

					SharedResources: &structs.Resources{
						DiskMB: tg.EphemeralDisk.SizeMB,
					},
				}

				// If the new allocation is replacing an older allocation then we
				// record the older allocation's ID so that they are chained
				if prevAllocation != nil {
					alloc.PreviousAllocation = prevAllocation.ID
					if missing.IsRescheduling() {
						updateRescheduleTracker(alloc, prevAllocation, now)
					}
				}

				// If we are placing a canary and we found a match, add the canary
				// to the deployment state object and mark it as a canary.
				if missing.Canary() {
					if state, ok := s.deployment.TaskGroups[tg.Name]; ok {
						state.PlacedCanaries = append(state.PlacedCanaries, alloc.ID)
					}

					alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
						Canary: true,
					}
				}

				// Track the placement
				s.plan.AppendAlloc(alloc)

			} else {
				// Lazy initialize the failed map
				if s.failedTGAllocs == nil {
					s.failedTGAllocs = make(map[string]*structs.AllocMetric)
				}

				// Track the fact that we didn't find a placement
				s.failedTGAllocs[tg.Name] = s.ctx.Metrics()

				// If we weren't able to find a replacement for the allocation, back
				// out the fact that we asked to stop the allocation.
				if stopPrevAlloc {
					s.plan.PopUpdate(prevAllocation)
				}
			}

		}
	}

	return nil
}

// getSelectOptions sets up preferred nodes and penalty nodes
func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs.Node) *SelectOptions {
	selectOptions := &SelectOptions{}
	if prevAllocation != nil {
		penaltyNodes := make(map[string]struct{})
		penaltyNodes[prevAllocation.NodeID] = struct{}{}
		if prevAllocation.RescheduleTracker != nil {
			for _, reschedEvent := range prevAllocation.RescheduleTracker.Events {
				penaltyNodes[reschedEvent.PrevNodeID] = struct{}{}
			}
		}
		selectOptions.PenaltyNodeIDs = penaltyNodes
	}
	if preferredNode != nil {
		selectOptions.PreferredNodes = []*structs.Node{preferredNode}
	}
	return selectOptions
}

// updateRescheduleTracker carries over previous reschedule attempts and adds the most recent one
func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation, now time.Time) {
	reschedPolicy := prev.ReschedulePolicy()
	var rescheduleEvents []*structs.RescheduleEvent
	if prev.RescheduleTracker != nil {
		var interval time.Duration
		if reschedPolicy != nil {
			interval = reschedPolicy.Interval
		}
		// If attempts is set, copy all events in the interval range
		if reschedPolicy.Attempts > 0 {
			for _, reschedEvent := range prev.RescheduleTracker.Events {
				timeDiff := now.UnixNano() - reschedEvent.RescheduleTime
				// Only copy over events that are within the reschedule interval.
				// This keeps the list of events small in cases where there's a long chain of old reschedule events.
				if interval > 0 && timeDiff <= interval.Nanoseconds() {
					rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy())
				}
			}
		} else {
			// Only copy the last n if unlimited is set
			start := 0
			if len(prev.RescheduleTracker.Events) > maxPastRescheduleEvents {
				start = len(prev.RescheduleTracker.Events) - maxPastRescheduleEvents
			}
			for i := start; i < len(prev.RescheduleTracker.Events); i++ {
				reschedEvent := prev.RescheduleTracker.Events[i]
				rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy())
			}
		}
	}
	nextDelay := prev.NextDelay()
	rescheduleEvent := structs.NewRescheduleEvent(now.UnixNano(), prev.ID, prev.NodeID, nextDelay)
	rescheduleEvents = append(rescheduleEvents, rescheduleEvent)
	alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents}
}

// findPreferredNode finds the preferred node for an allocation
func (s *GenericScheduler) findPreferredNode(place placementResult) (*structs.Node, error) {
	if prev := place.PreviousAllocation(); prev != nil && place.TaskGroup().EphemeralDisk.Sticky == true {
		var preferredNode *structs.Node
		ws := memdb.NewWatchSet()
		preferredNode, err := s.state.NodeByID(ws, prev.NodeID)
		if err != nil {
			return nil, err
		}

		if preferredNode != nil && preferredNode.Ready() {
			return preferredNode, nil
		}
	}
	return nil, nil
}
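
// processEvalSketch is an illustrative sketch and not part of the original
// file: it assumes only identifiers defined or imported above and shows how a
// caller could drive the generic scheduler for a single evaluation. In Nomad
// itself the server's scheduling worker performs this wiring.
func processEvalSketch(logger *log.Logger, state State, planner Planner, eval *structs.Evaluation) error {
	// Construct a fresh service scheduler for this evaluation and hand the
	// eval to it; the factory and Process signature come from the code above.
	sched := NewServiceScheduler(logger, state, planner)
	return sched.Process(eval)
}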