github.com/bigcommerce/nomad@v0.9.3-bc/scheduler/generic_sched.go

package scheduler

import (
	"fmt"
	"time"

	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxServiceScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for services.
	maxServiceScheduleAttempts = 5

	// maxBatchScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for batch.
	maxBatchScheduleAttempts = 2

	// allocNotNeeded is the status used when a job no longer requires an allocation
	allocNotNeeded = "alloc not needed due to job update"

	// allocMigrating is the status used when we must migrate an allocation
	allocMigrating = "alloc is being migrated"

	// allocUpdating is the status used when a job requires an update
	allocUpdating = "alloc is being updated due to job update"

	// allocLost is the status used when an allocation is lost
	allocLost = "alloc is lost since its node is down"

	// allocInPlace is the status used when speculating on an in-place update
	allocInPlace = "alloc updating in-place"

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "alloc not needed as node is tainted"

	// blockedEvalMaxPlanDesc is the description used for blocked evals that are
	// a result of hitting the max number of plan attempts
	blockedEvalMaxPlanDesc = "created due to placement conflicts"

	// blockedEvalFailedPlacements is the description used for blocked evals
	// that are a result of failing to place all allocations.
	blockedEvalFailedPlacements = "created to place remaining allocations"

	// reschedulingFollowupEvalDesc is the description used when creating follow
	// up evals for delayed rescheduling
	reschedulingFollowupEvalDesc = "created for delayed rescheduling"

	// maxPastRescheduleEvents is the maximum number of past reschedule events
	// that we track when unlimited rescheduling is enabled
	maxPastRescheduleEvents = 5
)

// SetStatusError is used to set the status of the evaluation to the given error
type SetStatusError struct {
	Err        error
	EvalStatus string
}

func (s *SetStatusError) Error() string {
	return s.Err.Error()
}

// GenericScheduler is used for 'service' and 'batch' type jobs. This scheduler is
// designed for long-lived services, and as such spends more time attempting
// to make a high quality placement. This is the primary scheduler for
// most workloads. It also supports a 'batch' mode to optimize for fast decision
// making at the cost of quality.
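//
// The batch field selects between the two modes: batch mode lowers the retry
// limit (maxBatchScheduleAttempts vs. maxServiceScheduleAttempts) and skips
// deployment lookups, trading placement quality for faster decisions.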
type GenericScheduler struct {
	logger  log.Logger
	state   State
	planner Planner
	batch   bool

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *GenericStack

	followUpEvals []*structs.Evaluation

	deployment *structs.Deployment

	blocked        *structs.Evaluation
	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewServiceScheduler is a factory function to instantiate a new service scheduler
func NewServiceScheduler(logger log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger.Named("service_sched"),
		state:   state,
		planner: planner,
		batch:   false,
	}
	return s
}

// NewBatchScheduler is a factory function to instantiate a new batch scheduler
func NewBatchScheduler(logger log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger.Named("batch_sched"),
		state:   state,
		planner: planner,
		batch:   true,
	}
	return s
}

// Process is used to handle a single evaluation
func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Update our logger with the eval's information
	s.logger = s.logger.With("eval_id", eval.ID, "job_id", eval.JobID, "namespace", eval.Namespace)

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerJobDeregister,
		structs.EvalTriggerNodeDrain, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerAllocStop,
		structs.EvalTriggerRollingUpdate, structs.EvalTriggerQueuedAllocs,
		structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans,
		structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerRetryFailedAlloc,
		structs.EvalTriggerFailedFollowUp, structs.EvalTriggerPreemption:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
			s.failedTGAllocs, structs.EvalStatusFailed, desc, s.queuedAllocs,
			s.deployment.GetID())
	}

	// Retry up to the max schedule attempts and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	limit := maxServiceScheduleAttempts
	if s.batch {
		limit = maxBatchScheduleAttempts
	}
	if err := retryMax(limit, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			// Scheduling was tried but made no forward progress so create a
			// blocked eval to retry once resources become available.
			var mErr multierror.Error
			if err := s.createBlockedEval(true); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			if err := setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
				s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs, s.deployment.GetID()); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			return mErr.ErrorOrNil()
		}
		return err
	}

	// If the current evaluation is a blocked evaluation and we didn't place
	// everything, do not update the status to complete.
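	// Instead, re-register the blocked eval with refreshed class eligibility
	// and quota information so it is retried once resources free up.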
	if s.eval.Status == structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 {
		e := s.ctx.Eligibility()
		newEval := s.eval.Copy()
		newEval.EscapedComputedClass = e.HasEscaped()
		newEval.ClassEligibility = e.GetClasses()
		newEval.QuotaLimitReached = e.QuotaLimitReached()
		return s.planner.ReblockEval(newEval)
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
		s.failedTGAllocs, structs.EvalStatusComplete, "", s.queuedAllocs,
		s.deployment.GetID())
}

// createBlockedEval creates a blocked eval and submits it to the planner. If
// failure is set to true, the eval's trigger reason reflects that.
func (s *GenericScheduler) createBlockedEval(planFailure bool) error {
	e := s.ctx.Eligibility()
	escaped := e.HasEscaped()

	// Only store the eligible classes if the eval hasn't escaped.
	var classEligibility map[string]bool
	if !escaped {
		classEligibility = e.GetClasses()
	}

	s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached())
	if planFailure {
		s.blocked.TriggeredBy = structs.EvalTriggerMaxPlans
		s.blocked.StatusDescription = blockedEvalMaxPlanDesc
	} else {
		s.blocked.StatusDescription = blockedEvalFailedPlacements
	}

	return s.planner.CreateEval(s.blocked)
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *GenericScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	ws := memdb.NewWatchSet()
	s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job %q: %v", s.eval.JobID, err)
	}

	numTaskGroups := 0
	stopped := s.job.Stopped()
	if !stopped {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)
	s.followUpEvals = nil

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	if !s.batch {
		// Get any existing deployment
		s.deployment, err = s.state.LatestDeploymentByJobID(ws, s.eval.Namespace, s.eval.JobID)
		if err != nil {
			return false, fmt.Errorf("failed to get job deployment %q: %v", s.eval.JobID, err)
		}
	}

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewGenericStack(s.batch, s.ctx)
	if !s.job.Stopped() {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Error("failed to compute job allocations", "error", err)
		return false, err
	}

	// If there are failed allocations, we need to create a blocked evaluation
	// to place the failed allocations when resources become available. If the
	// current evaluation is already a blocked eval, we reuse it.
	if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil {
		if err := s.createBlockedEval(false); err != nil {
			s.logger.Error("failed to make blocked eval", "error", err)
			return false, err
		}
		s.logger.Debug("failed to place all allocations, blocked eval created", "blocked_eval_id", s.blocked.ID)
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the plan
	// anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// Create follow up evals for any delayed reschedule eligible allocations
	if len(s.followUpEvals) > 0 {
		for _, eval := range s.followUpEvals {
			eval.PreviousEval = s.eval.ID
			// TODO(preetha) this should be batching evals before inserting them
			if err := s.planner.CreateEval(eval); err != nil {
				s.logger.Error("failed to make next eval for rescheduling", "error", err)
				return false, err
			}
			s.logger.Debug("found reschedulable allocs, followup eval created", "followup_eval_id", eval.ID)
		}
	}

	// Submit the plan and store the results.
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Debug("refresh forced")
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Debug("plan didn't fully commit", "attempted", expected, "placed", actual)
		if newState == nil {
			return false, fmt.Errorf("missing state refresh after partial commit")
		}
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *GenericScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	reconciler := NewAllocReconciler(s.logger,
		genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID),
		s.batch, s.eval.JobID, s.job, s.deployment, allocs, tainted, s.eval.ID)
	results := reconciler.Compute()
	s.logger.Debug("reconciled current state with desired state", "results", log.Fmt("%#v", results))

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: results.desiredTGUpdates,
		}
	}

	// Add the deployment changes to the plan
	s.plan.Deployment = results.deployment
	s.plan.DeploymentUpdates = results.deploymentUpdates

	// Store all the follow up evaluations from rescheduled allocations
	if len(results.desiredFollowupEvals) > 0 {
		for _, evals := range results.desiredFollowupEvals {
			s.followUpEvals = append(s.followUpEvals, evals...)
		}
	}

	// Update the stored deployment
	if results.deployment != nil {
		s.deployment = results.deployment
	}

	// Handle the stop
	for _, stop := range results.stop {
		s.plan.AppendStoppedAlloc(stop.alloc, stop.statusDescription, stop.clientStatus)
	}

	// Handle the in-place updates
	for _, update := range results.inplaceUpdate {
		if update.DeploymentID != s.deployment.GetID() {
			update.DeploymentID = s.deployment.GetID()
			update.DeploymentStatus = nil
		}
		s.ctx.Plan().AppendAlloc(update)
	}

	// Handle the annotation updates
	for _, update := range results.attributeUpdates {
		s.ctx.Plan().AppendAlloc(update)
	}

	// Nothing remaining to do if placement is not required
	if len(results.place)+len(results.destructiveUpdate) == 0 {
		// If the job has been purged we don't have access to the job. Otherwise
		// set the queued allocs to zero. This is true if the job is being
		// stopped as well.
		if s.job != nil {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per task group
	for _, place := range results.place {
		s.queuedAllocs[place.taskGroup.Name] += 1
	}
	for _, destructive := range results.destructiveUpdate {
		s.queuedAllocs[destructive.placeTaskGroup.Name] += 1
	}

	// Compute the placements
	place := make([]placementResult, 0, len(results.place))
	for _, p := range results.place {
		place = append(place, p)
	}

	destructive := make([]placementResult, 0, len(results.destructiveUpdate))
	for _, p := range results.destructiveUpdate {
		destructive = append(destructive, p)
	}
	return s.computePlacements(destructive, place)
}

// computePlacements computes placements for allocations. It is given the set of
// destructive updates to place and the set of new placements to place.
func (s *GenericScheduler) computePlacements(destructive, place []placementResult) error {
	// Get the base nodes
	nodes, byDC, err := readyNodesInDCs(s.state, s.job.Datacenters)
	if err != nil {
		return err
	}

	var deploymentID string
	if s.deployment != nil && s.deployment.Active() {
		deploymentID = s.deployment.ID
	}

	// Update the set of placement nodes
	s.stack.SetNodes(nodes)

	// Capture current time to use as the start time for any rescheduled allocations
	now := time.Now()

	// Have to handle destructive changes first as we need to discount their
	// resources. To understand this imagine the resources were reduced and the
	// count was scaled up.
	for _, results := range [][]placementResult{destructive, place} {
		for _, missing := range results {
			// Get the task group
			tg := missing.TaskGroup()

			// Check if this task group has already failed
			if metric, ok := s.failedTGAllocs[tg.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}

			// Find the preferred node
			preferredNode, err := s.findPreferredNode(missing)
			if err != nil {
				return err
			}

			// Check if we should stop the previous allocation upon successful
			// placement of its replacement. This allows atomic placements/stops. We
			// stop the allocation before trying to find a replacement because this
			// frees the resources currently used by the previous allocation.
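			// If no replacement can be found, the stop is backed out again
			// further below via PopUpdate.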
			stopPrevAlloc, stopPrevAllocDesc := missing.StopPreviousAlloc()
			prevAllocation := missing.PreviousAllocation()
			if stopPrevAlloc {
				s.plan.AppendStoppedAlloc(prevAllocation, stopPrevAllocDesc, "")
			}

			// Compute penalty nodes for rescheduled allocs
			selectOptions := getSelectOptions(prevAllocation, preferredNode)
			option := s.selectNextOption(tg, selectOptions)

			// Store the available nodes by datacenter
			s.ctx.Metrics().NodesAvailable = byDC

			// Compute top K scoring node metadata
			s.ctx.Metrics().PopulateScoreMetaData()

			// Set fields based on if we found an allocation option
			if option != nil {
				resources := &structs.AllocatedResources{
					Tasks: option.TaskResources,
					Shared: structs.AllocatedSharedResources{
						DiskMB: int64(tg.EphemeralDisk.SizeMB),
					},
				}

				// Create an allocation for this
				alloc := &structs.Allocation{
					ID:                 uuid.Generate(),
					Namespace:          s.job.Namespace,
					EvalID:             s.eval.ID,
					Name:               missing.Name(),
					JobID:              s.job.ID,
					TaskGroup:          tg.Name,
					Metrics:            s.ctx.Metrics(),
					NodeID:             option.Node.ID,
					NodeName:           option.Node.Name,
					DeploymentID:       deploymentID,
					TaskResources:      resources.OldTaskResources(),
					AllocatedResources: resources,
					DesiredStatus:      structs.AllocDesiredStatusRun,
					ClientStatus:       structs.AllocClientStatusPending,
					SharedResources: &structs.Resources{
						DiskMB: tg.EphemeralDisk.SizeMB,
					},
				}

				// If the new allocation is replacing an older allocation then we
				// record the older allocation ID so that they are chained
				if prevAllocation != nil {
					alloc.PreviousAllocation = prevAllocation.ID
					if missing.IsRescheduling() {
						updateRescheduleTracker(alloc, prevAllocation, now)
					}
				}

				// If we are placing a canary and we found a match, add the canary
				// to the deployment state object and mark it as a canary.
				if missing.Canary() && s.deployment != nil {
					if state, ok := s.deployment.TaskGroups[tg.Name]; ok {
						state.PlacedCanaries = append(state.PlacedCanaries, alloc.ID)
					}

					alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
						Canary: true,
					}
				}

				s.handlePreemptions(option, alloc, missing)

				// Track the placement
				s.plan.AppendAlloc(alloc)

			} else {
				// Lazy initialize the failed map
				if s.failedTGAllocs == nil {
					s.failedTGAllocs = make(map[string]*structs.AllocMetric)
				}

				// Track the fact that we didn't find a placement
				s.failedTGAllocs[tg.Name] = s.ctx.Metrics()

				// If we weren't able to find a replacement for the allocation, back
				// out the fact that we asked to stop the allocation.
				if stopPrevAlloc {
					s.plan.PopUpdate(prevAllocation)
				}
			}

		}
	}

	return nil
}

// getSelectOptions sets up preferred nodes and penalty nodes
func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs.Node) *SelectOptions {
	selectOptions := &SelectOptions{}
	if prevAllocation != nil {
		penaltyNodes := make(map[string]struct{})
		penaltyNodes[prevAllocation.NodeID] = struct{}{}
		if prevAllocation.RescheduleTracker != nil {
			for _, reschedEvent := range prevAllocation.RescheduleTracker.Events {
				penaltyNodes[reschedEvent.PrevNodeID] = struct{}{}
			}
		}
		selectOptions.PenaltyNodeIDs = penaltyNodes
	}
	if preferredNode != nil {
		selectOptions.PreferredNodes = []*structs.Node{preferredNode}
	}
	return selectOptions
}

// updateRescheduleTracker carries over previous restart attempts and adds the most recent restart
func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation, now time.Time) {
	reschedPolicy := prev.ReschedulePolicy()
	var rescheduleEvents []*structs.RescheduleEvent
	if prev.RescheduleTracker != nil {
		var interval time.Duration
		if reschedPolicy != nil {
			interval = reschedPolicy.Interval
		}
		// If attempts is set copy all events in the interval range
		if reschedPolicy != nil && reschedPolicy.Attempts > 0 {
			for _, reschedEvent := range prev.RescheduleTracker.Events {
				timeDiff := now.UnixNano() - reschedEvent.RescheduleTime
				// Only copy over events that are within the restart interval.
				// This keeps the list of events small in cases where there's a
				// long chain of old restart events.
				if interval > 0 && timeDiff <= interval.Nanoseconds() {
					rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy())
				}
			}
		} else {
			// Only copy the last n if unlimited is set
			start := 0
			if len(prev.RescheduleTracker.Events) > maxPastRescheduleEvents {
				start = len(prev.RescheduleTracker.Events) - maxPastRescheduleEvents
			}
			for i := start; i < len(prev.RescheduleTracker.Events); i++ {
				reschedEvent := prev.RescheduleTracker.Events[i]
				rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy())
			}
		}
	}
	nextDelay := prev.NextDelay()
	rescheduleEvent := structs.NewRescheduleEvent(now.UnixNano(), prev.ID, prev.NodeID, nextDelay)
	rescheduleEvents = append(rescheduleEvents, rescheduleEvent)
	alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents}
}

// findPreferredNode finds the preferred node for an allocation
func (s *GenericScheduler) findPreferredNode(place placementResult) (*structs.Node, error) {
	if prev := place.PreviousAllocation(); prev != nil && place.TaskGroup().EphemeralDisk.Sticky {
		var preferredNode *structs.Node
		ws := memdb.NewWatchSet()
		preferredNode, err := s.state.NodeByID(ws, prev.NodeID)
		if err != nil {
			return nil, err
		}

		if preferredNode != nil && preferredNode.Ready() {
			return preferredNode, nil
		}
	}
	return nil, nil
}
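
// A rough usage sketch (hypothetical caller; the real wiring is done by the
// scheduler factory registration elsewhere in this package):
//
//	sched := NewServiceScheduler(logger, state, planner)
//	if err := sched.Process(eval); err != nil {
//		// handle the scheduling error
//	}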