github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/scheduler/generic_sched.go

package scheduler

import (
	"fmt"
	"log"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxServiceScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for services.
	maxServiceScheduleAttempts = 5

	// maxBatchScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for batch.
	maxBatchScheduleAttempts = 2

	// allocNotNeeded is the status used when a job no longer requires an allocation
	allocNotNeeded = "alloc not needed due to job update"

	// allocMigrating is the status used when we must migrate an allocation
	allocMigrating = "alloc is being migrated"

	// allocUpdating is the status used when a job requires an update
	allocUpdating = "alloc is being updated due to job update"

	// allocLost is the status used when an allocation is lost
	allocLost = "alloc is lost since its node is down"

	// allocInPlace is the status used when speculating on an in-place update
	allocInPlace = "alloc updating in-place"

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "alloc not needed as node is tainted"

	// blockedEvalMaxPlanDesc is the description used for blocked evals that are
	// a result of hitting the max number of plan attempts
	blockedEvalMaxPlanDesc = "created due to placement conflicts"

	// blockedEvalFailedPlacements is the description used for blocked evals
	// that are a result of failing to place all allocations.
	blockedEvalFailedPlacements = "created to place remaining allocations"

	// reschedulingFollowupEvalDesc is the description used when creating follow
	// up evals for delayed rescheduling
	reschedulingFollowupEvalDesc = "created for delayed rescheduling"

	// maxPastRescheduleEvents is the maximum number of past reschedule events
	// that we track when unlimited rescheduling is enabled
	maxPastRescheduleEvents = 5
)

// SetStatusError is used to set the status of the evaluation to the given error
type SetStatusError struct {
	Err        error
	EvalStatus string
}

func (s *SetStatusError) Error() string {
	return s.Err.Error()
}
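// Note (illustrative, not from the upstream file): Process below type-asserts
// the error returned by retryMax against *SetStatusError, so a helper that
// gives up and wants the eval marked failed would return a value shaped like
// this hedged sketch (the message text is an assumption for illustration):
//
//	return &SetStatusError{
//		Err:        fmt.Errorf("maximum attempts reached (%d)", limit),
//		EvalStatus: structs.EvalStatusFailed,
//	}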
// GenericScheduler is used for 'service' and 'batch' type jobs. This scheduler is
// designed for long-lived services, and as such spends more time attempting
// to make a high quality placement. This is the primary scheduler for
// most workloads. It also supports a 'batch' mode to optimize for fast decision
// making at the cost of quality.
type GenericScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner
	batch   bool

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *GenericStack

	followUpEvals []*structs.Evaluation

	deployment *structs.Deployment

	blocked        *structs.Evaluation
	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewServiceScheduler is a factory function to instantiate a new service scheduler
func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   false,
	}
	return s
}

// NewBatchScheduler is a factory function to instantiate a new batch scheduler
func NewBatchScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   true,
	}
	return s
}

// Process is used to handle a single evaluation
func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerJobDeregister,
		structs.EvalTriggerNodeDrain, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerRollingUpdate,
		structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans,
		structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerRetryFailedAlloc:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
			s.failedTGAllocs, structs.EvalStatusFailed, desc, s.queuedAllocs,
			s.deployment.GetID())
	}

	// Retry up to the maxScheduleAttempts and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	limit := maxServiceScheduleAttempts
	if s.batch {
		limit = maxBatchScheduleAttempts
	}
	if err := retryMax(limit, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			// Scheduling was tried but made no forward progress so create a
			// blocked eval to retry once resources become available.
			var mErr multierror.Error
			if err := s.createBlockedEval(true); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			if err := setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
				s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs, s.deployment.GetID()); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			return mErr.ErrorOrNil()
		}
		return err
	}

	// If the current evaluation is a blocked evaluation and we didn't place
	// everything, do not update the status to complete.
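	// Instead, reblock the evaluation with refreshed class eligibility and
	// quota information so it continues to wait for additional capacity.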
	if s.eval.Status == structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 {
		e := s.ctx.Eligibility()
		newEval := s.eval.Copy()
		newEval.EscapedComputedClass = e.HasEscaped()
		newEval.ClassEligibility = e.GetClasses()
		newEval.QuotaLimitReached = e.QuotaLimitReached()
		return s.planner.ReblockEval(newEval)
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, nil, s.blocked,
		s.failedTGAllocs, structs.EvalStatusComplete, "", s.queuedAllocs,
		s.deployment.GetID())
}

// createBlockedEval creates a blocked eval and submits it to the planner. If
// failure is set to true, the eval's trigger reason reflects that.
func (s *GenericScheduler) createBlockedEval(planFailure bool) error {
	e := s.ctx.Eligibility()
	escaped := e.HasEscaped()

	// Only store the eligible classes if the eval hasn't escaped.
	var classEligibility map[string]bool
	if !escaped {
		classEligibility = e.GetClasses()
	}

	s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached())
	if planFailure {
		s.blocked.TriggeredBy = structs.EvalTriggerMaxPlans
		s.blocked.StatusDescription = blockedEvalMaxPlanDesc
	} else {
		s.blocked.StatusDescription = blockedEvalFailedPlacements
	}

	return s.planner.CreateEval(s.blocked)
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *GenericScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	ws := memdb.NewWatchSet()
	s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job %q: %v", s.eval.JobID, err)
	}

	numTaskGroups := 0
	stopped := s.job.Stopped()
	if !stopped {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)
	s.followUpEvals = nil

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	if !s.batch {
		// Get any existing deployment
		s.deployment, err = s.state.LatestDeploymentByJobID(ws, s.eval.Namespace, s.eval.JobID)
		if err != nil {
			return false, fmt.Errorf("failed to get job deployment %q: %v", s.eval.JobID, err)
		}
	}

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewGenericStack(s.batch, s.ctx)
	if !s.job.Stopped() {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If there are failed allocations, we need to create a blocked evaluation
	// to place the failed allocations when resources become available. If the
	// current evaluation is already a blocked eval, we reuse it.
	if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil {
		if err := s.createBlockedEval(false); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make blocked eval: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: failed to place all allocations, blocked eval '%s' created", s.eval, s.blocked.ID)
	}
	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the plan
	// anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// Create follow up evals for any delayed reschedule eligible allocations
	if len(s.followUpEvals) > 0 {
		for _, eval := range s.followUpEvals {
			eval.PreviousEval = s.eval.ID
			// TODO(preetha) this should be batching evals before inserting them
			if err := s.planner.CreateEval(eval); err != nil {
				s.logger.Printf("[ERR] sched: %#v failed to make next eval for rescheduling: %v", s.eval, err)
				return false, err
			}
			s.logger.Printf("[DEBUG] sched: %#v: found reschedulable allocs, next eval '%s' created", s.eval, eval.ID)
		}
	}

	// Submit the plan and store the results.
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		if newState == nil {
			return false, fmt.Errorf("missing state refresh after partial commit")
		}
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *GenericScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	reconciler := NewAllocReconciler(s.ctx.Logger(),
		genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID),
		s.batch, s.eval.JobID, s.job, s.deployment, allocs, tainted, s.eval.ID)
	results := reconciler.Compute()
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, results)

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: results.desiredTGUpdates,
		}
	}

	// Add the deployment changes to the plan
	s.plan.Deployment = results.deployment
	s.plan.DeploymentUpdates = results.deploymentUpdates

	// Store all the follow up evaluations from rescheduled allocations
	if len(results.desiredFollowupEvals) > 0 {
		for _, evals := range results.desiredFollowupEvals {
			s.followUpEvals = append(s.followUpEvals, evals...)
		}
	}

	// Update the stored deployment
	if results.deployment != nil {
		s.deployment = results.deployment
	}

	// Handle the stop
	for _, stop := range results.stop {
		s.plan.AppendUpdate(stop.alloc, structs.AllocDesiredStatusStop, stop.statusDescription, stop.clientStatus)
	}

	// Handle the in-place updates
	for _, update := range results.inplaceUpdate {
		if update.DeploymentID != s.deployment.GetID() {
			update.DeploymentID = s.deployment.GetID()
			update.DeploymentStatus = nil
		}
		s.ctx.Plan().AppendAlloc(update)
	}

	// Handle the annotation updates
	for _, update := range results.attributeUpdates {
		s.ctx.Plan().AppendAlloc(update)
	}

	// Nothing remaining to do if placement is not required
	if len(results.place)+len(results.destructiveUpdate) == 0 {
		// If the job has been purged we don't have access to the job. Otherwise
		// set the queued allocs to zero. This is true if the job is being
		// stopped as well.
		if s.job != nil {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per task group
	for _, place := range results.place {
		s.queuedAllocs[place.taskGroup.Name] += 1
	}
	for _, destructive := range results.destructiveUpdate {
		s.queuedAllocs[destructive.placeTaskGroup.Name] += 1
	}

	// Compute the placements
	place := make([]placementResult, 0, len(results.place))
	for _, p := range results.place {
		place = append(place, p)
	}

	destructive := make([]placementResult, 0, len(results.destructiveUpdate))
	for _, p := range results.destructiveUpdate {
		destructive = append(destructive, p)
	}
	return s.computePlacements(destructive, place)
}

// computePlacements computes placements for allocations. It is given the set of
// destructive updates to place and the set of new placements to place.
func (s *GenericScheduler) computePlacements(destructive, place []placementResult) error {
	// Get the base nodes
	nodes, byDC, err := readyNodesInDCs(s.state, s.job.Datacenters)
	if err != nil {
		return err
	}

	var deploymentID string
	if s.deployment != nil && s.deployment.Active() {
		deploymentID = s.deployment.ID
	}

	// Update the set of placement nodes
	s.stack.SetNodes(nodes)

	// Capture current time to use as the start time for any rescheduled allocations
	now := time.Now()

	// Have to handle destructive changes first as we need to discount their
	// resources. To understand this imagine the resources were reduced and the
	// count was scaled up.
	for _, results := range [][]placementResult{destructive, place} {
		for _, missing := range results {
			// Get the task group
			tg := missing.TaskGroup()

			// Check if this task group has already failed
			if metric, ok := s.failedTGAllocs[tg.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}

			// Find the preferred node
			preferredNode, err := s.findPreferredNode(missing)
			if err != nil {
				return err
			}

			// Check if we should stop the previous allocation upon successful
			// placement of its replacement. This allows atomic placements/stops. We
			// stop the allocation before trying to find a replacement because this
			// frees the resources currently used by the previous allocation.
			stopPrevAlloc, stopPrevAllocDesc := missing.StopPreviousAlloc()
			prevAllocation := missing.PreviousAllocation()
			if stopPrevAlloc {
				s.plan.AppendUpdate(prevAllocation, structs.AllocDesiredStatusStop, stopPrevAllocDesc, "")
			}

			// Compute penalty nodes for rescheduled allocs
			selectOptions := getSelectOptions(prevAllocation, preferredNode)
			option, _ := s.stack.Select(tg, selectOptions)

			// Store the available nodes by datacenter
			s.ctx.Metrics().NodesAvailable = byDC

			// Compute top K scoring node metadata
			s.ctx.Metrics().PopulateScoreMetaData()

			// Set fields based on if we found an allocation option
			if option != nil {
				// Create an allocation for this
				alloc := &structs.Allocation{
					ID:            uuid.Generate(),
					Namespace:     s.job.Namespace,
					EvalID:        s.eval.ID,
					Name:          missing.Name(),
					JobID:         s.job.ID,
					TaskGroup:     tg.Name,
					Metrics:       s.ctx.Metrics(),
					NodeID:        option.Node.ID,
					DeploymentID:  deploymentID,
					TaskResources: option.TaskResources,
					DesiredStatus: structs.AllocDesiredStatusRun,
					ClientStatus:  structs.AllocClientStatusPending,

					SharedResources: &structs.Resources{
						DiskMB: tg.EphemeralDisk.SizeMB,
					},
				}

				// If the new allocation is replacing an older allocation then we
				// record the older allocation ID so that they are chained
				if prevAllocation != nil {
					alloc.PreviousAllocation = prevAllocation.ID
					if missing.IsRescheduling() {
						updateRescheduleTracker(alloc, prevAllocation, now)
					}
				}

				// If we are placing a canary and we found a match, add the canary
				// to the deployment state object and mark it as a canary.
				if missing.Canary() {
					if state, ok := s.deployment.TaskGroups[tg.Name]; ok {
						state.PlacedCanaries = append(state.PlacedCanaries, alloc.ID)
					}

					alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
						Canary: true,
					}
				}

				// Track the placement
				s.plan.AppendAlloc(alloc)

			} else {
				// Lazy initialize the failed map
				if s.failedTGAllocs == nil {
					s.failedTGAllocs = make(map[string]*structs.AllocMetric)
				}

				// Track the fact that we didn't find a placement
				s.failedTGAllocs[tg.Name] = s.ctx.Metrics()

				// If we weren't able to find a replacement for the allocation, back
				// out the fact that we asked to stop the allocation.
				if stopPrevAlloc {
					s.plan.PopUpdate(prevAllocation)
				}
			}

		}
	}

	return nil
}

// getSelectOptions sets up preferred nodes and penalty nodes
func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs.Node) *SelectOptions {
	selectOptions := &SelectOptions{}
	if prevAllocation != nil {
		penaltyNodes := make(map[string]struct{})
		penaltyNodes[prevAllocation.NodeID] = struct{}{}
		if prevAllocation.RescheduleTracker != nil {
			for _, reschedEvent := range prevAllocation.RescheduleTracker.Events {
				penaltyNodes[reschedEvent.PrevNodeID] = struct{}{}
			}
		}
		selectOptions.PenaltyNodeIDs = penaltyNodes
	}
	if preferredNode != nil {
		selectOptions.PreferredNodes = []*structs.Node{preferredNode}
	}
	return selectOptions
}

// updateRescheduleTracker carries over previous restart attempts and adds the most recent restart
func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation, now time.Time) {
	reschedPolicy := prev.ReschedulePolicy()
	var rescheduleEvents []*structs.RescheduleEvent
	if prev.RescheduleTracker != nil {
		var interval time.Duration
		if reschedPolicy != nil {
			interval = reschedPolicy.Interval
		}
		// If attempts is set, copy all events in the interval range
		if reschedPolicy.Attempts > 0 {
			for _, reschedEvent := range prev.RescheduleTracker.Events {
				timeDiff := now.UnixNano() - reschedEvent.RescheduleTime
				// Only copy over events that are within the restart interval.
				// This keeps the list of events small in cases where there's a
				// long chain of old restart events.
				if interval > 0 && timeDiff <= interval.Nanoseconds() {
					rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy())
				}
			}
		} else {
			// Only copy the last n if unlimited is set
			start := 0
			if len(prev.RescheduleTracker.Events) > maxPastRescheduleEvents {
				start = len(prev.RescheduleTracker.Events) - maxPastRescheduleEvents
			}
			for i := start; i < len(prev.RescheduleTracker.Events); i++ {
				reschedEvent := prev.RescheduleTracker.Events[i]
				rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy())
			}
		}
	}
	nextDelay := prev.NextDelay()
	rescheduleEvent := structs.NewRescheduleEvent(now.UnixNano(), prev.ID, prev.NodeID, nextDelay)
	rescheduleEvents = append(rescheduleEvents, rescheduleEvent)
	alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents}
}

// findPreferredNode finds the preferred node for an allocation
func (s *GenericScheduler) findPreferredNode(place placementResult) (*structs.Node, error) {
	if prev := place.PreviousAllocation(); prev != nil && place.TaskGroup().EphemeralDisk.Sticky == true {
		var preferredNode *structs.Node
		ws := memdb.NewWatchSet()
		preferredNode, err := s.state.NodeByID(ws, prev.NodeID)
		if err != nil {
			return nil, err
		}

		if preferredNode != nil && preferredNode.Ready() {
			return preferredNode, nil
		}
	}
	return nil, nil
}
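// Usage sketch (illustrative, not part of the upstream file): callers obtain a
// scheduler through the factory functions above and feed it one evaluation at
// a time. Assuming a State snapshot `snap` and a Planner implementation
// `planner` are already available from the surrounding server code (both names
// are assumptions for this sketch):
//
//	logger := log.New(os.Stderr, "sched: ", log.LstdFlags)
//	sched := NewServiceScheduler(logger, snap, planner)
//	if err := sched.Process(eval); err != nil {
//		logger.Printf("[ERR] sched: failed to process eval %q: %v", eval.ID, err)
//	}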