github.com/manicqin/nomad@v0.9.5/scheduler/reconcile.go

package scheduler

import (
	"fmt"
	"time"

	"sort"

	log "github.com/hashicorp/go-hclog"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// batchedFailedAllocWindowSize is the window size used
	// to batch up failed allocations before creating an eval
	batchedFailedAllocWindowSize = 5 * time.Second

	// rescheduleWindowSize is the window size relative to
	// current time within which reschedulable allocations are placed.
	// This helps protect against small clock drifts between servers
	rescheduleWindowSize = 1 * time.Second
)

// allocUpdateType takes an existing allocation and a new job definition and
// returns whether the allocation can ignore the change, requires a destructive
// update, or can be inplace updated. If it can be inplace updated, an updated
// allocation that has the new resources and alloc metrics attached will be
// returned.
type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job,
	newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation)

// allocReconciler is used to determine the set of allocations that require
// placement, inplace updating or stopping given the job specification and
// existing cluster state. The reconciler should only be used for batch and
// service jobs.
type allocReconciler struct {
	// logger is used to log debug information. Logging should be kept to a
	// minimum here.
	logger log.Logger

	// allocUpdateFn is used to check if the allocation can be updated in place
	allocUpdateFn allocUpdateType

	// batch marks whether the job is a batch job
	batch bool

	// job is the job being operated on; it may be nil if the job is being
	// stopped via a purge
	job *structs.Job

	// jobID is the ID of the job being operated on. The job may be nil if it
	// is being stopped, so we require this separately.
	jobID string

	// oldDeployment is the last deployment for the job
	oldDeployment *structs.Deployment

	// deployment is the current deployment for the job
	deployment *structs.Deployment

	// deploymentPaused marks whether the deployment is paused
	deploymentPaused bool

	// deploymentFailed marks whether the deployment is failed
	deploymentFailed bool

	// taintedNodes contains a map of nodes that are tainted
	taintedNodes map[string]*structs.Node

	// existingAllocs is the set of non-terminal existing allocations
	existingAllocs []*structs.Allocation

	// evalID is the ID of the evaluation that triggered the reconciler
	evalID string

	// now is the time used when determining rescheduling eligibility.
	// It defaults to time.Now and is overridden in unit tests.
	now time.Time

	// result holds the results of the reconciliation. During computation it
	// can be used to store intermediate state.
	result *reconcileResults
}

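// The sketch below is an illustrative editorial addition and not part of the
// original file: a trivial allocUpdateType implementation that never updates
// in place and always forces a destructive update. The real scheduler wires
// in a much smarter function that diffs the existing allocation against the
// new job definition.
//
//	func exampleAllocUpdateFn(existing *structs.Allocation, newJob *structs.Job,
//		newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) {
//		// Never ignore the change and never update in place; always recreate.
//		return false, true, nil
//	}
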
// reconcileResults contains the results of the reconciliation and should be
// applied by the scheduler.
type reconcileResults struct {
	// deployment is the deployment that should be created or updated as a
	// result of scheduling
	deployment *structs.Deployment

	// deploymentUpdates contains a set of deployment updates that should be
	// applied as a result of scheduling
	deploymentUpdates []*structs.DeploymentStatusUpdate

	// place is the set of allocations to place by the scheduler
	place []allocPlaceResult

	// destructiveUpdate is the set of allocations to apply a destructive update to
	destructiveUpdate []allocDestructiveResult

	// inplaceUpdate is the set of allocations to apply an inplace update to
	inplaceUpdate []*structs.Allocation

	// stop is the set of allocations to stop
	stop []allocStopResult

	// attributeUpdates are updates to the allocation that are not from a
	// jobspec change.
	attributeUpdates map[string]*structs.Allocation

	// desiredTGUpdates captures the desired set of changes to make for each
	// task group.
	desiredTGUpdates map[string]*structs.DesiredUpdates

	// desiredFollowupEvals is the map of follow-up evaluations to create per
	// task group. This is used to create a delayed evaluation for rescheduling
	// failed allocations.
	desiredFollowupEvals map[string][]*structs.Evaluation
}

// delayedRescheduleInfo contains the allocation ID and the time at which it is
// eligible to be rescheduled. This is used to create follow-up evaluations.
type delayedRescheduleInfo struct {

	// allocID is the ID of the allocation eligible to be rescheduled
	allocID string

	alloc *structs.Allocation

	// rescheduleTime is the time to use in the delayed evaluation
	rescheduleTime time.Time
}

func (r *reconcileResults) GoString() string {
	base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)",
		len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop))

	if r.deployment != nil {
		base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
	}
	for _, u := range r.deploymentUpdates {
		base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q",
			u.DeploymentID, u.Status, u.StatusDescription)
	}
	for tg, u := range r.desiredTGUpdates {
		base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u)
	}
	return base
}

// Changes returns the total number of changes
func (r *reconcileResults) Changes() int {
	return len(r.place) + len(r.inplaceUpdate) + len(r.stop)
}

// NewAllocReconciler creates a new reconciler that should be used to determine
// the changes required to bring the cluster state in line with the declared jobspec
func NewAllocReconciler(logger log.Logger, allocUpdateFn allocUpdateType, batch bool,
	jobID string, job *structs.Job, deployment *structs.Deployment,
	existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string) *allocReconciler {
	return &allocReconciler{
		logger:         logger.Named("reconciler"),
		allocUpdateFn:  allocUpdateFn,
		batch:          batch,
		jobID:          jobID,
		job:            job,
		deployment:     deployment.Copy(),
		existingAllocs: existingAllocs,
		taintedNodes:   taintedNodes,
		evalID:         evalID,
		now:            time.Now(),
		result: &reconcileResults{
			desiredTGUpdates:     make(map[string]*structs.DesiredUpdates),
			desiredFollowupEvals: make(map[string][]*structs.Evaluation),
		},
	}
}

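// Illustrative usage sketch; this comment is an editorial addition, not part
// of the original file, and the surrounding variables (logger, job,
// deployment, existingAllocs, taintedNodes, eval) are placeholders. A
// scheduler would roughly do:
//
//	reconciler := NewAllocReconciler(logger, exampleAllocUpdateFn,
//		job.Type == structs.JobTypeBatch, job.ID, job, deployment,
//		existingAllocs, taintedNodes, eval.ID)
//	results := reconciler.Compute()
//	// results.place, results.destructiveUpdate, results.inplaceUpdate and
//	// results.stop are then turned into a plan by the calling scheduler.
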
// Compute reconciles the existing cluster state and returns the set of changes
// required to converge the job spec and state
func (a *allocReconciler) Compute() *reconcileResults {
	// Create the allocation matrix
	m := newAllocMatrix(a.job, a.existingAllocs)

	// Handle stopping unneeded deployments
	a.cancelDeployments()

	// If we are just stopping a job we do not need to do anything more than
	// stopping all running allocs
	if a.job.Stopped() {
		a.handleStop(m)
		return a.result
	}

	// Detect if the deployment is paused
	if a.deployment != nil {
		a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
	}

	// Reconcile each group
	complete := true
	for group, as := range m {
		groupComplete := a.computeGroup(group, as)
		complete = complete && groupComplete
	}

	// Mark the deployment as complete if possible
	if a.deployment != nil && complete {
		a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
			DeploymentID:      a.deployment.ID,
			Status:            structs.DeploymentStatusSuccessful,
			StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
		})
	}

	// Set the description of a created deployment
	if d := a.result.deployment; d != nil {
		if d.RequiresPromotion() {
			if d.HasAutoPromote() {
				d.StatusDescription = structs.DeploymentStatusDescriptionRunningAutoPromotion
			} else {
				d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
			}
		}
	}

	return a.result
}

// cancelDeployments cancels any deployment that is not needed
func (a *allocReconciler) cancelDeployments() {
	// If the job is stopped and there is a non-terminal deployment, cancel it
	if a.job.Stopped() {
		if a.deployment != nil && a.deployment.Active() {
			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
				DeploymentID:      a.deployment.ID,
				Status:            structs.DeploymentStatusCancelled,
				StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
			})
		}

		// Nothing else to do
		a.oldDeployment = a.deployment
		a.deployment = nil
		return
	}

	d := a.deployment
	if d == nil {
		return
	}

	// Check if the deployment is active and referencing an older job and cancel it
	if d.JobCreateIndex != a.job.CreateIndex || d.JobVersion != a.job.Version {
		if d.Active() {
			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
				DeploymentID:      a.deployment.ID,
				Status:            structs.DeploymentStatusCancelled,
				StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
			})
		}

		a.oldDeployment = d
		a.deployment = nil
	}

	// Clear it as the current deployment if it is successful
	if d.Status == structs.DeploymentStatusSuccessful {
		a.oldDeployment = d
		a.deployment = nil
	}
}

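// Editorial note (not part of the original file): cancelDeployments above
// demotes the tracked deployment to oldDeployment, emitting a cancellation
// update while it is still active, whenever the job is stopped, the
// deployment references an older job version, or the deployment has already
// succeeded. Otherwise a.deployment remains the current deployment for the
// rest of the reconcile pass.
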
// handleStop marks all allocations to be stopped, handling the lost case
func (a *allocReconciler) handleStop(m allocMatrix) {
	for group, as := range m {
		as = filterByTerminal(as)
		untainted, migrate, lost := as.filterByTainted(a.taintedNodes)
		a.markStop(untainted, "", allocNotNeeded)
		a.markStop(migrate, "", allocNotNeeded)
		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
		desiredChanges := new(structs.DesiredUpdates)
		desiredChanges.Stop = uint64(len(as))
		a.result.desiredTGUpdates[group] = desiredChanges
	}
}

// markStop is a helper for marking a set of allocations for stop with a
// particular client status and description.
func (a *allocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
	for _, alloc := range allocs {
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			clientStatus:      clientStatus,
			statusDescription: statusDescription,
		})
	}
}

// computeGroup reconciles state for a particular task group. It returns whether
// the deployment it is for is complete with regard to the task group.
func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
	// Create the desired update object for the group
	desiredChanges := new(structs.DesiredUpdates)
	a.result.desiredTGUpdates[group] = desiredChanges

	// Get the task group. The task group may be nil if the job was updated such
	// that the task group no longer exists
	tg := a.job.LookupTaskGroup(group)

	// If the task group is nil, then the task group has been removed so all we
	// need to do is stop everything
	if tg == nil {
		untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
		a.markStop(untainted, "", allocNotNeeded)
		a.markStop(migrate, "", allocNotNeeded)
		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
		desiredChanges.Stop = uint64(len(untainted) + len(migrate) + len(lost))
		return true
	}

	// Get the deployment state for the group
	var dstate *structs.DeploymentState
	existingDeployment := false
	if a.deployment != nil {
		dstate, existingDeployment = a.deployment.TaskGroups[group]
	}
	if !existingDeployment {
		dstate = &structs.DeploymentState{}
		if !tg.Update.IsEmpty() {
			dstate.AutoRevert = tg.Update.AutoRevert
			dstate.AutoPromote = tg.Update.AutoPromote
			dstate.ProgressDeadline = tg.Update.ProgressDeadline
		}
	}

	// Filter allocations that do not need to be considered because they are
	// from an older job version and are terminal.
	all, ignore := a.filterOldTerminalAllocs(all)
	desiredChanges.Ignore += uint64(len(ignore))

	// canaries is the set of canaries for the current deployment and all is all
	// allocs including the canaries
	canaries, all := a.handleGroupCanaries(all, desiredChanges)

	// Determine what set of allocations are on tainted nodes
	untainted, migrate, lost := all.filterByTainted(a.taintedNodes)

	// Determine what set of terminal allocations need to be rescheduled
	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, a.now, a.evalID, a.deployment)

	// Create batched follow-up evaluations for allocations that are
	// reschedulable later and mark the allocations for in place updating
	a.handleDelayedReschedules(rescheduleLater, all, tg.Name)

	// Create a structure for choosing names. Seed with the taken names, which is
	// the union of the untainted, migrating, and rescheduling allocations (this
	// includes canaries).
	nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow))

	// Stop any unneeded allocations and update the untainted set to not
	// include stopped allocations.
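	// Editorial note (not part of the original file): canaryState below is
	// true when the deployment state for this group expects canaries that
	// have not yet been promoted. While in that state the canaries are
	// excluded from the stop calculation and regular (non-canary) placements
	// are held back until promotion.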
	canaryState := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
	stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, canaryState)
	desiredChanges.Stop += uint64(len(stop))
	untainted = untainted.difference(stop)

	// Do inplace upgrades where possible and capture the set of upgrades that
	// need to be done destructively.
	ignore, inplace, destructive := a.computeUpdates(tg, untainted)
	desiredChanges.Ignore += uint64(len(ignore))
	desiredChanges.InPlaceUpdate += uint64(len(inplace))
	if !existingDeployment {
		dstate.DesiredTotal += len(destructive) + len(inplace)
	}

	// Remove the canaries now that we have handled rescheduling so that we do
	// not consider them when making placement decisions.
	if canaryState {
		untainted = untainted.difference(canaries)
	}

	// The fact that we have destructive updates and fewer canaries than desired
	// means we need to create canaries.
	numDestructive := len(destructive)
	strategy := tg.Update
	canariesPromoted := dstate != nil && dstate.Promoted
	requireCanary := numDestructive != 0 && strategy != nil && len(canaries) < strategy.Canary && !canariesPromoted
	if requireCanary && !a.deploymentPaused && !a.deploymentFailed {
		number := strategy.Canary - len(canaries)
		desiredChanges.Canary += uint64(number)
		if !existingDeployment {
			dstate.DesiredCanaries = strategy.Canary
		}

		for _, name := range nameIndex.NextCanaries(uint(number), canaries, destructive) {
			a.result.place = append(a.result.place, allocPlaceResult{
				name:      name,
				canary:    true,
				taskGroup: tg,
			})
		}
	}

	// Determine how many we can place
	canaryState = dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
	limit := a.computeLimit(tg, untainted, destructive, migrate, canaryState)

	// Place if:
	// * The deployment is not paused or failed
	// * Not placing any canaries
	// * If there are any canaries, they have been promoted
	place := a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow)
	if !existingDeployment {
		dstate.DesiredTotal += len(place)
	}

	// deploymentPlaceReady tracks whether the deployment is in a state where
	// placements can be made without any other consideration.
	deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !canaryState

	if deploymentPlaceReady {
		desiredChanges.Place += uint64(len(place))
		for _, p := range place {
			a.result.place = append(a.result.place, p)
		}
		a.markStop(rescheduleNow, "", allocRescheduled)
		desiredChanges.Stop += uint64(len(rescheduleNow))

		min := helper.IntMin(len(place), limit)
		limit -= min
	} else if !deploymentPlaceReady {
		// We do not want to place additional allocations but in the case we
		// have lost allocations or allocations that require rescheduling now,
		// we do so regardless to avoid odd user experiences.
		if len(lost) != 0 {
			allowed := helper.IntMin(len(lost), len(place))
			desiredChanges.Place += uint64(allowed)
			for _, p := range place[:allowed] {
				a.result.place = append(a.result.place, p)
			}
		}

		// Handle rescheduling of failed allocations even if the deployment is
		// failed. We do not reschedule if the allocation is part of the failed
		// deployment.
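		// Note (editorial addition, not part of the original file): the place
		// set computed above already holds one entry per allocation in
		// rescheduleNow, flagged as rescheduling by computePlacements, which
		// is why filtering place below recovers exactly those placements.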
		if now := len(rescheduleNow); now != 0 {
			for _, p := range place {
				prev := p.PreviousAllocation()
				if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
					a.result.place = append(a.result.place, p)
					desiredChanges.Place++

					a.result.stop = append(a.result.stop, allocStopResult{
						alloc:             prev,
						statusDescription: allocRescheduled,
					})
					desiredChanges.Stop++
				}
			}
		}
	}

	if deploymentPlaceReady {
		// Do all destructive updates
		min := helper.IntMin(len(destructive), limit)
		desiredChanges.DestructiveUpdate += uint64(min)
		desiredChanges.Ignore += uint64(len(destructive) - min)
		for _, alloc := range destructive.nameOrder()[:min] {
			a.result.destructiveUpdate = append(a.result.destructiveUpdate, allocDestructiveResult{
				placeName:             alloc.Name,
				placeTaskGroup:        tg,
				stopAlloc:             alloc,
				stopStatusDescription: allocUpdating,
			})
		}
	} else {
		desiredChanges.Ignore += uint64(len(destructive))
	}

	// Migrate all the allocations
	desiredChanges.Migrate += uint64(len(migrate))
	for _, alloc := range migrate.nameOrder() {
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			statusDescription: allocMigrating,
		})
		a.result.place = append(a.result.place, allocPlaceResult{
			name:          alloc.Name,
			canary:        false,
			taskGroup:     tg,
			previousAlloc: alloc,
		})
	}

	// Create a new deployment if:
	// 1. Updating a job specification
	// 2. No running allocations (first time running a job)
	updatingSpec := len(destructive) != 0 || len(a.result.inplaceUpdate) != 0
	hadRunning := false
	for _, alloc := range all {
		if alloc.Job.Version == a.job.Version && alloc.Job.CreateIndex == a.job.CreateIndex {
			hadRunning = true
			break
		}
	}

	// Create a new deployment if necessary
	if !existingDeployment && !strategy.IsEmpty() && dstate.DesiredTotal != 0 && (!hadRunning || updatingSpec) {
		// A previous group may have made the deployment already
		if a.deployment == nil {
			a.deployment = structs.NewDeployment(a.job)
			a.result.deployment = a.deployment
		}

		// Attach the group's deployment state to the deployment
		a.deployment.TaskGroups[group] = dstate
	}

	// deploymentComplete is whether the deployment is complete, which largely
	// means that no placements were made or desired to be made.
	deploymentComplete := len(destructive)+len(inplace)+len(place)+len(migrate)+len(rescheduleNow)+len(rescheduleLater) == 0 && !requireCanary

	// The final check for deployment completeness is to ensure everything is
	// healthy.
	if deploymentComplete && a.deployment != nil {
		if dstate, ok := a.deployment.TaskGroups[group]; ok {
			if dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
				(dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
				deploymentComplete = false
			}
		}
	}

	return deploymentComplete
}

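// Editorial note (not part of the original file): computeGroup above reports
// a group as complete only when it produced no placements, destructive or
// in-place updates, migrations, or reschedules, needs no further canaries,
// and the deployment state already shows enough healthy (and, where canaries
// are involved, promoted) allocations. Compute ANDs these per-group answers
// before marking the whole deployment successful.
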
// filterOldTerminalAllocs filters allocations that should be ignored since they
// are allocations that are terminal from a previous job version.
func (a *allocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) {
	if !a.batch {
		return all, nil
	}

	filtered = filtered.union(all)
	ignored := make(map[string]*structs.Allocation)

	// Ignore terminal batch jobs from older versions
	for id, alloc := range filtered {
		older := alloc.Job.Version < a.job.Version || alloc.Job.CreateIndex < a.job.CreateIndex
		if older && alloc.TerminalStatus() {
			delete(filtered, id)
			ignored[id] = alloc
		}
	}

	return filtered, ignored
}

// handleGroupCanaries handles the canaries for the group by stopping the
// unneeded ones and returning the current set of canaries and the updated total
// set of allocs for the group
func (a *allocReconciler) handleGroupCanaries(all allocSet, desiredChanges *structs.DesiredUpdates) (canaries, newAll allocSet) {
	// Stop any canary from an older deployment or from a failed one
	var stop []string

	// Cancel any non-promoted canaries from the older deployment
	if a.oldDeployment != nil {
		for _, s := range a.oldDeployment.TaskGroups {
			if !s.Promoted {
				stop = append(stop, s.PlacedCanaries...)
			}
		}
	}

	// Cancel any non-promoted canaries from a failed deployment
	if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusFailed {
		for _, s := range a.deployment.TaskGroups {
			if !s.Promoted {
				stop = append(stop, s.PlacedCanaries...)
			}
		}
	}

	// stopSet is the allocSet that contains the canaries we desire to stop from
	// above.
	stopSet := all.fromKeys(stop)
	a.markStop(stopSet, "", allocNotNeeded)
	desiredChanges.Stop += uint64(len(stopSet))
	all = all.difference(stopSet)

	// Capture our current set of canaries and handle any migrations that are
	// needed by just stopping them.
	if a.deployment != nil {
		var canaryIDs []string
		for _, s := range a.deployment.TaskGroups {
			canaryIDs = append(canaryIDs, s.PlacedCanaries...)
		}

		canaries = all.fromKeys(canaryIDs)
		untainted, migrate, lost := canaries.filterByTainted(a.taintedNodes)
		a.markStop(migrate, "", allocMigrating)
		a.markStop(lost, structs.AllocClientStatusLost, allocLost)

		canaries = untainted
		all = all.difference(migrate, lost)
	}

	return canaries, all
}

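// Illustrative example for computeLimit below (an editorial addition, not
// part of the original file): assume the group sets MaxParallel = 3,
// something needs a destructive update, and none of the early exits (empty
// update stanza, paused or failed deployment, unpromoted canaries) apply. If
// the current deployment has one placed allocation that has not yet reported
// healthy, the limit is 3 - 1 = 2; if any allocation in the deployment is
// explicitly unhealthy, the limit is 0 so a failing rollout does not keep
// advancing.
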
// computeLimit returns the placement limit for a particular group. The inputs
// are the group definition, the untainted, destructive, and migrate allocation
// sets, and whether we are in a canary state.
func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, destructive, migrate allocSet, canaryState bool) int {
	// If there is no update strategy or deployment for the group we can deploy
	// as many as the group has
	if group.Update.IsEmpty() || len(destructive)+len(migrate) == 0 {
		return group.Count
	} else if a.deploymentPaused || a.deploymentFailed {
		// If the deployment is paused or failed, do not create anything else
		return 0
	}

	// If we have canaries and they have not been promoted the limit is 0
	if canaryState {
		return 0
	}

	// If we have been promoted or there are no canaries, the limit is the
	// configured MaxParallel minus any outstanding non-healthy alloc for the
	// deployment
	limit := group.Update.MaxParallel
	if a.deployment != nil {
		partOf, _ := untainted.filterByDeployment(a.deployment.ID)
		for _, alloc := range partOf {
			// An unhealthy allocation means nothing else should happen.
			if alloc.DeploymentStatus.IsUnhealthy() {
				return 0
			}

			if !alloc.DeploymentStatus.IsHealthy() {
				limit--
			}
		}
	}

	// The limit can be less than zero in the case that the job was changed such
	// that it required destructive changes and the count was scaled up.
	if limit < 0 {
		return 0
	}

	return limit
}

// computePlacements returns the set of allocations to place given the group
// definition and the sets of untainted, migrating, and rescheduling allocations
// for the group.
func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
	nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult {

	// Add rescheduled placement results
	var place []allocPlaceResult
	for _, alloc := range reschedule {
		place = append(place, allocPlaceResult{
			name:          alloc.Name,
			taskGroup:     group,
			previousAlloc: alloc,
			reschedule:    true,
			canary:        alloc.DeploymentStatus.IsCanary(),
		})
	}

	// Hot path the nothing-to-do case
	existing := len(untainted) + len(migrate) + len(reschedule)
	if existing >= group.Count {
		return place
	}

	// Add remaining placement results
	if existing < group.Count {
		for _, name := range nameIndex.Next(uint(group.Count - existing)) {
			place = append(place, allocPlaceResult{
				name:      name,
				taskGroup: group,
			})
		}
	}

	return place
}

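// Illustrative example for computePlacements above (an editorial addition,
// not part of the original file): for a group with Count = 5, two untainted
// allocations, one migrating allocation, and one allocation in rescheduleNow,
// existing is 4, so the function returns two results: one rescheduled
// placement carrying the failed allocation as previousAlloc, and one fresh
// placement using the next free name from the name index.
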
// computeStop returns the set of allocations that are marked for stopping given
// the group definition, the set of allocations in various states, and whether we
// are canarying.
func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *allocNameIndex,
	untainted, migrate, lost, canaries allocSet, canaryState bool) allocSet {

	// Mark all lost allocations for stop. The previous allocation doesn't matter
	// here since it is on a lost node.
	var stop allocSet
	stop = stop.union(lost)
	a.markStop(lost, structs.AllocClientStatusLost, allocLost)

	// If we are still deploying or creating canaries, don't stop them
	if canaryState {
		untainted = untainted.difference(canaries)
	}

	// Hot path the nothing-to-do case
	remove := len(untainted) + len(migrate) - group.Count
	if remove <= 0 {
		return stop
	}

	// Filter out any terminal allocations from the untainted set so that we
	// don't try to mark them as stopped redundantly.
	untainted = filterByTerminal(untainted)

	// Prefer stopping any alloc that has the same name as the canaries if we
	// are promoted
	if !canaryState && len(canaries) != 0 {
		canaryNames := canaries.nameSet()
		for id, alloc := range untainted.difference(canaries) {
			if _, match := canaryNames[alloc.Name]; match {
				stop[id] = alloc
				a.result.stop = append(a.result.stop, allocStopResult{
					alloc:             alloc,
					statusDescription: allocNotNeeded,
				})
				delete(untainted, id)

				remove--
				if remove == 0 {
					return stop
				}
			}
		}
	}

	// Prefer selecting from the migrating set before stopping existing allocs
	if len(migrate) != 0 {
		mNames := newAllocNameIndex(a.jobID, group.Name, group.Count, migrate)
		removeNames := mNames.Highest(uint(remove))
		for id, alloc := range migrate {
			if _, match := removeNames[alloc.Name]; !match {
				continue
			}
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             alloc,
				statusDescription: allocNotNeeded,
			})
			delete(migrate, id)
			stop[id] = alloc
			nameIndex.UnsetIndex(alloc.Index())

			remove--
			if remove == 0 {
				return stop
			}
		}
	}

	// Select the allocs with the highest count to remove
	removeNames := nameIndex.Highest(uint(remove))
	for id, alloc := range untainted {
		if _, ok := removeNames[alloc.Name]; ok {
			stop[id] = alloc
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             alloc,
				statusDescription: allocNotNeeded,
			})
			delete(untainted, id)

			remove--
			if remove == 0 {
				return stop
			}
		}
	}

	// It is possible that we didn't stop as many as we should have if there
	// were allocations with duplicate names.
	for id, alloc := range untainted {
		stop[id] = alloc
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			statusDescription: allocNotNeeded,
		})
		delete(untainted, id)

		remove--
		if remove == 0 {
			return stop
		}
	}

	return stop
}

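// Editorial note (not part of the original file): when the group has more
// allocations than it wants, computeStop above removes the excess in a fixed
// preference order: lost allocations first, then (once canaries are promoted)
// allocations whose names collide with a canary, then allocations that would
// otherwise migrate, then the highest-indexed names, and finally any
// remaining duplicates.
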
// computeUpdates determines which allocations for the passed group require
// updates. Three groups are returned:
// 1. Those that require no upgrades
// 2. Those that can be upgraded in-place. These are added to the results
//    automatically since the function contains the correct state to do so.
// 3. Those that require destructive updates
func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted allocSet) (ignore, inplace, destructive allocSet) {
	// Determine the set of allocations that need to be updated
	ignore = make(map[string]*structs.Allocation)
	inplace = make(map[string]*structs.Allocation)
	destructive = make(map[string]*structs.Allocation)

	for _, alloc := range untainted {
		ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.job, group)
		if ignoreChange {
			ignore[alloc.ID] = alloc
		} else if destructiveChange {
			destructive[alloc.ID] = alloc
		} else {
			inplace[alloc.ID] = alloc
			a.result.inplaceUpdate = append(a.result.inplaceUpdate, inplaceAlloc)
		}
	}

	return
}

// handleDelayedReschedules creates batched follow-up evaluations with the
// WaitUntil field set for allocations that are eligible to be rescheduled later.
func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
	if len(rescheduleLater) == 0 {
		return
	}

	// Sort by time
	sort.Slice(rescheduleLater, func(i, j int) bool {
		return rescheduleLater[i].rescheduleTime.Before(rescheduleLater[j].rescheduleTime)
	})

	var evals []*structs.Evaluation
	nextReschedTime := rescheduleLater[0].rescheduleTime
	allocIDToFollowupEvalID := make(map[string]string, len(rescheduleLater))

	// Create a new eval for the first batch
	eval := &structs.Evaluation{
		ID:                uuid.Generate(),
		Namespace:         a.job.Namespace,
		Priority:          a.job.Priority,
		Type:              a.job.Type,
		TriggeredBy:       structs.EvalTriggerRetryFailedAlloc,
		JobID:             a.job.ID,
		JobModifyIndex:    a.job.ModifyIndex,
		Status:            structs.EvalStatusPending,
		StatusDescription: reschedulingFollowupEvalDesc,
		WaitUntil:         nextReschedTime,
	}
	evals = append(evals, eval)

	for _, allocReschedInfo := range rescheduleLater {
		if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
		} else {
			// Start a new batch
			nextReschedTime = allocReschedInfo.rescheduleTime
			// Create a new eval for the new batch
			eval = &structs.Evaluation{
				ID:             uuid.Generate(),
				Namespace:      a.job.Namespace,
				Priority:       a.job.Priority,
				Type:           a.job.Type,
				TriggeredBy:    structs.EvalTriggerRetryFailedAlloc,
				JobID:          a.job.ID,
				JobModifyIndex: a.job.ModifyIndex,
				Status:         structs.EvalStatusPending,
				WaitUntil:      nextReschedTime,
			}
			evals = append(evals, eval)
			// Set the evalID for the first alloc in this new batch
			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
		}
	}

	a.result.desiredFollowupEvals[tgName] = evals

	// Initialize the annotations
	if len(allocIDToFollowupEvalID) != 0 && a.result.attributeUpdates == nil {
		a.result.attributeUpdates = make(map[string]*structs.Allocation)
	}

	// Create in-place updates for every alloc ID that needs to be updated with
	// its follow-up eval ID.
	for allocID, evalID := range allocIDToFollowupEvalID {
		existingAlloc := all[allocID]
		updatedAlloc := existingAlloc.Copy()
		updatedAlloc.FollowupEvalID = evalID
		a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
	}
}

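// Illustrative example for handleDelayedReschedules above (an editorial
// addition, not part of the original file): with the 5 second
// batchedFailedAllocWindowSize, three allocations whose reschedule times are
// t+1s, t+3s, and t+10s produce two follow-up evaluations: one with
// WaitUntil = t+1s covering the first two allocations, and one with
// WaitUntil = t+10s for the third. Each affected allocation is annotated
// in-place with the FollowupEvalID of its batch.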