github.com/emate/nomad@v0.8.2-wo-binpacking/scheduler/reconcile.go

package scheduler

import (
	"fmt"
	"log"
	"sort"
	"time"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// batchedFailedAllocWindowSize is the window size used
	// to batch up failed allocations before creating an eval
	batchedFailedAllocWindowSize = 5 * time.Second

	// rescheduleWindowSize is the window size relative to
	// current time within which reschedulable allocations are placed.
	// This helps protect against small clock drifts between servers
	rescheduleWindowSize = 1 * time.Second
)

// allocUpdateType takes an existing allocation and a new job definition and
// returns whether the allocation can ignore the change, requires a destructive
// update, or can be inplace updated. If it can be inplace updated, an updated
// allocation that has the new resources and alloc metrics attached will be
// returned.
type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job,
	newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation)

// allocReconciler is used to determine the set of allocations that require
// placement, inplace updating or stopping given the job specification and
// existing cluster state. The reconciler should only be used for batch and
// service jobs.
type allocReconciler struct {
	// logger is used to log debug information. Logging should be kept to a
	// minimum here
	logger *log.Logger

	// allocUpdateFn is used to check if the allocation can be updated in place
	allocUpdateFn allocUpdateType

	// batch marks whether the job is a batch job
	batch bool

	// job is the job being operated on, it may be nil if the job is being
	// stopped via a purge
	job *structs.Job

	// jobID is the ID of the job being operated on. The job may be nil if it is
	// being stopped so we require this separately.
	jobID string

	// oldDeployment is the last deployment for the job
	oldDeployment *structs.Deployment

	// deployment is the current deployment for the job
	deployment *structs.Deployment

	// deploymentPaused marks whether the deployment is paused
	deploymentPaused bool

	// deploymentFailed marks whether the deployment is failed
	deploymentFailed bool

	// taintedNodes contains a map of nodes that are tainted
	taintedNodes map[string]*structs.Node

	// existingAllocs is the set of non-terminal existing allocations
	existingAllocs []*structs.Allocation

	// evalID is the ID of the evaluation that triggered the reconciler
	evalID string

	// now is the time used when determining rescheduling eligibility.
	// It defaults to time.Now and is overridden in unit tests
	now time.Time

	// result is the results of the reconcile. During computation it can be
	// used to store intermediate state
	result *reconcileResults
}

// reconcileResults contains the results of the reconciliation and should be
// applied by the scheduler.
type reconcileResults struct {
	// deployment is the deployment that should be created or updated as a
	// result of scheduling
	deployment *structs.Deployment

	// deploymentUpdates contains a set of deployment updates that should be
	// applied as a result of scheduling
	deploymentUpdates []*structs.DeploymentStatusUpdate

	// place is the set of allocations to place by the scheduler
	place []allocPlaceResult

	// destructiveUpdate is the set of allocations to apply a destructive update to
	destructiveUpdate []allocDestructiveResult

	// inplaceUpdate is the set of allocations to apply an inplace update to
	inplaceUpdate []*structs.Allocation

	// stop is the set of allocations to stop
	stop []allocStopResult

	// attributeUpdates are updates to the allocation that are not from a
	// jobspec change.
	attributeUpdates map[string]*structs.Allocation

	// desiredTGUpdates captures the desired set of changes to make for each
	// task group.
	desiredTGUpdates map[string]*structs.DesiredUpdates

	// desiredFollowupEvals is the map of follow up evaluations to create per task group
	// This is used to create a delayed evaluation for rescheduling failed allocations.
	desiredFollowupEvals map[string][]*structs.Evaluation
}

// delayedRescheduleInfo contains the allocation ID and the time at which it is
// eligible to be rescheduled. This is used to create follow up evaluations.
type delayedRescheduleInfo struct {

	// allocID is the ID of the allocation eligible to be rescheduled
	allocID string

	// rescheduleTime is the time to use in the delayed evaluation
	rescheduleTime time.Time
}

func (r *reconcileResults) GoString() string {
	base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)",
		len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop))

	if r.deployment != nil {
		base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
	}
	for _, u := range r.deploymentUpdates {
		base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q",
			u.DeploymentID, u.Status, u.StatusDescription)
	}
	for tg, u := range r.desiredTGUpdates {
		base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u)
	}
	return base
}

// Changes returns the number of total changes
func (r *reconcileResults) Changes() int {
	return len(r.place) + len(r.inplaceUpdate) + len(r.stop)
}

// NewAllocReconciler creates a new reconciler that should be used to determine
// the changes required to bring the cluster state in line with the declared jobspec
func NewAllocReconciler(logger *log.Logger, allocUpdateFn allocUpdateType, batch bool,
	jobID string, job *structs.Job, deployment *structs.Deployment,
	existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string) *allocReconciler {
	return &allocReconciler{
		logger:         logger,
		allocUpdateFn:  allocUpdateFn,
		batch:          batch,
		jobID:          jobID,
		job:            job,
		deployment:     deployment.Copy(),
		existingAllocs: existingAllocs,
		taintedNodes:   taintedNodes,
		evalID:         evalID,
		now:            time.Now(),
		result: &reconcileResults{
			desiredTGUpdates:     make(map[string]*structs.DesiredUpdates),
			desiredFollowupEvals: make(map[string][]*structs.Evaluation),
		},
	}
}

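// A minimal usage sketch (illustrative only; the concrete logger, update
// function, job, deployment, allocation, and tainted-node values are assumed
// to be supplied by the calling scheduler):
//
//	reconciler := NewAllocReconciler(logger, allocUpdateFn, false, job.ID, job,
//		deployment, existingAllocs, taintedNodes, eval.ID)
//	results := reconciler.Compute()
//	// results.place, results.destructiveUpdate, results.inplaceUpdate and
//	// results.stop are then applied by the caller when building its plan.
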
// Compute reconciles the existing cluster state and returns the set of changes
// required to converge the job spec and state
func (a *allocReconciler) Compute() *reconcileResults {
	// Create the allocation matrix
	m := newAllocMatrix(a.job, a.existingAllocs)

	// Handle stopping unneeded deployments
	a.cancelDeployments()

	// If we are just stopping a job we do not need to do anything more than
	// stopping all running allocs
	if a.job.Stopped() {
		a.handleStop(m)
		return a.result
	}

	// Detect if the deployment is paused
	if a.deployment != nil {
		// Detect if any allocs associated with this deploy have failed
		// Failed allocations could edge trigger an evaluation before the deployment watcher
		// runs and marks the deploy as failed. This block makes sure that it is still
		// considered a failed deploy
		failedAllocsInDeploy := false
		for _, as := range m {
			for _, alloc := range as {
				if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed {
					failedAllocsInDeploy = true
				}
			}
		}
		a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy
	}

	// Reconcile each group
	complete := true
	for group, as := range m {
		groupComplete := a.computeGroup(group, as)
		complete = complete && groupComplete
	}

	// Mark the deployment as complete if possible
	if a.deployment != nil && complete {
		a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
			DeploymentID:      a.deployment.ID,
			Status:            structs.DeploymentStatusSuccessful,
			StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
		})
	}

	// Set the description of a created deployment
	if d := a.result.deployment; d != nil {
		if d.RequiresPromotion() {
			d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
		}
	}

	return a.result
}

// cancelDeployments cancels any deployment that is not needed
func (a *allocReconciler) cancelDeployments() {
	// If the job is stopped and there is a non-terminal deployment, cancel it
	if a.job.Stopped() {
		if a.deployment != nil && a.deployment.Active() {
			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
				DeploymentID:      a.deployment.ID,
				Status:            structs.DeploymentStatusCancelled,
				StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
			})
		}

		// Nothing else to do
		a.oldDeployment = a.deployment
		a.deployment = nil
		return
	}

	d := a.deployment
	if d == nil {
		return
	}

	// Check if the deployment is active and referencing an older job and cancel it
	if d.JobCreateIndex != a.job.CreateIndex || d.JobVersion != a.job.Version {
		if d.Active() {
			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
				DeploymentID:      a.deployment.ID,
				Status:            structs.DeploymentStatusCancelled,
				StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
			})
		}

		a.oldDeployment = d
		a.deployment = nil
	}

	// Clear it as the current deployment if it is successful
	if d.Status == structs.DeploymentStatusSuccessful {
		a.oldDeployment = d
		a.deployment = nil
	}
}

// handleStop marks all allocations to be stopped, handling the lost case
func (a *allocReconciler) handleStop(m allocMatrix) {
	for group, as := range m {
		untainted, migrate, lost := as.filterByTainted(a.taintedNodes)
		a.markStop(untainted, "", allocNotNeeded)
		a.markStop(migrate, "", allocNotNeeded)
		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
		desiredChanges := new(structs.DesiredUpdates)
		desiredChanges.Stop = uint64(len(as))
		a.result.desiredTGUpdates[group] = desiredChanges
	}
}

// markStop is a helper for marking a set of allocations for stop with a
// particular client status and description.
func (a *allocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
	for _, alloc := range allocs {
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			clientStatus:      clientStatus,
			statusDescription: statusDescription,
		})
	}
}

// computeGroup reconciles state for a particular task group. It returns whether
// the deployment it is for is complete with regards to the task group.
func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
	// Create the desired update object for the group
	desiredChanges := new(structs.DesiredUpdates)
	a.result.desiredTGUpdates[group] = desiredChanges

	// Get the task group. The task group may be nil if the job was updated such
	// that the task group no longer exists
	tg := a.job.LookupTaskGroup(group)

	// If the task group is nil, then the task group has been removed so all we
	// need to do is stop everything
	if tg == nil {
		untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
		a.markStop(untainted, "", allocNotNeeded)
		a.markStop(migrate, "", allocNotNeeded)
		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
		desiredChanges.Stop = uint64(len(untainted) + len(migrate) + len(lost))
		return true
	}

	// Get the deployment state for the group
	var dstate *structs.DeploymentState
	existingDeployment := false
	if a.deployment != nil {
		dstate, existingDeployment = a.deployment.TaskGroups[group]
	}
	if !existingDeployment {
		autorevert := false
		if tg.Update != nil && tg.Update.AutoRevert {
			autorevert = true
		}
		dstate = &structs.DeploymentState{
			AutoRevert: autorevert,
		}
	}

	// Filter allocations that do not need to be considered because they are
	// from an older job version and are terminal.
	all, ignore := a.filterOldTerminalAllocs(all)
	desiredChanges.Ignore += uint64(len(ignore))

	canaries, all := a.handleGroupCanaries(all, desiredChanges)

	// Determine what set of allocations are on tainted nodes
	untainted, migrate, lost := all.filterByTainted(a.taintedNodes)

	// Determine what set of terminal allocations need to be rescheduled
	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, a.now, a.evalID)

	// Create batched follow up evaluations for allocations that are
	// reschedulable later and mark the allocations for in place updating
	a.handleDelayedReschedules(rescheduleLater, all, tg.Name)

	// Create a structure for choosing names. Seed with the taken names which is
	// the union of untainted and migrating nodes (includes canaries)
	nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow))

	// Stop any unneeded allocations and update the untainted set to not
	// include stopped allocations.
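	// canaryState is true when the deployment expects canaries for this group
	// and they have not yet been promoted. In that state regular placements and
	// destructive updates are held back; see computeLimit and
	// deploymentPlaceReady below.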
	canaryState := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
	stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, canaryState)
	desiredChanges.Stop += uint64(len(stop))
	untainted = untainted.difference(stop)

	// Having stopped un-needed allocations, append the canaries to the existing
	// set of untainted because they are promoted. This will cause them to be
	// treated like non-canaries
	if !canaryState {
		untainted = untainted.union(canaries)
		nameIndex.Set(canaries)
	}

	// Do inplace upgrades where possible and capture the set of upgrades that
	// need to be done destructively.
	ignore, inplace, destructive := a.computeUpdates(tg, untainted)
	desiredChanges.Ignore += uint64(len(ignore))
	desiredChanges.InPlaceUpdate += uint64(len(inplace))
	if !existingDeployment {
		dstate.DesiredTotal += len(destructive) + len(inplace)
	}

	// Having destructive updates and fewer canaries than desired means we need
	// to create canaries
	numDestructive := len(destructive)
	strategy := tg.Update
	canariesPromoted := dstate != nil && dstate.Promoted
	requireCanary := numDestructive != 0 && strategy != nil && len(canaries) < strategy.Canary && !canariesPromoted
	if requireCanary && !a.deploymentPaused && !a.deploymentFailed {
		number := strategy.Canary - len(canaries)
		number = helper.IntMin(numDestructive, number)
		desiredChanges.Canary += uint64(number)
		if !existingDeployment {
			dstate.DesiredCanaries = strategy.Canary
		}

		for _, name := range nameIndex.NextCanaries(uint(number), canaries, destructive) {
			a.result.place = append(a.result.place, allocPlaceResult{
				name:      name,
				canary:    true,
				taskGroup: tg,
			})
		}
	}

	// Determine how many we can place
	canaryState = dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
	limit := a.computeLimit(tg, untainted, destructive, migrate, canaryState)

	// Place if:
	// * The deployment is not paused or failed
	// * We are not placing any canaries
	// * Any canaries that exist have been promoted
	place := a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow)
	if !existingDeployment {
		dstate.DesiredTotal += len(place)
	}

	// deploymentPlaceReady tracks whether the deployment is in a state where
	// placements can be made without any other consideration.
	deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !canaryState

	if deploymentPlaceReady {
		desiredChanges.Place += uint64(len(place))
		for _, p := range place {
			a.result.place = append(a.result.place, p)
		}

		min := helper.IntMin(len(place), limit)
		limit -= min
	} else if !deploymentPlaceReady && len(lost) != 0 {
		// We are in a situation where we shouldn't be placing more than we need
		// to but we have lost allocations. It is a very weird user experience
		// if you have a node go down and Nomad doesn't replace the allocations
		// because the deployment is paused/failed so we only place to recover
		// the lost allocations.
		allowed := helper.IntMin(len(lost), len(place))
		desiredChanges.Place += uint64(allowed)
		for _, p := range place[:allowed] {
			a.result.place = append(a.result.place, p)
		}
	}

	if deploymentPlaceReady {
		// Do all destructive updates
		min := helper.IntMin(len(destructive), limit)
		desiredChanges.DestructiveUpdate += uint64(min)
		desiredChanges.Ignore += uint64(len(destructive) - min)
		for _, alloc := range destructive.nameOrder()[:min] {
			a.result.destructiveUpdate = append(a.result.destructiveUpdate, allocDestructiveResult{
				placeName:             alloc.Name,
				placeTaskGroup:        tg,
				stopAlloc:             alloc,
				stopStatusDescription: allocUpdating,
			})
		}
	} else {
		desiredChanges.Ignore += uint64(len(destructive))
	}

	// Calculate the allowed number of changes and set the desired changes
	// accordingly.
	if !a.deploymentFailed && !a.deploymentPaused {
		desiredChanges.Migrate += uint64(len(migrate))
	} else {
		desiredChanges.Stop += uint64(len(migrate))
	}

	for _, alloc := range migrate.nameOrder() {
		// If the deployment is failed or paused, don't replace it, just mark as stop.
		if a.deploymentFailed || a.deploymentPaused {
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             alloc,
				statusDescription: allocNodeTainted,
			})
			continue
		}

		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			statusDescription: allocMigrating,
		})
		a.result.place = append(a.result.place, allocPlaceResult{
			name:          alloc.Name,
			canary:        false,
			taskGroup:     tg,
			previousAlloc: alloc,
		})
	}

	// Create new deployment if:
	// 1. Updating a job specification
	// 2. No running allocations (first time running a job)
	updatingSpec := len(destructive) != 0 || len(a.result.inplaceUpdate) != 0
	hadRunning := false
	for _, alloc := range all {
		if alloc.Job.Version == a.job.Version {
			hadRunning = true
			break
		}
	}

	// Create a new deployment if necessary
	if !existingDeployment && strategy != nil && dstate.DesiredTotal != 0 && (!hadRunning || updatingSpec) {
		// A previous group may have made the deployment already
		if a.deployment == nil {
			a.deployment = structs.NewDeployment(a.job)
			a.result.deployment = a.deployment
		}

		// Attach the group's deployment state to the deployment
		a.deployment.TaskGroups[group] = dstate
	}

	// deploymentComplete is whether the deployment is complete, which largely
	// means that no placements were made or desired to be made
	deploymentComplete := len(destructive)+len(inplace)+len(place)+len(migrate) == 0 && !requireCanary

	// The final check for whether the deployment is complete is to ensure
	// everything is healthy
	if deploymentComplete && a.deployment != nil {
		partOf, _ := untainted.filterByDeployment(a.deployment.ID)
		for _, alloc := range partOf {
			if !alloc.DeploymentStatus.IsHealthy() {
				deploymentComplete = false
				break
			}
		}
	}

	return deploymentComplete
}

// filterOldTerminalAllocs filters allocations that should be ignored since they
// are terminal allocations from a previous job version.
func (a *allocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) {
	if !a.batch {
		return all, nil
	}

	filtered = filtered.union(all)
	ignored := make(map[string]*structs.Allocation)

	// Ignore terminal batch allocations from older job versions
	for id, alloc := range filtered {
		older := alloc.Job.Version < a.job.Version || alloc.Job.CreateIndex < a.job.CreateIndex
		if older && alloc.TerminalStatus() {
			delete(filtered, id)
			ignored[id] = alloc
		}
	}

	return filtered, ignored
}

// handleGroupCanaries handles the canaries for the group by stopping the
// unneeded ones and returning the current set of canaries and the updated total
// set of allocs for the group
func (a *allocReconciler) handleGroupCanaries(all allocSet, desiredChanges *structs.DesiredUpdates) (canaries, newAll allocSet) {
	// Stop any canary from an older deployment or from a failed one
	var stop []string

	// Cancel any non-promoted canaries from the older deployment
	if a.oldDeployment != nil {
		for _, s := range a.oldDeployment.TaskGroups {
			if !s.Promoted {
				stop = append(stop, s.PlacedCanaries...)
			}
		}
	}

	// Cancel any non-promoted canaries from a failed deployment
	if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusFailed {
		for _, s := range a.deployment.TaskGroups {
			if !s.Promoted {
				stop = append(stop, s.PlacedCanaries...)
			}
		}
	}

	// stopSet is the allocSet that contains the canaries we desire to stop from
	// above.
	stopSet := all.fromKeys(stop)
	a.markStop(stopSet, "", allocNotNeeded)
	desiredChanges.Stop += uint64(len(stopSet))
	all = all.difference(stopSet)

	// Capture our current set of canaries and handle any migrations that are
	// needed by just stopping them.
	if a.deployment != nil {
		var canaryIDs []string
		for _, s := range a.deployment.TaskGroups {
			canaryIDs = append(canaryIDs, s.PlacedCanaries...)
		}

		canaries = all.fromKeys(canaryIDs)
		untainted, migrate, lost := canaries.filterByTainted(a.taintedNodes)
		a.markStop(migrate, "", allocMigrating)
		a.markStop(lost, structs.AllocClientStatusLost, allocLost)

		canaries = untainted
		all = all.difference(migrate, lost)
	}

	return canaries, all
}

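// As a rough worked example (values chosen for illustration): with
// group.Update.MaxParallel = 3, no canary state, and two allocations in the
// current deployment that are placed but not yet marked healthy, the limit is
// 3 - 2 = 1, so only one destructive update can proceed this round. A single
// allocation explicitly marked unhealthy drops the limit straight to 0.
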
// computeLimit returns the placement limit for a particular group. The inputs
// are the group definition, the untainted, destructive, and migrate allocation
// sets, and whether we are in a canary state.
func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, destructive, migrate allocSet, canaryState bool) int {
	// If there is no update strategy or deployment for the group we can deploy
	// as many as the group has
	if group.Update == nil || len(destructive)+len(migrate) == 0 {
		return group.Count
	} else if a.deploymentPaused || a.deploymentFailed {
		// If the deployment is paused or failed, do not create anything else
		return 0
	}

	// If we have canaries and they have not been promoted the limit is 0
	if canaryState {
		return 0
	}

	// If we have been promoted or there are no canaries, the limit is the
	// configured MaxParallel minus any outstanding non-healthy alloc for the
	// deployment
	limit := group.Update.MaxParallel
	if a.deployment != nil {
		partOf, _ := untainted.filterByDeployment(a.deployment.ID)
		for _, alloc := range partOf {
			// An unhealthy allocation means nothing else should happen.
			if alloc.DeploymentStatus.IsUnhealthy() {
				return 0
			}

			if !alloc.DeploymentStatus.IsHealthy() {
				limit--
			}
		}
	}

	// The limit can be less than zero in the case that the job was changed such
	// that it required destructive changes and the count was scaled up.
	if limit < 0 {
		return 0
	}

	return limit
}

// computePlacements returns the set of allocations to place given the group
// definition and the sets of untainted, migrating and rescheduling allocations for the group.
func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
	nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult {

	// Hot path the nothing to do case
	existing := len(untainted) + len(migrate)
	if existing >= group.Count {
		return nil
	}
	var place []allocPlaceResult
	// Add rescheduled placement results
	// Any allocations being rescheduled will remain at DesiredStatusRun ClientStatusFailed
	for _, alloc := range reschedule {
		place = append(place, allocPlaceResult{
			name:          alloc.Name,
			taskGroup:     group,
			previousAlloc: alloc,
			reschedule:    true,
		})
		existing += 1
		if existing == group.Count {
			break
		}
	}
	// Add remaining placement results
	if existing < group.Count {
		for _, name := range nameIndex.Next(uint(group.Count - existing)) {
			place = append(place, allocPlaceResult{
				name:      name,
				taskGroup: group,
			})
		}
	}

	return place
}

// computeStop returns the set of allocations that are marked for stopping given
// the group definition, the set of allocations in various states and whether we
// are canarying.
func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *allocNameIndex,
	untainted, migrate, lost, canaries allocSet, canaryState bool) allocSet {

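	// Allocations are selected for stopping in the following order: lost
	// allocations first, then (once canaries are promoted) any allocation that
	// shares a name with a canary, then migrating allocations with the highest
	// name indexes, then untainted allocations with the highest name indexes,
	// and finally a pass over any remaining duplicate-named allocations.
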
	// Mark all lost allocations for stop. Previous allocation doesn't matter
	// here since it is on a lost node
	var stop allocSet
	stop = stop.union(lost)
	a.markStop(lost, structs.AllocClientStatusLost, allocLost)

	// If we are still deploying or creating canaries, don't stop them
	if canaryState {
		untainted = untainted.difference(canaries)
	}

	// Hot path the nothing to do case
	remove := len(untainted) + len(migrate) - group.Count
	if remove <= 0 {
		return stop
	}

	// Filter out any terminal allocations from the untainted set
	// This is so that we don't try to mark them as stopped redundantly
	untainted = filterByTerminal(untainted)

	// Prefer stopping any alloc that has the same name as the canaries if we
	// are promoted
	if !canaryState && len(canaries) != 0 {
		canaryNames := canaries.nameSet()
		for id, alloc := range untainted.difference(canaries) {
			if _, match := canaryNames[alloc.Name]; match {
				stop[id] = alloc
				a.result.stop = append(a.result.stop, allocStopResult{
					alloc:             alloc,
					statusDescription: allocNotNeeded,
				})
				delete(untainted, id)

				remove--
				if remove == 0 {
					return stop
				}
			}
		}
	}

	// Prefer selecting from the migrating set before stopping existing allocs
	if len(migrate) != 0 {
		mNames := newAllocNameIndex(a.jobID, group.Name, group.Count, migrate)
		removeNames := mNames.Highest(uint(remove))
		for id, alloc := range migrate {
			if _, match := removeNames[alloc.Name]; !match {
				continue
			}
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             alloc,
				statusDescription: allocNotNeeded,
			})
			delete(migrate, id)
			stop[id] = alloc
			nameIndex.UnsetIndex(alloc.Index())

			remove--
			if remove == 0 {
				return stop
			}
		}
	}

	// Select the allocs with the highest count to remove
	removeNames := nameIndex.Highest(uint(remove))
	for id, alloc := range untainted {
		if _, ok := removeNames[alloc.Name]; ok {
			stop[id] = alloc
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             alloc,
				statusDescription: allocNotNeeded,
			})
			delete(untainted, id)

			remove--
			if remove == 0 {
				return stop
			}
		}
	}

	// It is possible that we didn't stop as many as we should have if there
	// were allocations with duplicate names.
	for id, alloc := range untainted {
		stop[id] = alloc
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			statusDescription: allocNotNeeded,
		})
		delete(untainted, id)

		remove--
		if remove == 0 {
			return stop
		}
	}

	return stop
}

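// A minimal sketch of an allocUpdateType implementation, purely for
// illustration (the real update functions are defined elsewhere in the
// scheduler package); this one treats every allocation as requiring a
// destructive update:
//
//	func destructiveOnlyUpdateFn(existing *structs.Allocation, newJob *structs.Job,
//		newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) {
//		return false, true, nil
//	}
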
// computeUpdates determines which allocations for the passed group require
// updates. Three groups are returned:
// 1. Those that require no upgrades
// 2. Those that can be upgraded in-place. These are added to the results
//    automatically since the function contains the correct state to do so.
// 3. Those that require destructive updates
func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted allocSet) (ignore, inplace, destructive allocSet) {
	// Determine the set of allocations that need to be updated
	ignore = make(map[string]*structs.Allocation)
	inplace = make(map[string]*structs.Allocation)
	destructive = make(map[string]*structs.Allocation)

	for _, alloc := range untainted {
		ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.job, group)
		if ignoreChange {
			ignore[alloc.ID] = alloc
		} else if destructiveChange {
			destructive[alloc.ID] = alloc
		} else {
			inplace[alloc.ID] = alloc
			a.result.inplaceUpdate = append(a.result.inplaceUpdate, inplaceAlloc)
		}
	}

	return
}

// handleDelayedReschedules creates batched followup evaluations with the WaitUntil field set
// for allocations that are eligible to be rescheduled later
func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
	if len(rescheduleLater) == 0 {
		return
	}

	// Sort by time
	sort.Slice(rescheduleLater, func(i, j int) bool {
		return rescheduleLater[i].rescheduleTime.Before(rescheduleLater[j].rescheduleTime)
	})

	var evals []*structs.Evaluation
	nextReschedTime := rescheduleLater[0].rescheduleTime
	allocIDToFollowupEvalID := make(map[string]string, len(rescheduleLater))

	// Create a new eval for the first batch
	eval := &structs.Evaluation{
		ID:                uuid.Generate(),
		Namespace:         a.job.Namespace,
		Priority:          a.job.Priority,
		Type:              a.job.Type,
		TriggeredBy:       structs.EvalTriggerRetryFailedAlloc,
		JobID:             a.job.ID,
		JobModifyIndex:    a.job.ModifyIndex,
		Status:            structs.EvalStatusPending,
		StatusDescription: reschedulingFollowupEvalDesc,
		WaitUntil:         nextReschedTime,
	}
	evals = append(evals, eval)

	for _, allocReschedInfo := range rescheduleLater {
		if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
		} else {
			// Start a new batch
			nextReschedTime = allocReschedInfo.rescheduleTime
			// Create a new eval for the new batch
			eval = &structs.Evaluation{
				ID:             uuid.Generate(),
				Namespace:      a.job.Namespace,
				Priority:       a.job.Priority,
				Type:           a.job.Type,
				TriggeredBy:    structs.EvalTriggerRetryFailedAlloc,
				JobID:          a.job.ID,
				JobModifyIndex: a.job.ModifyIndex,
				Status:         structs.EvalStatusPending,
				WaitUntil:      nextReschedTime,
			}
			evals = append(evals, eval)
			// Set the evalID for the first alloc in this new batch
			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
		}
	}

	a.result.desiredFollowupEvals[tgName] = evals

	// Initialize the annotations
	if len(allocIDToFollowupEvalID) != 0 && a.result.attributeUpdates == nil {
		a.result.attributeUpdates = make(map[string]*structs.Allocation)
	}

	// Create in-place updates for every alloc ID that needs to be updated with its follow up eval ID
	for allocID, evalID := range allocIDToFollowupEvalID {
		existingAlloc := all[allocID]
		updatedAlloc := existingAlloc.Copy()
		updatedAlloc.FollowupEvalID = evalID
		a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
	}
}
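
// A small worked example of the batching above (times chosen for
// illustration): with batchedFailedAllocWindowSize of 5 seconds and three
// allocations eligible to reschedule at now+10s, now+12s and now+30s, the
// first two fall within the first batch and share one follow up evaluation
// with WaitUntil = now+10s, while the third gets its own evaluation with
// WaitUntil = now+30s.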