github.com/smithx10/nomad@v0.9.1-rc1/scheduler/reconcile.go

package scheduler

import (
    "fmt"
    "sort"
    "time"

    log "github.com/hashicorp/go-hclog"

    "github.com/hashicorp/nomad/helper"
    "github.com/hashicorp/nomad/helper/uuid"
    "github.com/hashicorp/nomad/nomad/structs"
)

const (
    // batchedFailedAllocWindowSize is the window size used
    // to batch up failed allocations before creating an eval
    batchedFailedAllocWindowSize = 5 * time.Second

    // rescheduleWindowSize is the window size relative to
    // the current time within which reschedulable allocations are placed.
    // This helps protect against small clock drifts between servers.
    rescheduleWindowSize = 1 * time.Second
)

// allocUpdateType takes an existing allocation and a new job definition and
// returns whether the allocation can ignore the change, requires a destructive
// update, or can be inplace updated. If it can be inplace updated, an updated
// allocation that has the new resources and alloc metrics attached will be
// returned.
type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job,
    newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation)

// allocReconciler is used to determine the set of allocations that require
// placement, inplace updating or stopping given the job specification and
// existing cluster state. The reconciler should only be used for batch and
// service jobs.
type allocReconciler struct {
    // logger is used to log debug information. Logging should be kept to a
    // minimum here.
    logger log.Logger

    // allocUpdateFn is used to check whether an allocation can be updated in
    // place, requires a destructive update, or can ignore the change entirely.
    allocUpdateFn allocUpdateType

    // batch marks whether the job is a batch job
    batch bool

    // job is the job being operated on. It may be nil if the job is being
    // stopped via a purge.
    job *structs.Job

    // jobID is the ID of the job being operated on. The job may be nil if it
    // is being stopped, so we require this separately.
    jobID string

    // oldDeployment is the last deployment for the job
    oldDeployment *structs.Deployment

    // deployment is the current deployment for the job
    deployment *structs.Deployment

    // deploymentPaused marks whether the deployment is paused
    deploymentPaused bool

    // deploymentFailed marks whether the deployment is failed
    deploymentFailed bool

    // taintedNodes contains a map of nodes that are tainted
    taintedNodes map[string]*structs.Node

    // existingAllocs is the set of non-terminal existing allocations
    existingAllocs []*structs.Allocation

    // evalID is the ID of the evaluation that triggered the reconciler
    evalID string

    // now is the time used when determining rescheduling eligibility. It
    // defaults to time.Now and is overridden in unit tests.
    now time.Time

    // result is the results of the reconcile. During computation it can be
    // used to store intermediate state.
    result *reconcileResults
}
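
// exampleIgnoreUpdates is a minimal illustrative allocUpdateType, added here
// as a sketch (the name and function are not part of the original file). It
// reports every allocation as unaffected by the new job, so the reconciler
// neither updates nor replaces anything. Real update functions compare the
// existing allocation against the new task group to choose among the three
// outcomes and, for in-place updates, return a copied allocation carrying the
// new resources.
func exampleIgnoreUpdates(existing *structs.Allocation, newJob *structs.Job,
    newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) {
    return true, false, nil
}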

// reconcileResults contains the results of the reconciliation and should be
// applied by the scheduler.
type reconcileResults struct {
    // deployment is the deployment that should be created or updated as a
    // result of scheduling
    deployment *structs.Deployment

    // deploymentUpdates contains a set of deployment updates that should be
    // applied as a result of scheduling
    deploymentUpdates []*structs.DeploymentStatusUpdate

    // place is the set of allocations to place by the scheduler
    place []allocPlaceResult

    // destructiveUpdate is the set of allocations to apply a destructive update to
    destructiveUpdate []allocDestructiveResult

    // inplaceUpdate is the set of allocations to apply an inplace update to
    inplaceUpdate []*structs.Allocation

    // stop is the set of allocations to stop
    stop []allocStopResult

    // attributeUpdates are updates to allocations that are not from a
    // jobspec change.
    attributeUpdates map[string]*structs.Allocation

    // desiredTGUpdates captures the desired set of changes to make for each
    // task group.
    desiredTGUpdates map[string]*structs.DesiredUpdates

    // desiredFollowupEvals is the map of follow-up evaluations to create per
    // task group. This is used to create delayed evaluations for rescheduling
    // failed allocations.
    desiredFollowupEvals map[string][]*structs.Evaluation
}

// delayedRescheduleInfo contains the allocation ID and the time when it is
// eligible to be rescheduled. This is used to create follow-up evaluations.
type delayedRescheduleInfo struct {

    // allocID is the ID of the allocation eligible to be rescheduled
    allocID string

    // rescheduleTime is the time to use in the delayed evaluation
    rescheduleTime time.Time
}

func (r *reconcileResults) GoString() string {
    base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)",
        len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop))

    if r.deployment != nil {
        base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
    }
    for _, u := range r.deploymentUpdates {
        base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q",
            u.DeploymentID, u.Status, u.StatusDescription)
    }
    for tg, u := range r.desiredTGUpdates {
        base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u)
    }
    return base
}

// Changes returns the total number of changes
func (r *reconcileResults) Changes() int {
    return len(r.place) + len(r.inplaceUpdate) + len(r.stop)
}

// NewAllocReconciler creates a new reconciler that should be used to determine
// the changes required to bring the cluster state in line with the declared
// jobspec.
func NewAllocReconciler(logger log.Logger, allocUpdateFn allocUpdateType, batch bool,
    jobID string, job *structs.Job, deployment *structs.Deployment,
    existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string) *allocReconciler {
    return &allocReconciler{
        logger:         logger.Named("reconciler"),
        allocUpdateFn:  allocUpdateFn,
        batch:          batch,
        jobID:          jobID,
        job:            job,
        deployment:     deployment.Copy(),
        existingAllocs: existingAllocs,
        taintedNodes:   taintedNodes,
        evalID:         evalID,
        now:            time.Now(),
        result: &reconcileResults{
            desiredTGUpdates:     make(map[string]*structs.DesiredUpdates),
            desiredFollowupEvals: make(map[string][]*structs.Evaluation),
        },
    }
}
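
// Illustrative usage (a sketch; variable names such as job, deployment, and
// liveAllocs are assumptions, not identifiers from this package):
//
//	r := NewAllocReconciler(logger, exampleIgnoreUpdates, false,
//	    job.ID, job, deployment, liveAllocs, taintedNodes, eval.ID)
//	results := r.Compute()
//	// The scheduler then applies results.place, results.stop,
//	// results.inplaceUpdate, and results.deploymentUpdates.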

// Compute reconciles the existing cluster state and returns the set of changes
// required to converge the job spec and state
func (a *allocReconciler) Compute() *reconcileResults {
    // Create the allocation matrix
    m := newAllocMatrix(a.job, a.existingAllocs)

    // Handle stopping unneeded deployments
    a.cancelDeployments()

    // If we are just stopping a job we do not need to do anything more than
    // stopping all running allocs
    if a.job.Stopped() {
        a.handleStop(m)
        return a.result
    }

    // Detect if the deployment is paused
    if a.deployment != nil {
        a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
        a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
    }

    // Reconcile each group
    complete := true
    for group, as := range m {
        groupComplete := a.computeGroup(group, as)
        complete = complete && groupComplete
    }

    // Mark the deployment as complete if possible
    if a.deployment != nil && complete {
        a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
            DeploymentID:      a.deployment.ID,
            Status:            structs.DeploymentStatusSuccessful,
            StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
        })
    }

    // Set the description of a created deployment
    if d := a.result.deployment; d != nil {
        if d.RequiresPromotion() {
            d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
        }
    }

    return a.result
}

// cancelDeployments cancels any deployment that is not needed
func (a *allocReconciler) cancelDeployments() {
    // If the job is stopped and there is a non-terminal deployment, cancel it
    if a.job.Stopped() {
        if a.deployment != nil && a.deployment.Active() {
            a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
                DeploymentID:      a.deployment.ID,
                Status:            structs.DeploymentStatusCancelled,
                StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
            })
        }

        // Nothing else to do
        a.oldDeployment = a.deployment
        a.deployment = nil
        return
    }

    d := a.deployment
    if d == nil {
        return
    }

    // If the deployment is active but references an older job, cancel it
    if d.JobCreateIndex != a.job.CreateIndex || d.JobVersion != a.job.Version {
        if d.Active() {
            a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
                DeploymentID:      a.deployment.ID,
                Status:            structs.DeploymentStatusCancelled,
                StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
            })
        }

        a.oldDeployment = d
        a.deployment = nil
    }

    // Clear it as the current deployment if it is successful
    if d.Status == structs.DeploymentStatusSuccessful {
        a.oldDeployment = d
        a.deployment = nil
    }
}

// handleStop marks all allocations to be stopped, handling the lost case
func (a *allocReconciler) handleStop(m allocMatrix) {
    for group, as := range m {
        as = filterByTerminal(as)
        untainted, migrate, lost := as.filterByTainted(a.taintedNodes)
        a.markStop(untainted, "", allocNotNeeded)
        a.markStop(migrate, "", allocNotNeeded)
        a.markStop(lost, structs.AllocClientStatusLost, allocLost)
        desiredChanges := new(structs.DesiredUpdates)
        desiredChanges.Stop = uint64(len(as))
        a.result.desiredTGUpdates[group] = desiredChanges
    }
}
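
// As an illustration of handleStop (hypothetical numbers): for a stopped job
// whose group has three non-terminal allocations, one of them on a down node,
// the result is three stop entries, with the alloc on the down node marked
// with client status "lost", and a DesiredUpdates entry recording Stop=3 for
// the group.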

// markStop is a helper for marking a set of allocations for stop with a
// particular client status and description.
func (a *allocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
    for _, alloc := range allocs {
        a.result.stop = append(a.result.stop, allocStopResult{
            alloc:             alloc,
            clientStatus:      clientStatus,
            statusDescription: statusDescription,
        })
    }
}

// computeGroup reconciles state for a particular task group. It returns whether
// the deployment it is for is complete with regards to the task group.
func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
    // Create the desired update object for the group
    desiredChanges := new(structs.DesiredUpdates)
    a.result.desiredTGUpdates[group] = desiredChanges

    // Get the task group. The task group may be nil if the job was updated
    // such that the task group no longer exists.
    tg := a.job.LookupTaskGroup(group)

    // If the task group is nil, then the task group has been removed so all we
    // need to do is stop everything
    if tg == nil {
        untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
        a.markStop(untainted, "", allocNotNeeded)
        a.markStop(migrate, "", allocNotNeeded)
        a.markStop(lost, structs.AllocClientStatusLost, allocLost)
        desiredChanges.Stop = uint64(len(untainted) + len(migrate) + len(lost))
        return true
    }

    // Get the deployment state for the group
    var dstate *structs.DeploymentState
    existingDeployment := false
    if a.deployment != nil {
        dstate, existingDeployment = a.deployment.TaskGroups[group]
    }
    if !existingDeployment {
        dstate = &structs.DeploymentState{}
        if tg.Update != nil {
            dstate.AutoRevert = tg.Update.AutoRevert
            dstate.ProgressDeadline = tg.Update.ProgressDeadline
        }
    }

    // Filter allocations that do not need to be considered because they are
    // from an older job version and are terminal.
    all, ignore := a.filterOldTerminalAllocs(all)
    desiredChanges.Ignore += uint64(len(ignore))

    // canaries is the set of canaries for the current deployment and all is
    // all allocs including the canaries
    canaries, all := a.handleGroupCanaries(all, desiredChanges)

    // Determine what set of allocations are on tainted nodes
    untainted, migrate, lost := all.filterByTainted(a.taintedNodes)

    // Determine what set of terminal allocations need to be rescheduled
    untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, a.now, a.evalID, a.deployment)

    // Create batched follow-up evaluations for allocations that are
    // reschedulable later and mark the allocations for in-place updating
    a.handleDelayedReschedules(rescheduleLater, all, tg.Name)

    // Create a structure for choosing names. Seed with the taken names, which
    // is the union of untainted and migrating nodes (includes canaries).
    nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow))

    // Stop any unneeded allocations and update the untainted set to not
    // include stopped allocations.
    canaryState := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
    stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, canaryState)
    desiredChanges.Stop += uint64(len(stop))
    untainted = untainted.difference(stop)

    // Do in-place upgrades where possible and capture the set of upgrades that
    // need to be done destructively.
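    // Broadly (illustrative; the exact split lives in the update function
    // passed in as allocUpdateFn): changes that leave the tasks untouched can
    // be ignored or applied in place, while edits to a task's configuration
    // (image, args, env, services) require a destructive replace.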
    ignore, inplace, destructive := a.computeUpdates(tg, untainted)
    desiredChanges.Ignore += uint64(len(ignore))
    desiredChanges.InPlaceUpdate += uint64(len(inplace))
    if !existingDeployment {
        dstate.DesiredTotal += len(destructive) + len(inplace)
    }

    // Remove the canaries now that we have handled rescheduling so that we do
    // not consider them when making placement decisions.
    if canaryState {
        untainted = untainted.difference(canaries)
    }

    // Having destructive updates and fewer canaries than desired means we
    // need to create canaries.
    numDestructive := len(destructive)
    strategy := tg.Update
    canariesPromoted := dstate != nil && dstate.Promoted
    requireCanary := numDestructive != 0 && strategy != nil && len(canaries) < strategy.Canary && !canariesPromoted
    if requireCanary && !a.deploymentPaused && !a.deploymentFailed {
        number := strategy.Canary - len(canaries)
        desiredChanges.Canary += uint64(number)
        if !existingDeployment {
            dstate.DesiredCanaries = strategy.Canary
        }

        for _, name := range nameIndex.NextCanaries(uint(number), canaries, destructive) {
            a.result.place = append(a.result.place, allocPlaceResult{
                name:      name,
                canary:    true,
                taskGroup: tg,
            })
        }
    }

    // Determine how many we can place
    canaryState = dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
    limit := a.computeLimit(tg, untainted, destructive, migrate, canaryState)

    // Place allocs if:
    // * The deployment is not paused or failed
    // * We are not placing any canaries
    // * Any canaries that exist have been promoted
    place := a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow)
    if !existingDeployment {
        dstate.DesiredTotal += len(place)
    }

    // deploymentPlaceReady tracks whether the deployment is in a state where
    // placements can be made without any other consideration.
    deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !canaryState

    if deploymentPlaceReady {
        desiredChanges.Place += uint64(len(place))
        for _, p := range place {
            a.result.place = append(a.result.place, p)
        }

        min := helper.IntMin(len(place), limit)
        limit -= min
    } else if !deploymentPlaceReady {
        // We do not want to place additional allocations but in the case we
        // have lost allocations or allocations that require rescheduling now,
        // we do so regardless to avoid odd user experiences.
        if len(lost) != 0 {
            allowed := helper.IntMin(len(lost), len(place))
            desiredChanges.Place += uint64(allowed)
            for _, p := range place[:allowed] {
                a.result.place = append(a.result.place, p)
            }
        }

        // Handle rescheduling of failed allocations even if the deployment is
        // failed. We do not reschedule if the allocation is part of the failed
        // deployment.
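        // Illustrative scenario: the deployment has failed and two failed
        // allocations are due for rescheduling now. The one that was placed
        // by this failed deployment is skipped (its replacement would likely
        // fail again), while one that predates the deployment is still
        // replaced.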
        if now := len(rescheduleNow); now != 0 {
            for _, p := range place {
                prev := p.PreviousAllocation()
                if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
                    a.result.place = append(a.result.place, p)
                    desiredChanges.Place++
                }
            }
        }
    }

    if deploymentPlaceReady {
        // Do all destructive updates
        min := helper.IntMin(len(destructive), limit)
        desiredChanges.DestructiveUpdate += uint64(min)
        desiredChanges.Ignore += uint64(len(destructive) - min)
        for _, alloc := range destructive.nameOrder()[:min] {
            a.result.destructiveUpdate = append(a.result.destructiveUpdate, allocDestructiveResult{
                placeName:             alloc.Name,
                placeTaskGroup:        tg,
                stopAlloc:             alloc,
                stopStatusDescription: allocUpdating,
            })
        }
    } else {
        desiredChanges.Ignore += uint64(len(destructive))
    }

    // Migrate all the allocations
    desiredChanges.Migrate += uint64(len(migrate))
    for _, alloc := range migrate.nameOrder() {
        a.result.stop = append(a.result.stop, allocStopResult{
            alloc:             alloc,
            statusDescription: allocMigrating,
        })
        a.result.place = append(a.result.place, allocPlaceResult{
            name:          alloc.Name,
            canary:        false,
            taskGroup:     tg,
            previousAlloc: alloc,
        })
    }

    // Create a new deployment if:
    // 1. Updating a job specification
    // 2. No running allocations (first time running a job)
    updatingSpec := len(destructive) != 0 || len(a.result.inplaceUpdate) != 0
    hadRunning := false
    for _, alloc := range all {
        if alloc.Job.Version == a.job.Version && alloc.Job.CreateIndex == a.job.CreateIndex {
            hadRunning = true
            break
        }
    }

    // Create a new deployment if necessary
    if !existingDeployment && strategy != nil && dstate.DesiredTotal != 0 && (!hadRunning || updatingSpec) {
        // A previous group may have made the deployment already
        if a.deployment == nil {
            a.deployment = structs.NewDeployment(a.job)
            a.result.deployment = a.deployment
        }

        // Attach the group's deployment state to the deployment
        a.deployment.TaskGroups[group] = dstate
    }

    // deploymentComplete is whether the deployment is complete, which largely
    // means that no placements were made or desired to be made
    deploymentComplete := len(destructive)+len(inplace)+len(place)+len(migrate)+len(rescheduleNow)+len(rescheduleLater) == 0 && !requireCanary

    // The final check for deployment completeness is to ensure everything is
    // healthy
    if deploymentComplete && a.deployment != nil {
        if dstate, ok := a.deployment.TaskGroups[group]; ok {
            if dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
                (dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
                deploymentComplete = false
            }
        }
    }

    return deploymentComplete
}
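
// As a worked example of the completeness check above (illustrative values):
// a group with DesiredTotal=3 and HealthyAllocs=2 keeps the deployment
// running even when nothing is left to place, and a group with
// DesiredCanaries=1 that has not been promoted does the same.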

// filterOldTerminalAllocs filters allocations that should be ignored since
// they are terminal allocations from a previous job version.
func (a *allocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) {
    if !a.batch {
        return all, nil
    }

    filtered = filtered.union(all)
    ignored := make(map[string]*structs.Allocation)

    // Ignore terminal batch jobs from older versions
    for id, alloc := range filtered {
        older := alloc.Job.Version < a.job.Version || alloc.Job.CreateIndex < a.job.CreateIndex
        if older && alloc.TerminalStatus() {
            delete(filtered, id)
            ignored[id] = alloc
        }
    }

    return filtered, ignored
}

// handleGroupCanaries handles the canaries for the group by stopping the
// unneeded ones and returning the current set of canaries and the updated
// total set of allocs for the group.
func (a *allocReconciler) handleGroupCanaries(all allocSet, desiredChanges *structs.DesiredUpdates) (canaries, newAll allocSet) {
    // Stop any canary from an older deployment or from a failed one
    var stop []string

    // Cancel any non-promoted canaries from the older deployment
    if a.oldDeployment != nil {
        for _, s := range a.oldDeployment.TaskGroups {
            if !s.Promoted {
                stop = append(stop, s.PlacedCanaries...)
            }
        }
    }

    // Cancel any non-promoted canaries from a failed deployment
    if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusFailed {
        for _, s := range a.deployment.TaskGroups {
            if !s.Promoted {
                stop = append(stop, s.PlacedCanaries...)
            }
        }
    }

    // stopSet is the allocSet that contains the canaries we desire to stop
    // from above.
    stopSet := all.fromKeys(stop)
    a.markStop(stopSet, "", allocNotNeeded)
    desiredChanges.Stop += uint64(len(stopSet))
    all = all.difference(stopSet)

    // Capture our current set of canaries and handle any migrations that are
    // needed by just stopping them.
    if a.deployment != nil {
        var canaryIDs []string
        for _, s := range a.deployment.TaskGroups {
            canaryIDs = append(canaryIDs, s.PlacedCanaries...)
        }

        canaries = all.fromKeys(canaryIDs)
        untainted, migrate, lost := canaries.filterByTainted(a.taintedNodes)
        a.markStop(migrate, "", allocMigrating)
        a.markStop(lost, structs.AllocClientStatusLost, allocLost)

        canaries = untainted
        all = all.difference(migrate, lost)
    }

    return canaries, all
}
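
// Illustrative canary cleanup: if the previous deployment placed two canaries
// that were never promoted, both are stopped here as allocNotNeeded; the same
// happens to canaries of the current deployment once it is marked failed.
// Canaries sitting on draining nodes are simply stopped rather than migrated.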

// computeLimit returns the placement limit for a particular group. The inputs
// are the group definition, the untainted, destructive, and migrate allocation
// sets, and whether we are in a canary state.
func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, destructive, migrate allocSet, canaryState bool) int {
    // If there is no update strategy or deployment for the group we can deploy
    // as many as the group has
    if group.Update == nil || len(destructive)+len(migrate) == 0 {
        return group.Count
    } else if a.deploymentPaused || a.deploymentFailed {
        // If the deployment is paused or failed, do not create anything else
        return 0
    }

    // If we have canaries and they have not been promoted the limit is 0
    if canaryState {
        return 0
    }

    // If we have been promoted or there are no canaries, the limit is the
    // configured MaxParallel minus any outstanding non-healthy allocs for the
    // deployment
    limit := group.Update.MaxParallel
    if a.deployment != nil {
        partOf, _ := untainted.filterByDeployment(a.deployment.ID)
        for _, alloc := range partOf {
            // An unhealthy allocation means nothing else should happen.
            if alloc.DeploymentStatus.IsUnhealthy() {
                return 0
            }

            if !alloc.DeploymentStatus.IsHealthy() {
                limit--
            }
        }
    }

    // The limit can be less than zero in the case that the job was changed
    // such that it required destructive changes and the count was scaled up.
    if limit < 0 {
        return 0
    }

    return limit
}

// computePlacements returns the set of allocations to place given the group
// definition and the sets of untainted, migrating and rescheduling allocations
// for the group.
func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
    nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult {

    // Add rescheduled placement results
    var place []allocPlaceResult
    for _, alloc := range reschedule {
        place = append(place, allocPlaceResult{
            name:          alloc.Name,
            taskGroup:     group,
            previousAlloc: alloc,
            reschedule:    true,
            canary:        alloc.DeploymentStatus.IsCanary(),
        })
    }

    // Hot path the nothing-to-do case
    existing := len(untainted) + len(migrate) + len(reschedule)
    if existing >= group.Count {
        return place
    }

    // Add remaining placement results
    if existing < group.Count {
        for _, name := range nameIndex.Next(uint(group.Count - existing)) {
            place = append(place, allocPlaceResult{
                name:      name,
                taskGroup: group,
            })
        }
    }

    return place
}
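
// A worked example for computeLimit (illustrative numbers): with
// MaxParallel=2 and one allocation of the current deployment placed but not
// yet healthy, the limit is 1; a single allocation marked unhealthy drops the
// limit straight to 0, halting further destructive updates.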

// computeStop returns the set of allocations that are marked for stopping
// given the group definition, the set of allocations in various states, and
// whether we are canarying.
func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *allocNameIndex,
    untainted, migrate, lost, canaries allocSet, canaryState bool) allocSet {

    // Mark all lost allocations for stop. The previous allocation doesn't
    // matter here since it is on a lost node.
    var stop allocSet
    stop = stop.union(lost)
    a.markStop(lost, structs.AllocClientStatusLost, allocLost)

    // If we are still deploying or creating canaries, don't stop them
    if canaryState {
        untainted = untainted.difference(canaries)
    }

    // Hot path the nothing-to-do case
    remove := len(untainted) + len(migrate) - group.Count
    if remove <= 0 {
        return stop
    }

    // Filter out any terminal allocations from the untainted set
    // so that we don't try to mark them as stopped redundantly.
    untainted = filterByTerminal(untainted)

    // Prefer stopping any alloc that has the same name as the canaries if we
    // are promoted
    if !canaryState && len(canaries) != 0 {
        canaryNames := canaries.nameSet()
        for id, alloc := range untainted.difference(canaries) {
            if _, match := canaryNames[alloc.Name]; match {
                stop[id] = alloc
                a.result.stop = append(a.result.stop, allocStopResult{
                    alloc:             alloc,
                    statusDescription: allocNotNeeded,
                })
                delete(untainted, id)

                remove--
                if remove == 0 {
                    return stop
                }
            }
        }
    }

    // Prefer selecting from the migrating set before stopping existing allocs
    if len(migrate) != 0 {
        mNames := newAllocNameIndex(a.jobID, group.Name, group.Count, migrate)
        removeNames := mNames.Highest(uint(remove))
        for id, alloc := range migrate {
            if _, match := removeNames[alloc.Name]; !match {
                continue
            }
            a.result.stop = append(a.result.stop, allocStopResult{
                alloc:             alloc,
                statusDescription: allocNotNeeded,
            })
            delete(migrate, id)
            stop[id] = alloc
            nameIndex.UnsetIndex(alloc.Index())

            remove--
            if remove == 0 {
                return stop
            }
        }
    }

    // Select the allocs with the highest count to remove
    removeNames := nameIndex.Highest(uint(remove))
    for id, alloc := range untainted {
        if _, ok := removeNames[alloc.Name]; ok {
            stop[id] = alloc
            a.result.stop = append(a.result.stop, allocStopResult{
                alloc:             alloc,
                statusDescription: allocNotNeeded,
            })
            delete(untainted, id)

            remove--
            if remove == 0 {
                return stop
            }
        }
    }

    // It is possible that we didn't stop as many as we should have if there
    // were allocations with duplicate names.
    for id, alloc := range untainted {
        stop[id] = alloc
        a.result.stop = append(a.result.stop, allocStopResult{
            alloc:             alloc,
            statusDescription: allocNotNeeded,
        })
        delete(untainted, id)

        remove--
        if remove == 0 {
            return stop
        }
    }

    return stop
}
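
// The stop selection above proceeds in priority order (a summary, not new
// behavior): lost allocations first, then allocations whose names collide
// with promoted canaries, then migrating allocations with the highest name
// indexes, then the highest-indexed remaining allocations, and finally
// duplicate-named allocations until the removal count is met.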

// computeUpdates determines which allocations for the passed group require
// updates. Three sets are returned:
// 1. Those that require no upgrades
// 2. Those that can be upgraded in-place. These are added to the results
//    automatically since the function contains the correct state to do so.
// 3. Those that require destructive updates
func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted allocSet) (ignore, inplace, destructive allocSet) {
    // Determine the set of allocations that need to be updated
    ignore = make(map[string]*structs.Allocation)
    inplace = make(map[string]*structs.Allocation)
    destructive = make(map[string]*structs.Allocation)

    for _, alloc := range untainted {
        ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.job, group)
        if ignoreChange {
            ignore[alloc.ID] = alloc
        } else if destructiveChange {
            destructive[alloc.ID] = alloc
        } else {
            inplace[alloc.ID] = alloc
            a.result.inplaceUpdate = append(a.result.inplaceUpdate, inplaceAlloc)
        }
    }

    return
}

// handleDelayedReschedules creates batched follow-up evaluations with the
// WaitUntil field set for allocations that are eligible to be rescheduled
// later.
func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
    if len(rescheduleLater) == 0 {
        return
    }

    // Sort by time
    sort.Slice(rescheduleLater, func(i, j int) bool {
        return rescheduleLater[i].rescheduleTime.Before(rescheduleLater[j].rescheduleTime)
    })

    var evals []*structs.Evaluation
    nextReschedTime := rescheduleLater[0].rescheduleTime
    allocIDToFollowupEvalID := make(map[string]string, len(rescheduleLater))

    // Create a new eval for the first batch
    eval := &structs.Evaluation{
        ID:                uuid.Generate(),
        Namespace:         a.job.Namespace,
        Priority:          a.job.Priority,
        Type:              a.job.Type,
        TriggeredBy:       structs.EvalTriggerRetryFailedAlloc,
        JobID:             a.job.ID,
        JobModifyIndex:    a.job.ModifyIndex,
        Status:            structs.EvalStatusPending,
        StatusDescription: reschedulingFollowupEvalDesc,
        WaitUntil:         nextReschedTime,
    }
    evals = append(evals, eval)

    for _, allocReschedInfo := range rescheduleLater {
        if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
            allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
        } else {
            // Start a new batch
            nextReschedTime = allocReschedInfo.rescheduleTime
            // Create a new eval for the new batch
            eval = &structs.Evaluation{
                ID:             uuid.Generate(),
                Namespace:      a.job.Namespace,
                Priority:       a.job.Priority,
                Type:           a.job.Type,
                TriggeredBy:    structs.EvalTriggerRetryFailedAlloc,
                JobID:          a.job.ID,
                JobModifyIndex: a.job.ModifyIndex,
                Status:         structs.EvalStatusPending,
                WaitUntil:      nextReschedTime,
            }
            evals = append(evals, eval)
            // Set the evalID for the first alloc in this new batch
            allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
        }
    }

    a.result.desiredFollowupEvals[tgName] = evals

    // Initialize the annotations
    if len(allocIDToFollowupEvalID) != 0 && a.result.attributeUpdates == nil {
        a.result.attributeUpdates = make(map[string]*structs.Allocation)
    }

    // Create in-place updates for every alloc ID that needs to be updated with
    // its follow-up eval ID
    for allocID, evalID := range allocIDToFollowupEvalID {
        existingAlloc := all[allocID]
        updatedAlloc := existingAlloc.Copy()
        updatedAlloc.FollowupEvalID = evalID
        a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
    }
}
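
// exampleBatchWindows is an illustrative helper (an assumption for this
// sketch, not part of the original file) isolating the batching rule used by
// handleDelayedReschedules: sorted reschedule times are grouped into windows
// of batchedFailedAllocWindowSize anchored at the first time in each batch.
// For times t, t+3s, and t+10s with the 5-second window, this yields two
// batches, matching the two follow-up evals the reconciler would create with
// WaitUntil of t and t+10s respectively.
func exampleBatchWindows(times []time.Time) [][]time.Time {
    if len(times) == 0 {
        return nil
    }
    sort.Slice(times, func(i, j int) bool { return times[i].Before(times[j]) })

    // Seed the first batch with the earliest time
    batches := [][]time.Time{{times[0]}}
    start := times[0]
    for _, t := range times[1:] {
        if t.Sub(start) < batchedFailedAllocWindowSize {
            // Within the window: join the current batch
            last := len(batches) - 1
            batches[last] = append(batches[last], t)
        } else {
            // Outside the window: start a new batch anchored at this time
            start = t
            batches = append(batches, []time.Time{t})
        }
    }
    return batches
}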