github.com/anuvu/nomad@v0.8.7-atom1/scheduler/reconcile.go

package scheduler

import (
	"fmt"
	"log"
	"sort"
	"time"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// batchedFailedAllocWindowSize is the window size used
	// to batch up failed allocations before creating an eval
	batchedFailedAllocWindowSize = 5 * time.Second

	// rescheduleWindowSize is the window size relative to
	// current time within which reschedulable allocations are placed.
	// This helps protect against small clock drifts between servers.
	rescheduleWindowSize = 1 * time.Second
)

// allocUpdateType takes an existing allocation and a new job definition and
// returns whether the allocation can ignore the change, requires a destructive
// update, or can be updated in place. If it can be updated in place, an updated
// allocation that has the new resources and alloc metrics attached is returned.
type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job,
	newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation)

// allocReconciler is used to determine the set of allocations that require
// placement, in-place updating, or stopping given the job specification and
// existing cluster state. The reconciler should only be used for batch and
// service jobs.
type allocReconciler struct {
	// logger is used to log debug information. Logging should be kept to a
	// minimum here.
	logger *log.Logger

	// allocUpdateFn is used to check whether an allocation can be updated in
	// place and, if so, to produce the updated allocation.
	allocUpdateFn allocUpdateType

	// batch marks whether the job is a batch job
	batch bool

	// job is the job being operated on; it may be nil if the job is being
	// stopped via a purge
	job *structs.Job

	// jobID is the ID of the job being operated on. The job may be nil if it is
	// being stopped, so we require this separately.
	jobID string

	// oldDeployment is the last deployment for the job
	oldDeployment *structs.Deployment

	// deployment is the current deployment for the job
	deployment *structs.Deployment

	// deploymentPaused marks whether the deployment is paused
	deploymentPaused bool

	// deploymentFailed marks whether the deployment is failed
	deploymentFailed bool

	// taintedNodes contains a map of nodes that are tainted
	taintedNodes map[string]*structs.Node

	// existingAllocs is the set of non-terminal existing allocations
	existingAllocs []*structs.Allocation

	// evalID is the ID of the evaluation that triggered the reconciler
	evalID string

	// now is the time used when determining rescheduling eligibility.
	// It defaults to time.Now and is overridden in unit tests.
	now time.Time

	// result is the result of the reconcile. During computation it can be
	// used to store intermediate state.
	result *reconcileResults
}
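// allocUpdateFnVersionCheck is a minimal sketch of an allocUpdateType and is not part of
// the original scheduler: it keeps allocations that already run the current job version
// and marks everything else for a destructive update. The real update functions used by
// the generic scheduler additionally attempt in-place updates when only resources change.
func allocUpdateFnVersionCheck(existing *structs.Allocation, newJob *structs.Job,
	newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) {
	// Allocations created from the same job version and create index are left alone.
	if existing.Job.Version == newJob.Version && existing.Job.CreateIndex == newJob.CreateIndex {
		return true, false, nil
	}

	// Anything else is replaced destructively; no in-place candidate is produced.
	return false, true, nil
}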
// reconcileResults contains the results of the reconciliation and should be
// applied by the scheduler.
type reconcileResults struct {
	// deployment is the deployment that should be created or updated as a
	// result of scheduling
	deployment *structs.Deployment

	// deploymentUpdates contains a set of deployment updates that should be
	// applied as a result of scheduling
	deploymentUpdates []*structs.DeploymentStatusUpdate

	// place is the set of allocations to place by the scheduler
	place []allocPlaceResult

	// destructiveUpdate is the set of allocations to apply a destructive update to
	destructiveUpdate []allocDestructiveResult

	// inplaceUpdate is the set of allocations to apply an in-place update to
	inplaceUpdate []*structs.Allocation

	// stop is the set of allocations to stop
	stop []allocStopResult

	// attributeUpdates are updates to the allocation that are not from a
	// jobspec change.
	attributeUpdates map[string]*structs.Allocation

	// desiredTGUpdates captures the desired set of changes to make for each
	// task group.
	desiredTGUpdates map[string]*structs.DesiredUpdates

	// desiredFollowupEvals is the map of follow-up evaluations to create per task group.
	// This is used to create a delayed evaluation for rescheduling failed allocations.
	desiredFollowupEvals map[string][]*structs.Evaluation
}

// delayedRescheduleInfo contains the allocation ID and the time when it is eligible
// to be rescheduled. This is used to create follow-up evaluations.
type delayedRescheduleInfo struct {
	// allocID is the ID of the allocation eligible to be rescheduled
	allocID string

	// rescheduleTime is the time to use in the delayed evaluation
	rescheduleTime time.Time
}

func (r *reconcileResults) GoString() string {
	base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)",
		len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop))

	if r.deployment != nil {
		base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
	}
	for _, u := range r.deploymentUpdates {
		base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q",
			u.DeploymentID, u.Status, u.StatusDescription)
	}
	for tg, u := range r.desiredTGUpdates {
		base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u)
	}
	return base
}

// Changes returns the number of total changes
func (r *reconcileResults) Changes() int {
	return len(r.place) + len(r.inplaceUpdate) + len(r.stop)
}

// NewAllocReconciler creates a new reconciler that should be used to determine
// the changes required to bring the cluster state in line with the declared jobspec.
func NewAllocReconciler(logger *log.Logger, allocUpdateFn allocUpdateType, batch bool,
	jobID string, job *structs.Job, deployment *structs.Deployment,
	existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string) *allocReconciler {
	return &allocReconciler{
		logger:         logger,
		allocUpdateFn:  allocUpdateFn,
		batch:          batch,
		jobID:          jobID,
		job:            job,
		deployment:     deployment.Copy(),
		existingAllocs: existingAllocs,
		taintedNodes:   taintedNodes,
		evalID:         evalID,
		now:            time.Now(),
		result: &reconcileResults{
			desiredTGUpdates:     make(map[string]*structs.DesiredUpdates),
			desiredFollowupEvals: make(map[string][]*structs.Evaluation),
		},
	}
}
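// reconcileJob is a minimal usage sketch and is not part of the original scheduler: it
// shows how a caller might wire a job's current state into the reconciler and consume
// the results. The names reconcileJob and updateFn are assumptions for illustration; the
// real generic scheduler applies result.place, result.stop, result.destructiveUpdate,
// and the deployment updates through its plan.
func reconcileJob(logger *log.Logger, updateFn allocUpdateType, job *structs.Job,
	deployment *structs.Deployment, allocs []*structs.Allocation,
	tainted map[string]*structs.Node, evalID string) *reconcileResults {
	reconciler := NewAllocReconciler(logger, updateFn, job.Type == structs.JobTypeBatch,
		job.ID, job, deployment, allocs, tainted, evalID)

	results := reconciler.Compute()
	logger.Printf("[DEBUG] sched: reconcile produced %d changes: %s", results.Changes(), results.GoString())
	return results
}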
// Compute reconciles the existing cluster state and returns the set of changes
// required to converge the job spec and state.
func (a *allocReconciler) Compute() *reconcileResults {
	// Create the allocation matrix
	m := newAllocMatrix(a.job, a.existingAllocs)

	// Handle stopping unneeded deployments
	a.cancelDeployments()

	// If we are just stopping a job we do not need to do anything more than
	// stopping all running allocs
	if a.job.Stopped() {
		a.handleStop(m)
		return a.result
	}

	// Detect if the deployment is paused
	if a.deployment != nil {
		a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
	}

	// Reconcile each group
	complete := true
	for group, as := range m {
		groupComplete := a.computeGroup(group, as)
		complete = complete && groupComplete
	}

	// Mark the deployment as complete if possible
	if a.deployment != nil && complete {
		a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
			DeploymentID:      a.deployment.ID,
			Status:            structs.DeploymentStatusSuccessful,
			StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
		})
	}

	// Set the description of a created deployment
	if d := a.result.deployment; d != nil {
		if d.RequiresPromotion() {
			d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
		}
	}

	return a.result
}

// cancelDeployments cancels any deployment that is not needed
func (a *allocReconciler) cancelDeployments() {
	// If the job is stopped and there is a non-terminal deployment, cancel it
	if a.job.Stopped() {
		if a.deployment != nil && a.deployment.Active() {
			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
				DeploymentID:      a.deployment.ID,
				Status:            structs.DeploymentStatusCancelled,
				StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
			})
		}

		// Nothing else to do
		a.oldDeployment = a.deployment
		a.deployment = nil
		return
	}

	d := a.deployment
	if d == nil {
		return
	}

	// Check if the deployment is active and referencing an older job and cancel it
	if d.JobCreateIndex != a.job.CreateIndex || d.JobVersion != a.job.Version {
		if d.Active() {
			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
				DeploymentID:      a.deployment.ID,
				Status:            structs.DeploymentStatusCancelled,
				StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
			})
		}

		a.oldDeployment = d
		a.deployment = nil
	}

	// Clear it as the current deployment if it is successful
	if d.Status == structs.DeploymentStatusSuccessful {
		a.oldDeployment = d
		a.deployment = nil
	}
}

// handleStop marks all allocations to be stopped, handling the lost case
func (a *allocReconciler) handleStop(m allocMatrix) {
	for group, as := range m {
		as = filterByTerminal(as)
		untainted, migrate, lost := as.filterByTainted(a.taintedNodes)
		a.markStop(untainted, "", allocNotNeeded)
		a.markStop(migrate, "", allocNotNeeded)
		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
		desiredChanges := new(structs.DesiredUpdates)
		desiredChanges.Stop = uint64(len(as))
		a.result.desiredTGUpdates[group] = desiredChanges
	}
}
// markStop is a helper for marking a set of allocations for stop with a
// particular client status and description.
func (a *allocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
	for _, alloc := range allocs {
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			clientStatus:      clientStatus,
			statusDescription: statusDescription,
		})
	}
}

// computeGroup reconciles state for a particular task group. It returns whether
// the deployment it is for is complete with regards to the task group.
func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
	// Create the desired update object for the group
	desiredChanges := new(structs.DesiredUpdates)
	a.result.desiredTGUpdates[group] = desiredChanges

	// Get the task group. The task group may be nil if the job was updated such
	// that the task group no longer exists.
	tg := a.job.LookupTaskGroup(group)

	// If the task group is nil, then the task group has been removed so all we
	// need to do is stop everything
	if tg == nil {
		untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
		a.markStop(untainted, "", allocNotNeeded)
		a.markStop(migrate, "", allocNotNeeded)
		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
		desiredChanges.Stop = uint64(len(untainted) + len(migrate) + len(lost))
		return true
	}

	// Get the deployment state for the group
	var dstate *structs.DeploymentState
	existingDeployment := false
	if a.deployment != nil {
		dstate, existingDeployment = a.deployment.TaskGroups[group]
	}
	if !existingDeployment {
		dstate = &structs.DeploymentState{}
		if tg.Update != nil {
			dstate.AutoRevert = tg.Update.AutoRevert
			dstate.ProgressDeadline = tg.Update.ProgressDeadline
		}
	}

	// Filter allocations that do not need to be considered because they are
	// from an older job version and are terminal.
	all, ignore := a.filterOldTerminalAllocs(all)
	desiredChanges.Ignore += uint64(len(ignore))

	// canaries is the set of canaries for the current deployment and all is all
	// allocs including the canaries
	canaries, all := a.handleGroupCanaries(all, desiredChanges)

	// Determine what set of allocations are on tainted nodes
	untainted, migrate, lost := all.filterByTainted(a.taintedNodes)

	// Determine what set of terminal allocations need to be rescheduled
	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, a.now, a.evalID, a.deployment)

	// Create batched follow-up evaluations for allocations that are
	// reschedulable later and mark the allocations for in-place updating.
	a.handleDelayedReschedules(rescheduleLater, all, tg.Name)

	// Create a structure for choosing names. Seed it with the taken names, which is
	// the union of the untainted, migrating, and rescheduling allocations (including canaries).
	nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow))

	// Stop any unneeded allocations and update the untainted set to not
	// include stopped allocations.
	canaryState := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
	stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, canaryState)
	desiredChanges.Stop += uint64(len(stop))
	untainted = untainted.difference(stop)

	// Do in-place upgrades where possible and capture the set of upgrades that
	// need to be done destructively.
	ignore, inplace, destructive := a.computeUpdates(tg, untainted)
	desiredChanges.Ignore += uint64(len(ignore))
	desiredChanges.InPlaceUpdate += uint64(len(inplace))
	if !existingDeployment {
		dstate.DesiredTotal += len(destructive) + len(inplace)
	}

	// Remove the canaries now that we have handled rescheduling so that we do
	// not consider them when making placement decisions.
	if canaryState {
		untainted = untainted.difference(canaries)
	}

	// Having destructive updates and fewer canaries than desired means we need
	// to create canaries.
	numDestructive := len(destructive)
	strategy := tg.Update
	canariesPromoted := dstate != nil && dstate.Promoted
	requireCanary := numDestructive != 0 && strategy != nil && len(canaries) < strategy.Canary && !canariesPromoted
	if requireCanary && !a.deploymentPaused && !a.deploymentFailed {
		number := strategy.Canary - len(canaries)
		desiredChanges.Canary += uint64(number)
		if !existingDeployment {
			dstate.DesiredCanaries = strategy.Canary
		}

		for _, name := range nameIndex.NextCanaries(uint(number), canaries, destructive) {
			a.result.place = append(a.result.place, allocPlaceResult{
				name:      name,
				canary:    true,
				taskGroup: tg,
			})
		}
	}

	// Determine how many we can place
	canaryState = dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
	limit := a.computeLimit(tg, untainted, destructive, migrate, canaryState)

	// Place if:
	// * The deployment is not paused or failed
	// * Not placing any canaries
	// * If there are any canaries, they have been promoted
	place := a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow)
	if !existingDeployment {
		dstate.DesiredTotal += len(place)
	}

	// deploymentPlaceReady tracks whether the deployment is in a state where
	// placements can be made without any other consideration.
	deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !canaryState

	if deploymentPlaceReady {
		desiredChanges.Place += uint64(len(place))
		for _, p := range place {
			a.result.place = append(a.result.place, p)
		}

		min := helper.IntMin(len(place), limit)
		limit -= min
	} else if !deploymentPlaceReady {
		// We do not want to place additional allocations, but in the case that we
		// have lost allocations or allocations that require rescheduling now,
		// we do so regardless to avoid odd user experiences.
		if len(lost) != 0 {
			allowed := helper.IntMin(len(lost), len(place))
			desiredChanges.Place += uint64(allowed)
			for _, p := range place[:allowed] {
				a.result.place = append(a.result.place, p)
			}
		}

		// Handle rescheduling of failed allocations even if the deployment is
		// failed. We do not reschedule if the allocation is part of the failed
		// deployment.
		if now := len(rescheduleNow); now != 0 {
			for _, p := range place {
				prev := p.PreviousAllocation()
				if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
					a.result.place = append(a.result.place, p)
					desiredChanges.Place++
				}
			}
		}
	}

	if deploymentPlaceReady {
		// Do all destructive updates
		min := helper.IntMin(len(destructive), limit)
		desiredChanges.DestructiveUpdate += uint64(min)
		desiredChanges.Ignore += uint64(len(destructive) - min)
		for _, alloc := range destructive.nameOrder()[:min] {
			a.result.destructiveUpdate = append(a.result.destructiveUpdate, allocDestructiveResult{
				placeName:             alloc.Name,
				placeTaskGroup:        tg,
				stopAlloc:             alloc,
				stopStatusDescription: allocUpdating,
			})
		}
	} else {
		desiredChanges.Ignore += uint64(len(destructive))
	}

	// Migrate all the allocations
	desiredChanges.Migrate += uint64(len(migrate))
	for _, alloc := range migrate.nameOrder() {
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			statusDescription: allocMigrating,
		})
		a.result.place = append(a.result.place, allocPlaceResult{
			name:          alloc.Name,
			canary:        false,
			taskGroup:     tg,
			previousAlloc: alloc,
		})
	}

	// Create a new deployment if:
	// 1. Updating a job specification
	// 2. No running allocations (first time running a job)
	updatingSpec := len(destructive) != 0 || len(a.result.inplaceUpdate) != 0
	hadRunning := false
	for _, alloc := range all {
		if alloc.Job.Version == a.job.Version && alloc.Job.CreateIndex == a.job.CreateIndex {
			hadRunning = true
			break
		}
	}

	// Create a new deployment if necessary
	if !existingDeployment && strategy != nil && dstate.DesiredTotal != 0 && (!hadRunning || updatingSpec) {
		// A previous group may have made the deployment already
		if a.deployment == nil {
			a.deployment = structs.NewDeployment(a.job)
			a.result.deployment = a.deployment
		}

		// Attach the group's deployment state to the deployment
		a.deployment.TaskGroups[group] = dstate
	}

	// deploymentComplete is whether the deployment is complete, which largely
	// means that no placements were made or desired to be made.
	deploymentComplete := len(destructive)+len(inplace)+len(place)+len(migrate)+len(rescheduleNow)+len(rescheduleLater) == 0 && !requireCanary

	// The final check for deployment completeness is to ensure everything is
	// healthy.
	if deploymentComplete && a.deployment != nil {
		if dstate, ok := a.deployment.TaskGroups[group]; ok {
			if dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
				(dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
				deploymentComplete = false
			}
		}
	}

	return deploymentComplete
}
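// The following is an illustrative walk-through, not part of the original file, of how
// computeGroup's bookkeeping plays out for a simple rolling update. Assume a service
// group with Count = 3, an Update stanza with MaxParallel = 1 and no canaries, and a
// job change that requires destructive updates to all three existing allocations:
//
//   - computeUpdates classifies all 3 allocations as destructive; none are in-place.
//   - A new deployment is created for the group with DesiredTotal = 3.
//   - computeLimit returns MaxParallel (1), so only 1 destructive update is emitted in
//     this pass; the other 2 are counted under desiredChanges.Ignore.
//   - As each replacement becomes healthy, later evaluations free up the limit again and
//     the remaining allocations are rolled one at a time until the deployment completes.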
// filterOldTerminalAllocs filters allocations that should be ignored since they
// are allocations that are terminal from a previous job version.
func (a *allocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) {
	if !a.batch {
		return all, nil
	}

	filtered = filtered.union(all)
	ignored := make(map[string]*structs.Allocation)

	// Ignore terminal batch jobs from older versions
	for id, alloc := range filtered {
		older := alloc.Job.Version < a.job.Version || alloc.Job.CreateIndex < a.job.CreateIndex
		if older && alloc.TerminalStatus() {
			delete(filtered, id)
			ignored[id] = alloc
		}
	}

	return filtered, ignored
}

// handleGroupCanaries handles the canaries for the group by stopping the
// unneeded ones and returning the current set of canaries and the updated total
// set of allocs for the group.
func (a *allocReconciler) handleGroupCanaries(all allocSet, desiredChanges *structs.DesiredUpdates) (canaries, newAll allocSet) {
	// Stop any canary from an older deployment or from a failed one
	var stop []string

	// Cancel any non-promoted canaries from the older deployment
	if a.oldDeployment != nil {
		for _, s := range a.oldDeployment.TaskGroups {
			if !s.Promoted {
				stop = append(stop, s.PlacedCanaries...)
			}
		}
	}

	// Cancel any non-promoted canaries from a failed deployment
	if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusFailed {
		for _, s := range a.deployment.TaskGroups {
			if !s.Promoted {
				stop = append(stop, s.PlacedCanaries...)
			}
		}
	}

	// stopSet is the allocSet that contains the canaries we desire to stop from
	// above.
	stopSet := all.fromKeys(stop)
	a.markStop(stopSet, "", allocNotNeeded)
	desiredChanges.Stop += uint64(len(stopSet))
	all = all.difference(stopSet)

	// Capture our current set of canaries and handle any migrations that are
	// needed by just stopping them.
	if a.deployment != nil {
		var canaryIDs []string
		for _, s := range a.deployment.TaskGroups {
			canaryIDs = append(canaryIDs, s.PlacedCanaries...)
		}

		canaries = all.fromKeys(canaryIDs)
		untainted, migrate, lost := canaries.filterByTainted(a.taintedNodes)
		a.markStop(migrate, "", allocMigrating)
		a.markStop(lost, structs.AllocClientStatusLost, allocLost)

		canaries = untainted
		all = all.difference(migrate, lost)
	}

	return canaries, all
}

// computeLimit returns the placement limit for a particular group. The inputs
// are the group definition, the untainted, destructive, and migrate allocation
// sets, and whether we are in a canary state.
func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, destructive, migrate allocSet, canaryState bool) int {
	// If there is no update strategy or deployment for the group we can deploy
	// as many as the group has
	if group.Update == nil || len(destructive)+len(migrate) == 0 {
		return group.Count
	} else if a.deploymentPaused || a.deploymentFailed {
		// If the deployment is paused or failed, do not create anything else
		return 0
	}

	// If we have canaries and they have not been promoted the limit is 0
	if canaryState {
		return 0
	}

	// If we have been promoted or there are no canaries, the limit is the
	// configured MaxParallel minus any outstanding non-healthy allocs for the
	// deployment.
	limit := group.Update.MaxParallel
	if a.deployment != nil {
		partOf, _ := untainted.filterByDeployment(a.deployment.ID)
		for _, alloc := range partOf {
			// An unhealthy allocation means nothing else should happen.
			if alloc.DeploymentStatus.IsUnhealthy() {
				return 0
			}

			if !alloc.DeploymentStatus.IsHealthy() {
				limit--
			}
		}
	}

	// The limit can be less than zero in the case that the job was changed such
	// that it required destructive changes and the count was scaled up.
	if limit < 0 {
		return 0
	}

	return limit
}
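// A short worked example of the limit computation above (illustrative, not part of the
// original file): with MaxParallel = 2 and a current deployment whose untainted
// allocations include one that has been placed but not yet reported healthy, the limit
// is 2 - 1 = 1, so one more destructive update or placement may proceed in this pass.
// If any allocation in the deployment is explicitly unhealthy, the limit drops to 0 and
// this reconciler pass makes no further updates for the group.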
// computePlacements returns the set of allocations to place given the group
// definition and the sets of untainted, migrating, and rescheduling allocations for the group.
func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
	nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult {

	// Add rescheduled placement results
	var place []allocPlaceResult
	for _, alloc := range reschedule {
		place = append(place, allocPlaceResult{
			name:          alloc.Name,
			taskGroup:     group,
			previousAlloc: alloc,
			reschedule:    true,
			canary:        alloc.DeploymentStatus.IsCanary(),
		})
	}

	// Hot path the nothing-to-do case
	existing := len(untainted) + len(migrate) + len(reschedule)
	if existing >= group.Count {
		return place
	}

	// Add remaining placement results
	if existing < group.Count {
		for _, name := range nameIndex.Next(uint(group.Count - existing)) {
			place = append(place, allocPlaceResult{
				name:      name,
				taskGroup: group,
			})
		}
	}

	return place
}

// computeStop returns the set of allocations that are marked for stopping given
// the group definition, the set of allocations in various states, and whether we
// are canarying.
func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *allocNameIndex,
	untainted, migrate, lost, canaries allocSet, canaryState bool) allocSet {

	// Mark all lost allocations for stop. The previous allocation doesn't matter
	// here since it is on a lost node.
	var stop allocSet
	stop = stop.union(lost)
	a.markStop(lost, structs.AllocClientStatusLost, allocLost)

	// If we are still deploying or creating canaries, don't stop them
	if canaryState {
		untainted = untainted.difference(canaries)
	}

	// Hot path the nothing-to-do case
	remove := len(untainted) + len(migrate) - group.Count
	if remove <= 0 {
		return stop
	}

	// Filter out any terminal allocations from the untainted set so that we
	// don't try to mark them as stopped redundantly.
	untainted = filterByTerminal(untainted)

	// Prefer stopping any alloc that has the same name as the canaries if we
	// are promoted
	if !canaryState && len(canaries) != 0 {
		canaryNames := canaries.nameSet()
		for id, alloc := range untainted.difference(canaries) {
			if _, match := canaryNames[alloc.Name]; match {
				stop[id] = alloc
				a.result.stop = append(a.result.stop, allocStopResult{
					alloc:             alloc,
					statusDescription: allocNotNeeded,
				})
				delete(untainted, id)

				remove--
				if remove == 0 {
					return stop
				}
			}
		}
	}

	// Prefer selecting from the migrating set before stopping existing allocs
	if len(migrate) != 0 {
		mNames := newAllocNameIndex(a.jobID, group.Name, group.Count, migrate)
		removeNames := mNames.Highest(uint(remove))
		for id, alloc := range migrate {
			if _, match := removeNames[alloc.Name]; !match {
				continue
			}
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             alloc,
				statusDescription: allocNotNeeded,
			})
			delete(migrate, id)
			stop[id] = alloc
			nameIndex.UnsetIndex(alloc.Index())

			remove--
			if remove == 0 {
				return stop
			}
		}
	}

	// Select the allocs with the highest count to remove
	removeNames := nameIndex.Highest(uint(remove))
	for id, alloc := range untainted {
		if _, ok := removeNames[alloc.Name]; ok {
			stop[id] = alloc
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             alloc,
				statusDescription: allocNotNeeded,
			})
			delete(untainted, id)

			remove--
			if remove == 0 {
				return stop
			}
		}
	}

	// It is possible that we didn't stop as many as we should have if there
	// were allocations with duplicate names.
	for id, alloc := range untainted {
		stop[id] = alloc
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			statusDescription: allocNotNeeded,
		})
		delete(untainted, id)

		remove--
		if remove == 0 {
			return stop
		}
	}

	return stop
}
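// An illustrative example of the stop selection above, not part of the original file:
// when a group named "web" in job "web" is scaled from Count = 5 to Count = 3 with no
// canaries or migrations, remove = 2 and nameIndex.Highest(2) picks the two
// highest-indexed allocation names (web.web[3] and web.web[4]), so those two allocations
// are stopped while indexes 0 through 2 keep running.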
// computeUpdates determines which allocations for the passed group require
// updates. Three sets are returned:
// 1. Those that require no upgrades
// 2. Those that can be upgraded in-place. These are added to the results
//    automatically since the function contains the correct state to do so.
// 3. Those that require destructive updates
func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted allocSet) (ignore, inplace, destructive allocSet) {
	// Determine the set of allocations that need to be updated
	ignore = make(map[string]*structs.Allocation)
	inplace = make(map[string]*structs.Allocation)
	destructive = make(map[string]*structs.Allocation)

	for _, alloc := range untainted {
		ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.job, group)
		if ignoreChange {
			ignore[alloc.ID] = alloc
		} else if destructiveChange {
			destructive[alloc.ID] = alloc
		} else {
			inplace[alloc.ID] = alloc
			a.result.inplaceUpdate = append(a.result.inplaceUpdate, inplaceAlloc)
		}
	}

	return
}

// handleDelayedReschedules creates batched follow-up evaluations with the WaitUntil field
// set for allocations that are eligible to be rescheduled later.
func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
	if len(rescheduleLater) == 0 {
		return
	}

	// Sort by time
	sort.Slice(rescheduleLater, func(i, j int) bool {
		return rescheduleLater[i].rescheduleTime.Before(rescheduleLater[j].rescheduleTime)
	})

	var evals []*structs.Evaluation
	nextReschedTime := rescheduleLater[0].rescheduleTime
	allocIDToFollowupEvalID := make(map[string]string, len(rescheduleLater))

	// Create a new eval for the first batch
	eval := &structs.Evaluation{
		ID:                uuid.Generate(),
		Namespace:         a.job.Namespace,
		Priority:          a.job.Priority,
		Type:              a.job.Type,
		TriggeredBy:       structs.EvalTriggerRetryFailedAlloc,
		JobID:             a.job.ID,
		JobModifyIndex:    a.job.ModifyIndex,
		Status:            structs.EvalStatusPending,
		StatusDescription: reschedulingFollowupEvalDesc,
		WaitUntil:         nextReschedTime,
	}
	evals = append(evals, eval)

	for _, allocReschedInfo := range rescheduleLater {
		if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
		} else {
			// Start a new batch
			nextReschedTime = allocReschedInfo.rescheduleTime
			// Create a new eval for the new batch
			eval = &structs.Evaluation{
				ID:             uuid.Generate(),
				Namespace:      a.job.Namespace,
				Priority:       a.job.Priority,
				Type:           a.job.Type,
				TriggeredBy:    structs.EvalTriggerRetryFailedAlloc,
				JobID:          a.job.ID,
				JobModifyIndex: a.job.ModifyIndex,
				Status:         structs.EvalStatusPending,
				WaitUntil:      nextReschedTime,
			}
			evals = append(evals, eval)
			// Set the evalID for the first alloc in this new batch
			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
		}
	}

	a.result.desiredFollowupEvals[tgName] = evals

	// Initialize the annotations
	if len(allocIDToFollowupEvalID) != 0 && a.result.attributeUpdates == nil {
		a.result.attributeUpdates = make(map[string]*structs.Allocation)
	}

	// Create in-place updates for every alloc ID that needs to be updated with its
	// follow-up eval ID.
	for allocID, evalID := range allocIDToFollowupEvalID {
		existingAlloc := all[allocID]
		updatedAlloc := existingAlloc.Copy()
		updatedAlloc.FollowupEvalID = evalID
		a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
	}
}
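// batchRescheduleTimes is a standalone sketch, not part of the original file, of the
// batching rule applied by handleDelayedReschedules: reschedule times falling within
// batchedFailedAllocWindowSize of the first time in the current batch share a single
// follow-up evaluation. For example, times of now+1s, now+3s, and now+10s yield two
// batches, and therefore two follow-up evaluations with WaitUntil of now+1s and now+10s.
func batchRescheduleTimes(times []time.Time) [][]time.Time {
	if len(times) == 0 {
		return nil
	}

	// Sort ascending so batches are formed in chronological order.
	sort.Slice(times, func(i, j int) bool { return times[i].Before(times[j]) })

	var batches [][]time.Time
	batchStart := times[0]
	current := []time.Time{times[0]}
	for _, t := range times[1:] {
		if t.Sub(batchStart) < batchedFailedAllocWindowSize {
			// Still within the window opened by the batch's first entry.
			current = append(current, t)
			continue
		}
		// Outside the window: close the current batch and start a new one.
		batches = append(batches, current)
		batchStart = t
		current = []time.Time{t}
	}
	return append(batches, current)
}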