github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/scheduler/reconcile.go

package scheduler

import (
	"fmt"
	"log"
	"sort"
	"time"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// batchedFailedAllocWindowSize is the window size used
	// to batch up failed allocations before creating an eval
	batchedFailedAllocWindowSize = 5 * time.Second

	// rescheduleWindowSize is the window size relative to
	// current time within which reschedulable allocations are placed.
	// This helps protect against small clock drifts between servers
	rescheduleWindowSize = 1 * time.Second
)

// allocUpdateType takes an existing allocation and a new job definition and
// returns whether the allocation can ignore the change, requires a destructive
// update, or can be inplace updated. If it can be inplace updated, an updated
// allocation that has the new resources and alloc metrics attached will be
// returned.
type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job,
	newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation)

// allocReconciler is used to determine the set of allocations that require
// placement, inplace updating or stopping given the job specification and
// existing cluster state. The reconciler should only be used for batch and
// service jobs.
type allocReconciler struct {
	// logger is used to log debug information. Logging should be kept to a
	// minimum here
	logger *log.Logger

	// allocUpdateFn is used to check if the allocation can be inplace upgraded
	allocUpdateFn allocUpdateType

	// batch marks whether the job is a batch job
	batch bool

	// job is the job being operated on, it may be nil if the job is being
	// stopped via a purge
	job *structs.Job

	// jobID is the ID of the job being operated on. The job may be nil if it is
	// being stopped so we require this separately.
	jobID string

	// oldDeployment is the last deployment for the job
	oldDeployment *structs.Deployment

	// deployment is the current deployment for the job
	deployment *structs.Deployment

	// deploymentPaused marks whether the deployment is paused
	deploymentPaused bool

	// deploymentFailed marks whether the deployment is failed
	deploymentFailed bool

	// taintedNodes contains a map of nodes that are tainted
	taintedNodes map[string]*structs.Node

	// existingAllocs is non-terminal existing allocations
	existingAllocs []*structs.Allocation

	// evalID is the ID of the evaluation that triggered the reconciler
	evalID string

	// now is the time used when determining rescheduling eligibility;
	// defaults to time.Now, and overridden in unit tests
	now time.Time

	// result is the results of the reconcile. During computation it can be
	// used to store intermediate state
	result *reconcileResults
}

// reconcileResults contains the results of the reconciliation and should be
// applied by the scheduler.
type reconcileResults struct {
	// deployment is the deployment that should be created or updated as a
	// result of scheduling
	deployment *structs.Deployment

	// deploymentUpdates contains a set of deployment updates that should be
	// applied as a result of scheduling
	deploymentUpdates []*structs.DeploymentStatusUpdate

	// place is the set of allocations to place by the scheduler
	place []allocPlaceResult

	// destructiveUpdate is the set of allocations to apply a destructive update to
	destructiveUpdate []allocDestructiveResult

	// inplaceUpdate is the set of allocations to apply an inplace update to
	inplaceUpdate []*structs.Allocation

	// stop is the set of allocations to stop
	stop []allocStopResult

	// attributeUpdates are updates to the allocation that are not from a
	// jobspec change.
	attributeUpdates map[string]*structs.Allocation

	// desiredTGUpdates captures the desired set of changes to make for each
	// task group.
	desiredTGUpdates map[string]*structs.DesiredUpdates

	// desiredFollowupEvals is the map of follow up evaluations to create per task group.
	// This is used to create a delayed evaluation for rescheduling failed allocations.
	desiredFollowupEvals map[string][]*structs.Evaluation
}

// delayedRescheduleInfo contains the allocation ID and a time when it's eligible to be rescheduled.
// This is used to create follow up evaluations.
type delayedRescheduleInfo struct {

	// allocID is the ID of the allocation eligible to be rescheduled
	allocID string

	// rescheduleTime is the time to use in the delayed evaluation
	rescheduleTime time.Time
}

func (r *reconcileResults) GoString() string {
	base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)",
		len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop))

	if r.deployment != nil {
		base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
	}
	for _, u := range r.deploymentUpdates {
		base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q",
			u.DeploymentID, u.Status, u.StatusDescription)
	}
	for tg, u := range r.desiredTGUpdates {
		base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u)
	}
	return base
}

// Changes returns the number of total changes
func (r *reconcileResults) Changes() int {
	return len(r.place) + len(r.inplaceUpdate) + len(r.stop)
}

// NewAllocReconciler creates a new reconciler that should be used to determine
// the changes required to bring the cluster state in line with the declared jobspec
func NewAllocReconciler(logger *log.Logger, allocUpdateFn allocUpdateType, batch bool,
	jobID string, job *structs.Job, deployment *structs.Deployment,
	existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string) *allocReconciler {
	return &allocReconciler{
		logger:         logger,
		allocUpdateFn:  allocUpdateFn,
		batch:          batch,
		jobID:          jobID,
		job:            job,
		deployment:     deployment.Copy(),
		existingAllocs: existingAllocs,
		taintedNodes:   taintedNodes,
		evalID:         evalID,
		now:            time.Now(),
		result: &reconcileResults{
			desiredTGUpdates:     make(map[string]*structs.DesiredUpdates),
			desiredFollowupEvals: make(map[string][]*structs.Evaluation),
		},
	}
}

// Compute reconciles the existing cluster state and returns the set of changes
// required to converge the job spec and state.
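// It builds an allocation matrix keyed by task group, cancels deployments that
// are no longer needed, short-circuits for stopped jobs, reconciles each task
// group, and marks the deployment successful once every group is complete.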
func (a *allocReconciler) Compute() *reconcileResults {
	// Create the allocation matrix
	m := newAllocMatrix(a.job, a.existingAllocs)

	// Handle stopping unneeded deployments
	a.cancelDeployments()

	// If we are just stopping a job we do not need to do anything more than
	// stopping all running allocs
	if a.job.Stopped() {
		a.handleStop(m)
		return a.result
	}

	// Detect if the deployment is paused
	if a.deployment != nil {
		a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
	}

	// Reconcile each group
	complete := true
	for group, as := range m {
		groupComplete := a.computeGroup(group, as)
		complete = complete && groupComplete
	}

	// Mark the deployment as complete if possible
	if a.deployment != nil && complete {
		a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
			DeploymentID:      a.deployment.ID,
			Status:            structs.DeploymentStatusSuccessful,
			StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
		})
	}

	// Set the description of a created deployment
	if d := a.result.deployment; d != nil {
		if d.RequiresPromotion() {
			d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
		}
	}

	return a.result
}

// cancelDeployments cancels any deployment that is not needed
func (a *allocReconciler) cancelDeployments() {
	// If the job is stopped and there is a non-terminal deployment, cancel it
	if a.job.Stopped() {
		if a.deployment != nil && a.deployment.Active() {
			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
				DeploymentID:      a.deployment.ID,
				Status:            structs.DeploymentStatusCancelled,
				StatusDescription: structs.DeploymentStatusDescriptionStoppedJob,
			})
		}

		// Nothing else to do
		a.oldDeployment = a.deployment
		a.deployment = nil
		return
	}

	d := a.deployment
	if d == nil {
		return
	}

	// Check if the deployment is active and referencing an older job and cancel it
	if d.JobCreateIndex != a.job.CreateIndex || d.JobVersion != a.job.Version {
		if d.Active() {
			a.result.deploymentUpdates = append(a.result.deploymentUpdates, &structs.DeploymentStatusUpdate{
				DeploymentID:      a.deployment.ID,
				Status:            structs.DeploymentStatusCancelled,
				StatusDescription: structs.DeploymentStatusDescriptionNewerJob,
			})
		}

		a.oldDeployment = d
		a.deployment = nil
	}

	// Clear it as the current deployment if it is successful
	if d.Status == structs.DeploymentStatusSuccessful {
		a.oldDeployment = d
		a.deployment = nil
	}
}

// handleStop marks all allocations to be stopped, handling the lost case
func (a *allocReconciler) handleStop(m allocMatrix) {
	for group, as := range m {
		untainted, migrate, lost := as.filterByTainted(a.taintedNodes)
		a.markStop(untainted, "", allocNotNeeded)
		a.markStop(migrate, "", allocNotNeeded)
		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
		desiredChanges := new(structs.DesiredUpdates)
		desiredChanges.Stop = uint64(len(as))
		a.result.desiredTGUpdates[group] = desiredChanges
	}
}

// markStop is a helper for marking a set of allocations for stop with a
// particular client status and description.
func (a *allocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
	for _, alloc := range allocs {
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			clientStatus:      clientStatus,
			statusDescription: statusDescription,
		})
	}
}

// computeGroup reconciles state for a particular task group. It returns whether
// the deployment it is for is complete with regards to the task group.
func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
	// Create the desired update object for the group
	desiredChanges := new(structs.DesiredUpdates)
	a.result.desiredTGUpdates[group] = desiredChanges

	// Get the task group. The task group may be nil if the job was updated such
	// that the task group no longer exists
	tg := a.job.LookupTaskGroup(group)

	// If the task group is nil, then the task group has been removed so all we
	// need to do is stop everything
	if tg == nil {
		untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
		a.markStop(untainted, "", allocNotNeeded)
		a.markStop(migrate, "", allocNotNeeded)
		a.markStop(lost, structs.AllocClientStatusLost, allocLost)
		desiredChanges.Stop = uint64(len(untainted) + len(migrate) + len(lost))
		return true
	}

	// Get the deployment state for the group
	var dstate *structs.DeploymentState
	existingDeployment := false
	if a.deployment != nil {
		dstate, existingDeployment = a.deployment.TaskGroups[group]
	}
	if !existingDeployment {
		dstate = &structs.DeploymentState{}
		if tg.Update != nil {
			dstate.AutoRevert = tg.Update.AutoRevert
			dstate.ProgressDeadline = tg.Update.ProgressDeadline
		}
	}

	// Filter allocations that do not need to be considered because they are
	// from an older job version and are terminal.
	all, ignore := a.filterOldTerminalAllocs(all)
	desiredChanges.Ignore += uint64(len(ignore))

	// canaries is the set of canaries for the current deployment and all is all
	// allocs including the canaries
	canaries, all := a.handleGroupCanaries(all, desiredChanges)

	// Determine what set of allocations are on tainted nodes
	untainted, migrate, lost := all.filterByTainted(a.taintedNodes)

	// Determine what set of terminal allocations need to be rescheduled
	untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, a.now, a.evalID, a.deployment)

	// Create batched follow up evaluations for allocations that are
	// reschedulable later and mark the allocations for in place updating
	a.handleDelayedReschedules(rescheduleLater, all, tg.Name)

	// Create a structure for choosing names. Seed with the taken names which is
	// the union of untainted and migrating nodes (includes canaries)
	nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow))

	// Stop any unneeded allocations and update the untainted set to not
	// include stopped allocations.
	canaryState := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
	stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, canaryState)
	desiredChanges.Stop += uint64(len(stop))
	untainted = untainted.difference(stop)

	// Do inplace upgrades where possible and capture the set of upgrades that
	// need to be done destructively.
	ignore, inplace, destructive := a.computeUpdates(tg, untainted)
	desiredChanges.Ignore += uint64(len(ignore))
	desiredChanges.InPlaceUpdate += uint64(len(inplace))
	if !existingDeployment {
		dstate.DesiredTotal += len(destructive) + len(inplace)
	}

	// Remove the canaries now that we have handled rescheduling so that we do
	// not consider them when making placement decisions.
	if canaryState {
		untainted = untainted.difference(canaries)
	}

	// Having destructive updates and fewer canaries than desired means we need
	// to create canaries
	numDestructive := len(destructive)
	strategy := tg.Update
	canariesPromoted := dstate != nil && dstate.Promoted
	requireCanary := numDestructive != 0 && strategy != nil && len(canaries) < strategy.Canary && !canariesPromoted
	if requireCanary && !a.deploymentPaused && !a.deploymentFailed {
		number := strategy.Canary - len(canaries)
		desiredChanges.Canary += uint64(number)
		if !existingDeployment {
			dstate.DesiredCanaries = strategy.Canary
		}

		for _, name := range nameIndex.NextCanaries(uint(number), canaries, destructive) {
			a.result.place = append(a.result.place, allocPlaceResult{
				name:      name,
				canary:    true,
				taskGroup: tg,
			})
		}
	}

	// Determine how many we can place
	canaryState = dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted
	limit := a.computeLimit(tg, untainted, destructive, migrate, canaryState)

	// Place if:
	// * The deployment is not paused or failed
	// * Not placing any canaries
	// * If there are any canaries, they have been promoted
	place := a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow)
	if !existingDeployment {
		dstate.DesiredTotal += len(place)
	}

	// deploymentPlaceReady tracks whether the deployment is in a state where
	// placements can be made without any other consideration.
	deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !canaryState

	if deploymentPlaceReady {
		desiredChanges.Place += uint64(len(place))
		for _, p := range place {
			a.result.place = append(a.result.place, p)
		}

		min := helper.IntMin(len(place), limit)
		limit -= min
	} else if !deploymentPlaceReady {
		// We do not want to place additional allocations but in the case we
		// have lost allocations or allocations that require rescheduling now,
		// we do so regardless to avoid odd user experiences.
		if len(lost) != 0 {
			allowed := helper.IntMin(len(lost), len(place))
			desiredChanges.Place += uint64(allowed)
			for _, p := range place[:allowed] {
				a.result.place = append(a.result.place, p)
			}
		}

		// Handle rescheduling of failed allocations even if the deployment is
		// failed. We do not reschedule if the allocation is part of the failed
		// deployment.
		if now := len(rescheduleNow); now != 0 {
			for _, p := range place {
				prev := p.PreviousAllocation()
				if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
					a.result.place = append(a.result.place, p)
					desiredChanges.Place++
				}
			}
		}
	}

	if deploymentPlaceReady {
		// Do all destructive updates
		min := helper.IntMin(len(destructive), limit)
		desiredChanges.DestructiveUpdate += uint64(min)
		desiredChanges.Ignore += uint64(len(destructive) - min)
		for _, alloc := range destructive.nameOrder()[:min] {
			a.result.destructiveUpdate = append(a.result.destructiveUpdate, allocDestructiveResult{
				placeName:             alloc.Name,
				placeTaskGroup:        tg,
				stopAlloc:             alloc,
				stopStatusDescription: allocUpdating,
			})
		}
	} else {
		desiredChanges.Ignore += uint64(len(destructive))
	}

	// Calculate the allowed number of changes and set the desired changes
	// accordingly.
	if !a.deploymentFailed && !a.deploymentPaused {
		desiredChanges.Migrate += uint64(len(migrate))
	} else {
		desiredChanges.Stop += uint64(len(migrate))
	}

	for _, alloc := range migrate.nameOrder() {
		// If the deployment is failed or paused, don't replace it, just mark as stop.
		if a.deploymentFailed || a.deploymentPaused {
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             alloc,
				statusDescription: allocNodeTainted,
			})
			continue
		}

		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			statusDescription: allocMigrating,
		})
		a.result.place = append(a.result.place, allocPlaceResult{
			name:          alloc.Name,
			canary:        false,
			taskGroup:     tg,
			previousAlloc: alloc,
		})
	}

	// Create new deployment if:
	// 1. Updating a job specification
	// 2. No running allocations (first time running a job)
	updatingSpec := len(destructive) != 0 || len(a.result.inplaceUpdate) != 0
	hadRunning := false
	for _, alloc := range all {
		if alloc.Job.Version == a.job.Version && alloc.Job.CreateIndex == a.job.CreateIndex {
			hadRunning = true
			break
		}
	}

	// Create a new deployment if necessary
	if !existingDeployment && strategy != nil && dstate.DesiredTotal != 0 && (!hadRunning || updatingSpec) {
		// A previous group may have made the deployment already
		if a.deployment == nil {
			a.deployment = structs.NewDeployment(a.job)
			a.result.deployment = a.deployment
		}

		// Attach the group's deployment state to the deployment
		a.deployment.TaskGroups[group] = dstate
	}

	// deploymentComplete is whether the deployment is complete which largely
	// means that no placements were made or desired to be made
	deploymentComplete := len(destructive)+len(inplace)+len(place)+len(migrate)+len(rescheduleNow)+len(rescheduleLater) == 0 && !requireCanary

	// The final check that the deployment is complete is to ensure everything
	// is healthy
	if deploymentComplete && a.deployment != nil {
		if dstate, ok := a.deployment.TaskGroups[group]; ok {
			if dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
				(dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
				deploymentComplete = false
			}
		}
	}

	return deploymentComplete
}

// filterOldTerminalAllocs filters allocations that should be ignored since they
// are allocations that are terminal from a previous job version.
func (a *allocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) {
	if !a.batch {
		return all, nil
	}

	filtered = filtered.union(all)
	ignored := make(map[string]*structs.Allocation)

	// Ignore terminal batch jobs from older versions
	for id, alloc := range filtered {
		older := alloc.Job.Version < a.job.Version || alloc.Job.CreateIndex < a.job.CreateIndex
		if older && alloc.TerminalStatus() {
			delete(filtered, id)
			ignored[id] = alloc
		}
	}

	return filtered, ignored
}

// handleGroupCanaries handles the canaries for the group by stopping the
// unneeded ones and returning the current set of canaries and the updated total
// set of allocs for the group
func (a *allocReconciler) handleGroupCanaries(all allocSet, desiredChanges *structs.DesiredUpdates) (canaries, newAll allocSet) {
	// Stop any canary from an older deployment or from a failed one
	var stop []string

	// Cancel any non-promoted canaries from the older deployment
	if a.oldDeployment != nil {
		for _, s := range a.oldDeployment.TaskGroups {
			if !s.Promoted {
				stop = append(stop, s.PlacedCanaries...)
			}
		}
	}

	// Cancel any non-promoted canaries from a failed deployment
	if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusFailed {
		for _, s := range a.deployment.TaskGroups {
			if !s.Promoted {
				stop = append(stop, s.PlacedCanaries...)
			}
		}
	}

	// stopSet is the allocSet that contains the canaries we desire to stop from
	// above.
	stopSet := all.fromKeys(stop)
	a.markStop(stopSet, "", allocNotNeeded)
	desiredChanges.Stop += uint64(len(stopSet))
	all = all.difference(stopSet)

	// Capture our current set of canaries and handle any migrations that are
	// needed by just stopping them.
	if a.deployment != nil {
		var canaryIDs []string
		for _, s := range a.deployment.TaskGroups {
			canaryIDs = append(canaryIDs, s.PlacedCanaries...)
		}

		canaries = all.fromKeys(canaryIDs)
		untainted, migrate, lost := canaries.filterByTainted(a.taintedNodes)
		a.markStop(migrate, "", allocMigrating)
		a.markStop(lost, structs.AllocClientStatusLost, allocLost)

		canaries = untainted
		all = all.difference(migrate, lost)
	}

	return canaries, all
}

// computeLimit returns the placement limit for a particular group. The inputs
// are the group definition, the untainted, destructive, and migrate allocation
// sets, and whether we are in a canary state.
func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, destructive, migrate allocSet, canaryState bool) int {
	// If there is no update strategy or deployment for the group we can deploy
	// as many as the group has
	if group.Update == nil || len(destructive)+len(migrate) == 0 {
		return group.Count
	} else if a.deploymentPaused || a.deploymentFailed {
		// If the deployment is paused or failed, do not create anything else
		return 0
	}

	// If we have canaries and they have not been promoted the limit is 0
	if canaryState {
		return 0
	}

	// If we have been promoted or there are no canaries, the limit is the
	// configured MaxParallel minus any outstanding non-healthy alloc for the
	// deployment
	limit := group.Update.MaxParallel
	if a.deployment != nil {
		partOf, _ := untainted.filterByDeployment(a.deployment.ID)
		for _, alloc := range partOf {
			// An unhealthy allocation means nothing else should happen.
			if alloc.DeploymentStatus.IsUnhealthy() {
				return 0
			}

			if !alloc.DeploymentStatus.IsHealthy() {
				limit--
			}
		}
	}

	// The limit can be less than zero in the case that the job was changed such
	// that it required destructive changes and the count was scaled up.
	if limit < 0 {
		return 0
	}

	return limit
}

// computePlacements returns the set of allocations to place given the group
// definition and the sets of untainted, migrating and rescheduling allocations for the group.
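// Rescheduled allocations keep a reference to their previous allocation and
// reuse its name; any remaining slots up to the group count are then filled
// with fresh names from the name index.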
func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
	nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult {

	// Add rescheduled placement results
	var place []allocPlaceResult
	for _, alloc := range reschedule {
		place = append(place, allocPlaceResult{
			name:          alloc.Name,
			taskGroup:     group,
			previousAlloc: alloc,
			reschedule:    true,
			canary:        alloc.DeploymentStatus.IsCanary(),
		})
	}

	// Hot path the nothing to do case
	existing := len(untainted) + len(migrate) + len(reschedule)
	if existing >= group.Count {
		return place
	}

	// Add remaining placement results
	if existing < group.Count {
		for _, name := range nameIndex.Next(uint(group.Count - existing)) {
			place = append(place, allocPlaceResult{
				name:      name,
				taskGroup: group,
			})
		}
	}

	return place
}

// computeStop returns the set of allocations that are marked for stopping given
// the group definition, the set of allocations in various states and whether we
// are canarying.
func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *allocNameIndex,
	untainted, migrate, lost, canaries allocSet, canaryState bool) allocSet {

	// Mark all lost allocations for stop. Previous allocation doesn't matter
	// here since it is on a lost node
	var stop allocSet
	stop = stop.union(lost)
	a.markStop(lost, structs.AllocClientStatusLost, allocLost)

	// If we are still deploying or creating canaries, don't stop them
	if canaryState {
		untainted = untainted.difference(canaries)
	}

	// Hot path the nothing to do case
	remove := len(untainted) + len(migrate) - group.Count
	if remove <= 0 {
		return stop
	}

	// Filter out any terminal allocations from the untainted set
	// This is so that we don't try to mark them as stopped redundantly
	untainted = filterByTerminal(untainted)

	// Prefer stopping any alloc that has the same name as the canaries if we
	// are promoted
	if !canaryState && len(canaries) != 0 {
		canaryNames := canaries.nameSet()
		for id, alloc := range untainted.difference(canaries) {
			if _, match := canaryNames[alloc.Name]; match {
				stop[id] = alloc
				a.result.stop = append(a.result.stop, allocStopResult{
					alloc:             alloc,
					statusDescription: allocNotNeeded,
				})
				delete(untainted, id)

				remove--
				if remove == 0 {
					return stop
				}
			}
		}
	}

	// Prefer selecting from the migrating set before stopping existing allocs
	if len(migrate) != 0 {
		mNames := newAllocNameIndex(a.jobID, group.Name, group.Count, migrate)
		removeNames := mNames.Highest(uint(remove))
		for id, alloc := range migrate {
			if _, match := removeNames[alloc.Name]; !match {
				continue
			}
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             alloc,
				statusDescription: allocNotNeeded,
			})
			delete(migrate, id)
			stop[id] = alloc
			nameIndex.UnsetIndex(alloc.Index())

			remove--
			if remove == 0 {
				return stop
			}
		}
	}

	// Select the allocs with the highest count to remove
	removeNames := nameIndex.Highest(uint(remove))
	for id, alloc := range untainted {
		if _, ok := removeNames[alloc.Name]; ok {
			stop[id] = alloc
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             alloc,
				statusDescription: allocNotNeeded,
			})
			delete(untainted, id)

			remove--
			if remove == 0 {
				return stop
			}
		}
	}

	// It is possible that we didn't stop as many as we should have if there
	// were allocations with duplicate names.
	for id, alloc := range untainted {
		stop[id] = alloc
		a.result.stop = append(a.result.stop, allocStopResult{
			alloc:             alloc,
			statusDescription: allocNotNeeded,
		})
		delete(untainted, id)

		remove--
		if remove == 0 {
			return stop
		}
	}

	return stop
}

// computeUpdates determines which allocations for the passed group require
// updates. Three groups are returned:
// 1. Those that require no upgrades
// 2. Those that can be upgraded in-place. These are added to the results
//    automatically since the function contains the correct state to do so.
// 3. Those that require destructive updates
func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted allocSet) (ignore, inplace, destructive allocSet) {
	// Determine the set of allocations that need to be updated
	ignore = make(map[string]*structs.Allocation)
	inplace = make(map[string]*structs.Allocation)
	destructive = make(map[string]*structs.Allocation)

	for _, alloc := range untainted {
		ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.job, group)
		if ignoreChange {
			ignore[alloc.ID] = alloc
		} else if destructiveChange {
			destructive[alloc.ID] = alloc
		} else {
			inplace[alloc.ID] = alloc
			a.result.inplaceUpdate = append(a.result.inplaceUpdate, inplaceAlloc)
		}
	}

	return
}

// handleDelayedReschedules creates batched followup evaluations with the WaitUntil field set
// for allocations that are eligible to be rescheduled later
func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
	if len(rescheduleLater) == 0 {
		return
	}

	// Sort by time
	sort.Slice(rescheduleLater, func(i, j int) bool {
		return rescheduleLater[i].rescheduleTime.Before(rescheduleLater[j].rescheduleTime)
	})

	var evals []*structs.Evaluation
	nextReschedTime := rescheduleLater[0].rescheduleTime
	allocIDToFollowupEvalID := make(map[string]string, len(rescheduleLater))

	// Create a new eval for the first batch
	eval := &structs.Evaluation{
		ID:                uuid.Generate(),
		Namespace:         a.job.Namespace,
		Priority:          a.job.Priority,
		Type:              a.job.Type,
		TriggeredBy:       structs.EvalTriggerRetryFailedAlloc,
		JobID:             a.job.ID,
		JobModifyIndex:    a.job.ModifyIndex,
		Status:            structs.EvalStatusPending,
		StatusDescription: reschedulingFollowupEvalDesc,
		WaitUntil:         nextReschedTime,
	}
	evals = append(evals, eval)

	for _, allocReschedInfo := range rescheduleLater {
		if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
		} else {
			// Start a new batch
			nextReschedTime = allocReschedInfo.rescheduleTime
			// Create a new eval for the new batch
			eval = &structs.Evaluation{
				ID:             uuid.Generate(),
				Namespace:      a.job.Namespace,
				Priority:       a.job.Priority,
				Type:           a.job.Type,
				TriggeredBy:    structs.EvalTriggerRetryFailedAlloc,
				JobID:          a.job.ID,
				JobModifyIndex: a.job.ModifyIndex,
				Status:         structs.EvalStatusPending,
				WaitUntil:      nextReschedTime,
			}
			evals = append(evals, eval)
			// Set the evalID for the first alloc in this new batch
			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
		}
	}

	a.result.desiredFollowupEvals[tgName] = evals

	// Initialize the annotations
	if len(allocIDToFollowupEvalID) != 0 && a.result.attributeUpdates == nil {
		a.result.attributeUpdates = make(map[string]*structs.Allocation)
	}

	// Create in-place updates for every alloc ID that needs to be updated with its follow up eval ID
	for allocID, evalID := range allocIDToFollowupEvalID {
		existingAlloc := all[allocID]
		updatedAlloc := existingAlloc.Copy()
		updatedAlloc.FollowupEvalID = evalID
		a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
	}
}
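
// exampleReconcile is a minimal usage sketch and is not part of the upstream
// file: it assumes a caller that already holds the job, its current deployment,
// the existing allocations, and the set of tainted nodes, and it stubs the
// alloc update function to treat every allocation as unchanged. Real callers
// supply an allocUpdateType that compares the existing allocation against the
// new job and task group.
func exampleReconcile(logger *log.Logger, job *structs.Job, deployment *structs.Deployment,
	allocs []*structs.Allocation, tainted map[string]*structs.Node) *reconcileResults {

	// Stub update function for illustration only: every allocation ignores the change.
	ignoreAll := func(existing *structs.Allocation, newJob *structs.Job,
		newTG *structs.TaskGroup) (bool, bool, *structs.Allocation) {
		return true, false, nil
	}

	// batch=false marks this as a service job; the eval ID would normally come
	// from the evaluation that triggered reconciliation.
	r := NewAllocReconciler(logger, ignoreAll, false, job.ID, job, deployment,
		allocs, tainted, uuid.Generate())
	results := r.Compute()
	logger.Printf("[DEBUG] reconciler: %s", results.GoString())
	return results
}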