github.com/adityamillind98/nomad@v0.11.8/scheduler/util.go

package scheduler

import (
	"fmt"
	"math/rand"
	"reflect"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/structs"
)

// allocTuple is a tuple of the allocation name and potential alloc ID
type allocTuple struct {
	Name      string
	TaskGroup *structs.TaskGroup
	Alloc     *structs.Allocation
}

// materializeTaskGroups is used to materialize all the task groups
// a job requires. This is used to do the count expansion.
func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
	out := make(map[string]*structs.TaskGroup)
	if job.Stopped() {
		return out
	}

	for _, tg := range job.TaskGroups {
		for i := 0; i < tg.Count; i++ {
			name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
			out[name] = tg
		}
	}
	return out
}

// diffResult is used to return the sets that result from the diff
type diffResult struct {
	place, update, migrate, stop, ignore, lost []allocTuple
}

func (d *diffResult) GoString() string {
	return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d) (lost %d)",
		len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore), len(d.lost))
}

func (d *diffResult) Append(other *diffResult) {
	d.place = append(d.place, other.place...)
	d.update = append(d.update, other.update...)
	d.migrate = append(d.migrate, other.migrate...)
	d.stop = append(d.stop, other.stop...)
	d.ignore = append(d.ignore, other.ignore...)
	d.lost = append(d.lost, other.lost...)
}

// diffSystemAllocsForNode is used to do a set difference between the target
// allocations and the existing allocations for a particular node. This returns
// six sets of results: the list of named task groups that need to be placed
// (no existing allocation), the allocations that need to be updated (job
// definition is newer), allocs that need to be migrated (node is draining),
// the allocs that need to be evicted (no longer required), those that should
// be ignored, and those that are lost and need to be replaced (running on a
// lost node).
//
// job is the job whose allocs are going to be diffed.
// taintedNodes is an index of the nodes which are either down or in drain mode
// by name.
// required is a set of allocations that must exist.
// allocs is a list of non-terminal allocations.
// terminalAllocs is an index of the latest terminal allocations by name.
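//
// As an illustration of the naming scheme: a job "cache" with a task group
// "redis" of count 2 is materialized by materializeTaskGroups above as the
// names "cache.redis[0]" and "cache.redis[1]", and this diff then decides,
// per node, which of the six sets each name falls into.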
func diffSystemAllocsForNode(job *structs.Job, nodeID string,
	eligibleNodes, taintedNodes map[string]*structs.Node,
	required map[string]*structs.TaskGroup, allocs []*structs.Allocation,
	terminalAllocs map[string]*structs.Allocation) *diffResult {
	result := &diffResult{}

	// Scan the existing allocations
	existing := make(map[string]struct{})
	for _, exist := range allocs {
		// Index the existing allocation by name
		name := exist.Name
		existing[name] = struct{}{}

		// Check for the definition in the required set
		tg, ok := required[name]

		// If not required, we stop the alloc
		if !ok {
			result.stop = append(result.stop, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If we have been marked for migration and aren't terminal, migrate
		if !exist.TerminalStatus() && exist.DesiredTransition.ShouldMigrate() {
			result.migrate = append(result.migrate, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If we are on a tainted node, we must migrate if we are a service or
		// if the batch allocation did not finish
		if node, ok := taintedNodes[exist.NodeID]; ok {
			// If the job is batch and finished successfully, the fact that the
			// node is tainted does not mean it should be migrated or marked as
			// lost as the work was already successfully finished. However for
			// service/system jobs, tasks should never complete. The check of
			// batch type defends against client bugs.
			if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() {
				goto IGNORE
			}

			if !exist.TerminalStatus() && (node == nil || node.TerminalStatus()) {
				result.lost = append(result.lost, allocTuple{
					Name:      name,
					TaskGroup: tg,
					Alloc:     exist,
				})
			} else {
				goto IGNORE
			}

			continue
		}

		// For an existing allocation, if the nodeID is no longer
		// eligible, the diff should be ignored
		if _, ok := eligibleNodes[nodeID]; !ok {
			goto IGNORE
		}

		// If the definition is updated we need to update
		if job.JobModifyIndex != exist.Job.JobModifyIndex {
			result.update = append(result.update, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// Everything is up-to-date
	IGNORE:
		result.ignore = append(result.ignore, allocTuple{
			Name:      name,
			TaskGroup: tg,
			Alloc:     exist,
		})
	}

	// Scan the required groups
	for name, tg := range required {
		// Check for an existing allocation
		_, ok := existing[name]

		// Require a placement if no existing allocation. If there is an
		// existing allocation, we would have checked for a potential update
		// or ignore above.
		// Ignore placements for tainted or ineligible nodes.
		if !ok {
			// Tainted and ineligible nodes for a non-existing alloc
			// should be filtered out and not count towards ignore or place
			if _, tainted := taintedNodes[nodeID]; tainted {
				continue
			}
			if _, eligible := eligibleNodes[nodeID]; !eligible {
				continue
			}

			allocTuple := allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     terminalAllocs[name],
			}

			// If the new allocation isn't annotated with a previous allocation
			// or if the previous allocation isn't from the same node then we
			// annotate the allocTuple with a new Allocation
			if allocTuple.Alloc == nil || allocTuple.Alloc.NodeID != nodeID {
				allocTuple.Alloc = &structs.Allocation{NodeID: nodeID}
			}
			result.place = append(result.place, allocTuple)
		}
	}
	return result
}

// diffSystemAllocs is like diffSystemAllocsForNode; however, the allocations
// in the diffResult contain the specific nodeID they should be allocated on.
//
// job is the job whose allocs are going to be diffed.
// nodes is a list of nodes in ready state.
// taintedNodes is an index of the nodes which are either down or in drain mode
// by name.
// allocs is a list of non-terminal allocations.
// terminalAllocs is an index of the latest terminal allocations by name.
func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]*structs.Node,
	allocs []*structs.Allocation, terminalAllocs map[string]*structs.Allocation) *diffResult {

	// Build a mapping of nodes to all their allocs.
	nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
	for _, alloc := range allocs {
		nallocs := append(nodeAllocs[alloc.NodeID], alloc)
		nodeAllocs[alloc.NodeID] = nallocs
	}

	eligibleNodes := make(map[string]*structs.Node)
	for _, node := range nodes {
		if _, ok := nodeAllocs[node.ID]; !ok {
			nodeAllocs[node.ID] = nil
		}
		eligibleNodes[node.ID] = node
	}

	// Create the required task groups.
	required := materializeTaskGroups(job)

	result := &diffResult{}
	for nodeID, allocs := range nodeAllocs {
		diff := diffSystemAllocsForNode(job, nodeID, eligibleNodes, taintedNodes, required, allocs, terminalAllocs)
		result.Append(diff)
	}

	return result
}

// readyNodesInDCs returns all the ready nodes in the given datacenters and a
// mapping of each datacenter to the count of ready nodes.
func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) {
	// Index the DCs
	dcMap := make(map[string]int, len(dcs))
	for _, dc := range dcs {
		dcMap[dc] = 0
	}

	// Scan the nodes
	ws := memdb.NewWatchSet()
	var out []*structs.Node
	iter, err := state.Nodes(ws)
	if err != nil {
		return nil, nil, err
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		// Filter on datacenter and status
		node := raw.(*structs.Node)
		if node.Status != structs.NodeStatusReady {
			continue
		}
		if node.Drain {
			continue
		}
		if node.SchedulingEligibility != structs.NodeSchedulingEligible {
			continue
		}
		if _, ok := dcMap[node.Datacenter]; !ok {
			continue
		}
		out = append(out, node)
		dcMap[node.Datacenter]++
	}
	return out, dcMap, nil
}

// retryMax is used to retry a callback until it returns success or
// a maximum number of attempts is reached.
// An optional reset function may be passed, which is called after each failed
// iteration. If the reset function is set and returns true, the attempt
// counter is reset, allowing up to max further attempts.
func retryMax(max int, cb func() (bool, error), reset func() bool) error {
	attempts := 0
	for attempts < max {
		done, err := cb()
		if err != nil {
			return err
		}
		if done {
			return nil
		}

		// Check if we should reset the number of attempts
		if reset != nil && reset() {
			attempts = 0
		} else {
			attempts++
		}
	}
	return &SetStatusError{
		Err:        fmt.Errorf("maximum attempts reached (%d)", max),
		EvalStatus: structs.EvalStatusFailed,
	}
}

// progressMade checks to see if the plan result made allocations or updates.
// If the result is nil, false is returned.
func progressMade(result *structs.PlanResult) bool {
	return result != nil && (len(result.NodeUpdate) != 0 ||
		len(result.NodeAllocation) != 0 || result.Deployment != nil ||
		len(result.DeploymentUpdates) != 0)
}

// taintedNodes is used to scan the allocations and then check if the
// underlying nodes are tainted and should force a migration of the allocation.
// All the nodes returned in the map are tainted.
func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*structs.Node, error) {
	out := make(map[string]*structs.Node)
	for _, alloc := range allocs {
		if _, ok := out[alloc.NodeID]; ok {
			continue
		}

		ws := memdb.NewWatchSet()
		node, err := state.NodeByID(ws, alloc.NodeID)
		if err != nil {
			return nil, err
		}

		// If the node does not exist, we should migrate
		if node == nil {
			out[alloc.NodeID] = nil
			continue
		}
		if structs.ShouldDrainNode(node.Status) || node.Drain {
			out[alloc.NodeID] = node
		}
	}
	return out, nil
}

// shuffleNodes randomizes the slice order with the Fisher-Yates algorithm
func shuffleNodes(nodes []*structs.Node) {
	n := len(nodes)
	for i := n - 1; i > 0; i-- {
		j := rand.Intn(i + 1)
		nodes[i], nodes[j] = nodes[j], nodes[i]
	}
}

// tasksUpdated does a diff between task groups to see if the tasks, their
// drivers, environment variables or config have been updated. The inputs are
// the task group name to diff and the two jobs to diff.
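// For example, raising a task's MemoryMB, changing its driver, or changing a
// reserved port all count as updates, since each of those fields is compared
// below.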
// tasksUpdated and the functions it calls assume that the given taskGroup has
// already been checked to not be nil.
func tasksUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
	a := jobA.LookupTaskGroup(taskGroup)
	b := jobB.LookupTaskGroup(taskGroup)

	// If the number of tasks does not match, clearly there is an update
	if len(a.Tasks) != len(b.Tasks) {
		return true
	}

	// Check ephemeral disk
	if !reflect.DeepEqual(a.EphemeralDisk, b.EphemeralDisk) {
		return true
	}

	// Check that the network resources haven't changed
	if networkUpdated(a.Networks, b.Networks) {
		return true
	}

	// Check Affinities
	if affinitiesUpdated(jobA, jobB, taskGroup) {
		return true
	}

	// Check Spreads
	if spreadsUpdated(jobA, jobB, taskGroup) {
		return true
	}

	// Check each task
	for _, at := range a.Tasks {
		bt := b.LookupTask(at.Name)
		if bt == nil {
			return true
		}
		if at.Driver != bt.Driver {
			return true
		}
		if at.User != bt.User {
			return true
		}
		if !reflect.DeepEqual(at.Config, bt.Config) {
			return true
		}
		if !reflect.DeepEqual(at.Env, bt.Env) {
			return true
		}
		if !reflect.DeepEqual(at.Artifacts, bt.Artifacts) {
			return true
		}
		if !reflect.DeepEqual(at.Vault, bt.Vault) {
			return true
		}
		if !reflect.DeepEqual(at.Templates, bt.Templates) {
			return true
		}

		// Check the metadata
		if !reflect.DeepEqual(
			jobA.CombinedTaskMeta(taskGroup, at.Name),
			jobB.CombinedTaskMeta(taskGroup, bt.Name)) {
			return true
		}

		// Inspect the network to see if the dynamic ports are different
		if networkUpdated(at.Resources.Networks, bt.Resources.Networks) {
			return true
		}

		// Inspect the non-network resources
		if ar, br := at.Resources, bt.Resources; ar.CPU != br.CPU {
			return true
		} else if ar.MemoryMB != br.MemoryMB {
			return true
		} else if !ar.Devices.Equals(&br.Devices) {
			return true
		}
	}
	return false
}

func networkUpdated(netA, netB []*structs.NetworkResource) bool {
	if len(netA) != len(netB) {
		return true
	}
	for idx := range netA {
		an := netA[idx]
		bn := netB[idx]

		if an.Mode != bn.Mode {
			return true
		}

		if an.MBits != bn.MBits {
			return true
		}

		aPorts, bPorts := networkPortMap(an), networkPortMap(bn)
		if !reflect.DeepEqual(aPorts, bPorts) {
			return true
		}
	}
	return false
}

// networkPortMap takes a network resource and returns a map of port labels to
// values. The value for dynamic ports is disregarded even if it is set. This
// makes this function suitable for comparing two network resources for changes.
func networkPortMap(n *structs.NetworkResource) map[string]int {
	m := make(map[string]int, len(n.DynamicPorts)+len(n.ReservedPorts))
	for _, p := range n.ReservedPorts {
		m[p.Label] = p.Value
	}
	for _, p := range n.DynamicPorts {
		m[p.Label] = -1
	}
	return m
}

func affinitiesUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
	var aAffinities []*structs.Affinity
	var bAffinities []*structs.Affinity

	tgA := jobA.LookupTaskGroup(taskGroup)
	tgB := jobB.LookupTaskGroup(taskGroup)

	// Append jobA's job-level and task group level affinities
	aAffinities = append(aAffinities, jobA.Affinities...)
	aAffinities = append(aAffinities, tgA.Affinities...)

	// Append jobB's job-level and task group level affinities
	bAffinities = append(bAffinities, jobB.Affinities...)
	bAffinities = append(bAffinities, tgB.Affinities...)

	// Append task affinities
	for _, task := range tgA.Tasks {
		aAffinities = append(aAffinities, task.Affinities...)
	}

	for _, task := range tgB.Tasks {
		bAffinities = append(bAffinities, task.Affinities...)
	}

	// Check for equality
	if len(aAffinities) != len(bAffinities) {
		return true
	}

	return !reflect.DeepEqual(aAffinities, bAffinities)
}

func spreadsUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
	var aSpreads []*structs.Spread
	var bSpreads []*structs.Spread

	tgA := jobA.LookupTaskGroup(taskGroup)
	tgB := jobB.LookupTaskGroup(taskGroup)

	// Append jobA's job-level and task group level spreads
	aSpreads = append(aSpreads, jobA.Spreads...)
	aSpreads = append(aSpreads, tgA.Spreads...)

	// Append jobB's job-level and task group level spreads
	bSpreads = append(bSpreads, jobB.Spreads...)
	bSpreads = append(bSpreads, tgB.Spreads...)

	// Check for equality
	if len(aSpreads) != len(bSpreads) {
		return true
	}

	return !reflect.DeepEqual(aSpreads, bSpreads)
}

// setStatus is used to update the status of the evaluation
func setStatus(logger log.Logger, planner Planner,
	eval, nextEval, spawnedBlocked *structs.Evaluation,
	tgMetrics map[string]*structs.AllocMetric, status, desc string,
	queuedAllocs map[string]int, deploymentID string) error {

	logger.Debug("setting eval status", "status", status)
	newEval := eval.Copy()
	newEval.Status = status
	newEval.StatusDescription = desc
	newEval.DeploymentID = deploymentID
	newEval.FailedTGAllocs = tgMetrics
	if nextEval != nil {
		newEval.NextEval = nextEval.ID
	}
	if spawnedBlocked != nil {
		newEval.BlockedEval = spawnedBlocked.ID
	}
	if queuedAllocs != nil {
		newEval.QueuedAllocations = queuedAllocs
	}

	return planner.UpdateEval(newEval)
}

// inplaceUpdate attempts to update allocations in-place where possible. It
// returns the allocs that couldn't be updated in place, followed by those
// that could.
func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
	stack Stack, updates []allocTuple) (destructive, inplace []allocTuple) {

	// doInplace manipulates the updates slice to make the current allocation
	// an in-place update.
	doInplace := func(cur, last, inplaceCount *int) {
		updates[*cur], updates[*last-1] = updates[*last-1], updates[*cur]
		*cur--
		*last--
		*inplaceCount++
	}

	ws := memdb.NewWatchSet()
	n := len(updates)
	inplaceCount := 0
	for i := 0; i < n; i++ {
		// Get the update
		update := updates[i]

		// Check if the task drivers or config have changed; if so, this
		// requires a rolling upgrade since that cannot be done in-place.
		existing := update.Alloc.Job
		if tasksUpdated(job, existing, update.TaskGroup.Name) {
			continue
		}

		// Terminal batch allocations are not filtered when they are completed
		// successfully. We should avoid adding the allocation to the plan in
		// the case that it is an in-place update to avoid both additional data
		// in the plan and work for the clients.
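		// Note: doInplace (defined above) swaps the matched tuple to the tail
		// of the slice and shrinks the window [0, n), so updates[:n] remains
		// the destructive set and updates[n:] accumulates the in-place
		// updates returned at the end of this function.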
		if update.Alloc.TerminalStatus() {
			doInplace(&i, &n, &inplaceCount)
			continue
		}

		// Get the existing node
		node, err := ctx.State().NodeByID(ws, update.Alloc.NodeID)
		if err != nil {
			ctx.Logger().Error("failed to get node", "node_id", update.Alloc.NodeID, "error", err)
			continue
		}
		if node == nil {
			continue
		}

		// Set the existing node as the base set
		stack.SetNodes([]*structs.Node{node})

		// Stage an eviction of the current allocation. This is done so that
		// the current allocation is discounted when checking for feasibility.
		// Otherwise we would be trying to fit the task's current resources and
		// updated resources. After select is called we can remove the evict.
		ctx.Plan().AppendStoppedAlloc(update.Alloc, allocInPlace, "", "")

		// Attempt to match the task group
		option := stack.Select(update.TaskGroup, nil) // This select only looks at one node so we don't pass selectOptions

		// Pop the allocation
		ctx.Plan().PopUpdate(update.Alloc)

		// Skip if we could not do an in-place update
		if option == nil {
			continue
		}

		// Restore the network and device offers from the existing allocation.
		// We do not allow network resources (reserved/dynamic ports)
		// to be updated. This is guarded in tasksUpdated, so we can
		// safely restore those here.
		for task, resources := range option.TaskResources {
			var networks structs.Networks
			var devices []*structs.AllocatedDeviceResource
			if update.Alloc.AllocatedResources != nil {
				if tr, ok := update.Alloc.AllocatedResources.Tasks[task]; ok {
					networks = tr.Networks
					devices = tr.Devices
				}
			} else if tr, ok := update.Alloc.TaskResources[task]; ok {
				networks = tr.Networks
			}

			// Add the networks and devices back
			resources.Networks = networks
			resources.Devices = devices
		}

		// Create a shallow copy
		newAlloc := new(structs.Allocation)
		*newAlloc = *update.Alloc

		// Update the allocation
		newAlloc.EvalID = eval.ID
		newAlloc.Job = nil       // Use the Job in the Plan
		newAlloc.Resources = nil // Computed in Plan Apply
		newAlloc.AllocatedResources = &structs.AllocatedResources{
			Tasks:          option.TaskResources,
			TaskLifecycles: option.TaskLifecycles,
			Shared: structs.AllocatedSharedResources{
				DiskMB: int64(update.TaskGroup.EphemeralDisk.SizeMB),
			},
		}
		newAlloc.Metrics = ctx.Metrics()
		ctx.Plan().AppendAlloc(newAlloc)

		// Remove this allocation from the slice
		doInplace(&i, &n, &inplaceCount)
	}

	if len(updates) > 0 {
		ctx.Logger().Debug("made in-place updates", "in-place", inplaceCount, "total_updates", len(updates))
	}
	return updates[:n], updates[n:]
}

// evictAndPlace is used to mark allocations for eviction and add them to the
// placement queue. evictAndPlace modifies both the diffResult and the
// limit. It returns true if the limit has been reached.
func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
	n := len(allocs)
	for i := 0; i < n && i < *limit; i++ {
		a := allocs[i]
		ctx.Plan().AppendStoppedAlloc(a.Alloc, desc, "", "")
		diff.place = append(diff.place, a)
	}
	if n <= *limit {
		*limit -= n
		return false
	}
	*limit = 0
	return true
}

// tgConstrainTuple is used to store the total constraints of a task group.
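// For example, a group with constraints [c1] whose two tasks contribute
// constraints [c2] and [c3] and use the "docker" and "exec" drivers
// aggregates to constraints [c1, c2, c3] and drivers {"docker", "exec"} via
// taskGroupConstraints below.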
type tgConstrainTuple struct {
	// Holds the combined constraints of the task group and all its sub-tasks.
	constraints []*structs.Constraint

	// The set of required drivers within the task group.
	drivers map[string]struct{}
}

// taskGroupConstraints collects the constraints, drivers and resources required by each
// sub-task to aggregate the TaskGroup totals
func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple {
	c := tgConstrainTuple{
		constraints: make([]*structs.Constraint, 0, len(tg.Constraints)),
		drivers:     make(map[string]struct{}),
	}

	c.constraints = append(c.constraints, tg.Constraints...)
	for _, task := range tg.Tasks {
		c.drivers[task.Driver] = struct{}{}
		c.constraints = append(c.constraints, task.Constraints...)
	}

	return c
}

// desiredUpdates takes the diffResult as well as the set of inplace and
// destructive updates and returns a map of task groups to their set of desired
// updates.
func desiredUpdates(diff *diffResult, inplaceUpdates,
	destructiveUpdates []allocTuple) map[string]*structs.DesiredUpdates {
	desiredTgs := make(map[string]*structs.DesiredUpdates)

	for _, tuple := range diff.place {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Place++
	}

	for _, tuple := range diff.stop {
		name := tuple.Alloc.TaskGroup
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Stop++
	}

	for _, tuple := range diff.ignore {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Ignore++
	}

	for _, tuple := range diff.migrate {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Migrate++
	}

	for _, tuple := range inplaceUpdates {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.InPlaceUpdate++
	}

	for _, tuple := range destructiveUpdates {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.DestructiveUpdate++
	}

	return desiredTgs
}

// adjustQueuedAllocations decrements the number of allocations pending per task
// group based on the number of allocations successfully placed
func adjustQueuedAllocations(logger log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) {
	if result == nil {
		return
	}

	for _, allocations := range result.NodeAllocation {
		for _, allocation := range allocations {
			// Ensure that the allocation is newly created. We check that
			// the CreateIndex is equal to the ModifyIndex in order to check
			// that the allocation was just created. We do not check that
			// the CreateIndex is equal to the result's AllocIndex because
			// the allocations we get back have gone through the planner's
			// optimistic snapshot and thus their indexes may not be
			// correct, but they will be consistent.
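			// Consequently, CreateIndex == ModifyIndex identifies a newly
			// created allocation; anything modified after creation is an
			// update to an existing alloc and should not decrement the
			// queued count.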
			if allocation.CreateIndex != allocation.ModifyIndex {
				continue
			}

			if _, ok := queuedAllocs[allocation.TaskGroup]; ok {
				queuedAllocs[allocation.TaskGroup]--
			} else {
				logger.Error("allocation placed but task group is not in list of unplaced allocations", "task_group", allocation.TaskGroup)
			}
		}
	}
}

// updateNonTerminalAllocsToLost updates the allocations which are in a
// pending/running state on a tainted node to lost, but only for allocs whose
// DesiredStatus is already stop or evict.
func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*structs.Node, allocs []*structs.Allocation) {
	for _, alloc := range allocs {
		node, ok := tainted[alloc.NodeID]
		if !ok {
			continue
		}

		// Only handle down nodes or nodes that are gone (node == nil)
		if node != nil && node.Status != structs.NodeStatusDown {
			continue
		}

		// If the scheduler has already marked the alloc as stop or evict but
		// the client still reports it as pending or running, change its
		// client status to lost.
		if (alloc.DesiredStatus == structs.AllocDesiredStatusStop ||
			alloc.DesiredStatus == structs.AllocDesiredStatusEvict) &&
			(alloc.ClientStatus == structs.AllocClientStatusRunning ||
				alloc.ClientStatus == structs.AllocClientStatusPending) {
			plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost, "")
		}
	}
}

// genericAllocUpdateFn is a factory for the scheduler to create an allocUpdateType
// function to be passed into the reconciler. The factory takes objects that
// exist only in the scheduler context and returns a function that can be used
// by the reconciler to make decisions about how to update an allocation. The
// factory allows the reconciler to remain unaware of how the necessary update
// type is determined, and minimizes the set of objects the reconciler is
// exposed to.
func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateType {
	return func(existing *structs.Allocation, newJob *structs.Job, newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) {
		// Same index, so nothing to do
		if existing.Job.JobModifyIndex == newJob.JobModifyIndex {
			return true, false, nil
		}

		// Check if the task drivers or config have changed; if so, this
		// requires a destructive upgrade since that cannot be done in-place.
		if tasksUpdated(newJob, existing.Job, newTG.Name) {
			return false, true, nil
		}

		// Terminal batch allocations are not filtered when they are completed
		// successfully. We should avoid adding the allocation to the plan in
		// the case that it is an in-place update to avoid both additional data
		// in the plan and work for the clients.
		if existing.TerminalStatus() {
			return true, false, nil
		}

		// Get the existing node
		ws := memdb.NewWatchSet()
		node, err := ctx.State().NodeByID(ws, existing.NodeID)
		if err != nil {
			ctx.Logger().Error("failed to get node", "node_id", existing.NodeID, "error", err)
			return true, false, nil
		}
		if node == nil {
			return false, true, nil
		}

		// Set the existing node as the base set
		stack.SetNodes([]*structs.Node{node})

		// Stage an eviction of the current allocation. This is done so that
		// the current allocation is discounted when checking for feasibility.
		// Otherwise we would be trying to fit the task's current resources and
		// updated resources. After select is called we can remove the evict.
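		// The stage/select/pop sequence below mirrors inplaceUpdate above:
		// staging the stop frees the existing allocation's resources so that
		// Select measures feasibility for the updated task group alone rather
		// than for the old and new resource asks at once.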
		ctx.Plan().AppendStoppedAlloc(existing, allocInPlace, "", "")

		// Attempt to match the task group
		option := stack.Select(newTG, nil) // This select only looks at one node so we don't pass selectOptions

		// Pop the allocation
		ctx.Plan().PopUpdate(existing)

		// Require destructive if we could not do an in-place update
		if option == nil {
			return false, true, nil
		}

		// Restore the network and device offers from the existing allocation.
		// We do not allow network resources (reserved/dynamic ports)
		// to be updated. This is guarded in tasksUpdated, so we can
		// safely restore those here.
		for task, resources := range option.TaskResources {
			var networks structs.Networks
			var devices []*structs.AllocatedDeviceResource
			if existing.AllocatedResources != nil {
				if tr, ok := existing.AllocatedResources.Tasks[task]; ok {
					networks = tr.Networks
					devices = tr.Devices
				}
			} else if tr, ok := existing.TaskResources[task]; ok {
				networks = tr.Networks
			}

			// Add the networks and devices back
			resources.Networks = networks
			resources.Devices = devices
		}

		// Create a shallow copy
		newAlloc := new(structs.Allocation)
		*newAlloc = *existing

		// Update the allocation
		newAlloc.EvalID = evalID
		newAlloc.Job = nil       // Use the Job in the Plan
		newAlloc.Resources = nil // Computed in Plan Apply
		newAlloc.AllocatedResources = &structs.AllocatedResources{
			Tasks:          option.TaskResources,
			TaskLifecycles: option.TaskLifecycles,
			Shared: structs.AllocatedSharedResources{
				DiskMB: int64(newTG.EphemeralDisk.SizeMB),
			},
		}

		// Since this is an in-place update, we should copy network
		// information from the original alloc. This is similar to how
		// we copy network info for task level networks above.
		//
		// existing.AllocatedResources is nil on Allocations created by
		// Nomad v0.8 or earlier.
		if existing.AllocatedResources != nil {
			newAlloc.AllocatedResources.Shared.Networks = existing.AllocatedResources.Shared.Networks
		}

		// Use metrics from the existing alloc for the in-place upgrade. If
		// the in-place upgrade succeeded, any scoring metadata from when the
		// alloc first went through the scheduler should still be preserved.
		// Using scoring metadata from the context would incorrectly replace
		// it with metadata from only the single node the allocation is
		// already on.
		newAlloc.Metrics = existing.Metrics.Copy()
		return false, false, newAlloc
	}
}
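// Illustrative sketch (not part of this file): retryMax and progressMade can
// be combined so that a plan-submission loop only gives up after max
// consecutive attempts without forward progress. The submitPlan helper and
// isComplete predicate below are hypothetical.
//
//	var lastResult *structs.PlanResult
//	err := retryMax(5, func() (bool, error) {
//		result, err := submitPlan() // hypothetical helper
//		if err != nil {
//			return false, err
//		}
//		lastResult = result
//		return isComplete(result), nil // hypothetical predicate
//	}, func() bool {
//		// Reset the attempt budget whenever the last plan made progress.
//		return progressMade(lastResult)
//	})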