github.com/djenriquez/nomad-1@v0.8.1/scheduler/util.go (about) 1 package scheduler 2 3 import ( 4 "fmt" 5 "log" 6 "math/rand" 7 "reflect" 8 9 memdb "github.com/hashicorp/go-memdb" 10 "github.com/hashicorp/nomad/nomad/structs" 11 ) 12 13 // allocTuple is a tuple of the allocation name and potential alloc ID 14 type allocTuple struct { 15 Name string 16 TaskGroup *structs.TaskGroup 17 Alloc *structs.Allocation 18 } 19 20 // materializeTaskGroups is used to materialize all the task groups 21 // a job requires. This is used to do the count expansion. 22 func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup { 23 out := make(map[string]*structs.TaskGroup) 24 if job.Stopped() { 25 return out 26 } 27 28 for _, tg := range job.TaskGroups { 29 for i := 0; i < tg.Count; i++ { 30 name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i) 31 out[name] = tg 32 } 33 } 34 return out 35 } 36 37 // diffResult is used to return the sets that result from the diff 38 type diffResult struct { 39 place, update, migrate, stop, ignore, lost []allocTuple 40 } 41 42 func (d *diffResult) GoString() string { 43 return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d) (lost %d)", 44 len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore), len(d.lost)) 45 } 46 47 func (d *diffResult) Append(other *diffResult) { 48 d.place = append(d.place, other.place...) 49 d.update = append(d.update, other.update...) 50 d.migrate = append(d.migrate, other.migrate...) 51 d.stop = append(d.stop, other.stop...) 52 d.ignore = append(d.ignore, other.ignore...) 53 d.lost = append(d.lost, other.lost...) 54 } 55 56 // diffAllocs is used to do a set difference between the target allocations 57 // and the existing allocations. This returns 6 sets of results, the list of 58 // named task groups that need to be placed (no existing allocation), the 59 // allocations that need to be updated (job definition is newer), allocs that 60 // need to be migrated (node is draining), the allocs that need to be evicted 61 // (no longer required), those that should be ignored and those that are lost 62 // that need to be replaced (running on a lost node). 63 // 64 // job is the job whose allocs is going to be diff-ed. 65 // taintedNodes is an index of the nodes which are either down or in drain mode 66 // by name. 67 // required is a set of allocations that must exist. 68 // allocs is a list of non terminal allocations. 69 // terminalAllocs is an index of the latest terminal allocations by name. 70 func diffAllocs(job *structs.Job, taintedNodes map[string]*structs.Node, 71 required map[string]*structs.TaskGroup, allocs []*structs.Allocation, 72 terminalAllocs map[string]*structs.Allocation) *diffResult { 73 result := &diffResult{} 74 75 // Scan the existing updates 76 existing := make(map[string]struct{}) 77 for _, exist := range allocs { 78 // Index the existing node 79 name := exist.Name 80 existing[name] = struct{}{} 81 82 // Check for the definition in the required set 83 tg, ok := required[name] 84 85 // If not required, we stop the alloc 86 if !ok { 87 result.stop = append(result.stop, allocTuple{ 88 Name: name, 89 TaskGroup: tg, 90 Alloc: exist, 91 }) 92 continue 93 } 94 95 // If we have been marked for migration and aren't terminal, migrate 96 if !exist.TerminalStatus() && exist.DesiredTransition.ShouldMigrate() { 97 result.migrate = append(result.migrate, allocTuple{ 98 Name: name, 99 TaskGroup: tg, 100 Alloc: exist, 101 }) 102 continue 103 } 104 // If we are on a tainted node, we must migrate if we are a service or 105 // if the batch allocation did not finish 106 if node, ok := taintedNodes[exist.NodeID]; ok { 107 // If the job is batch and finished successfully, the fact that the 108 // node is tainted does not mean it should be migrated or marked as 109 // lost as the work was already successfully finished. However for 110 // service/system jobs, tasks should never complete. The check of 111 // batch type, defends against client bugs. 112 if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() { 113 goto IGNORE 114 } 115 116 if !exist.TerminalStatus() && (node == nil || node.TerminalStatus()) { 117 result.lost = append(result.lost, allocTuple{ 118 Name: name, 119 TaskGroup: tg, 120 Alloc: exist, 121 }) 122 } else { 123 goto IGNORE 124 } 125 126 continue 127 } 128 129 // If the definition is updated we need to update 130 if job.JobModifyIndex != exist.Job.JobModifyIndex { 131 result.update = append(result.update, allocTuple{ 132 Name: name, 133 TaskGroup: tg, 134 Alloc: exist, 135 }) 136 continue 137 } 138 139 // Everything is up-to-date 140 IGNORE: 141 result.ignore = append(result.ignore, allocTuple{ 142 Name: name, 143 TaskGroup: tg, 144 Alloc: exist, 145 }) 146 } 147 148 // Scan the required groups 149 for name, tg := range required { 150 // Check for an existing allocation 151 _, ok := existing[name] 152 153 // Require a placement if no existing allocation. If there 154 // is an existing allocation, we would have checked for a potential 155 // update or ignore above. 156 if !ok { 157 result.place = append(result.place, allocTuple{ 158 Name: name, 159 TaskGroup: tg, 160 Alloc: terminalAllocs[name], 161 }) 162 } 163 } 164 return result 165 } 166 167 // diffSystemAllocs is like diffAllocs however, the allocations in the 168 // diffResult contain the specific nodeID they should be allocated on. 169 // 170 // job is the job whose allocs is going to be diff-ed. 171 // nodes is a list of nodes in ready state. 172 // taintedNodes is an index of the nodes which are either down or in drain mode 173 // by name. 174 // allocs is a list of non terminal allocations. 175 // terminalAllocs is an index of the latest terminal allocations by name. 176 func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]*structs.Node, 177 allocs []*structs.Allocation, terminalAllocs map[string]*structs.Allocation) *diffResult { 178 179 // Build a mapping of nodes to all their allocs. 180 nodeAllocs := make(map[string][]*structs.Allocation, len(allocs)) 181 for _, alloc := range allocs { 182 nallocs := append(nodeAllocs[alloc.NodeID], alloc) 183 nodeAllocs[alloc.NodeID] = nallocs 184 } 185 186 for _, node := range nodes { 187 if _, ok := nodeAllocs[node.ID]; !ok { 188 nodeAllocs[node.ID] = nil 189 } 190 } 191 192 // Create the required task groups. 193 required := materializeTaskGroups(job) 194 195 result := &diffResult{} 196 for nodeID, allocs := range nodeAllocs { 197 diff := diffAllocs(job, taintedNodes, required, allocs, terminalAllocs) 198 199 // If the node is tainted there should be no placements made 200 if _, ok := taintedNodes[nodeID]; ok { 201 diff.place = nil 202 } else { 203 // Mark the alloc as being for a specific node. 204 for i := range diff.place { 205 alloc := &diff.place[i] 206 207 // If the new allocation isn't annotated with a previous allocation 208 // or if the previous allocation isn't from the same node then we 209 // annotate the allocTuple with a new Allocation 210 if alloc.Alloc == nil || alloc.Alloc.NodeID != nodeID { 211 alloc.Alloc = &structs.Allocation{NodeID: nodeID} 212 } 213 } 214 } 215 216 result.Append(diff) 217 } 218 219 return result 220 } 221 222 // readyNodesInDCs returns all the ready nodes in the given datacenters and a 223 // mapping of each data center to the count of ready nodes. 224 func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) { 225 // Index the DCs 226 dcMap := make(map[string]int, len(dcs)) 227 for _, dc := range dcs { 228 dcMap[dc] = 0 229 } 230 231 // Scan the nodes 232 ws := memdb.NewWatchSet() 233 var out []*structs.Node 234 iter, err := state.Nodes(ws) 235 if err != nil { 236 return nil, nil, err 237 } 238 for { 239 raw := iter.Next() 240 if raw == nil { 241 break 242 } 243 244 // Filter on datacenter and status 245 node := raw.(*structs.Node) 246 if node.Status != structs.NodeStatusReady { 247 continue 248 } 249 if node.Drain { 250 continue 251 } 252 if node.SchedulingEligibility != structs.NodeSchedulingEligible { 253 continue 254 } 255 if _, ok := dcMap[node.Datacenter]; !ok { 256 continue 257 } 258 out = append(out, node) 259 dcMap[node.Datacenter]++ 260 } 261 return out, dcMap, nil 262 } 263 264 // retryMax is used to retry a callback until it returns success or 265 // a maximum number of attempts is reached. An optional reset function may be 266 // passed which is called after each failed iteration. If the reset function is 267 // set and returns true, the number of attempts is reset back to max. 268 func retryMax(max int, cb func() (bool, error), reset func() bool) error { 269 attempts := 0 270 for attempts < max { 271 done, err := cb() 272 if err != nil { 273 return err 274 } 275 if done { 276 return nil 277 } 278 279 // Check if we should reset the number attempts 280 if reset != nil && reset() { 281 attempts = 0 282 } else { 283 attempts++ 284 } 285 } 286 return &SetStatusError{ 287 Err: fmt.Errorf("maximum attempts reached (%d)", max), 288 EvalStatus: structs.EvalStatusFailed, 289 } 290 } 291 292 // progressMade checks to see if the plan result made allocations or updates. 293 // If the result is nil, false is returned. 294 func progressMade(result *structs.PlanResult) bool { 295 return result != nil && (len(result.NodeUpdate) != 0 || 296 len(result.NodeAllocation) != 0 || result.Deployment != nil || 297 len(result.DeploymentUpdates) != 0) 298 } 299 300 // taintedNodes is used to scan the allocations and then check if the 301 // underlying nodes are tainted, and should force a migration of the allocation. 302 // All the nodes returned in the map are tainted. 303 func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*structs.Node, error) { 304 out := make(map[string]*structs.Node) 305 for _, alloc := range allocs { 306 if _, ok := out[alloc.NodeID]; ok { 307 continue 308 } 309 310 ws := memdb.NewWatchSet() 311 node, err := state.NodeByID(ws, alloc.NodeID) 312 if err != nil { 313 return nil, err 314 } 315 316 // If the node does not exist, we should migrate 317 if node == nil { 318 out[alloc.NodeID] = nil 319 continue 320 } 321 if structs.ShouldDrainNode(node.Status) || node.Drain { 322 out[alloc.NodeID] = node 323 } 324 } 325 return out, nil 326 } 327 328 // shuffleNodes randomizes the slice order with the Fisher-Yates algorithm 329 func shuffleNodes(nodes []*structs.Node) { 330 n := len(nodes) 331 for i := n - 1; i > 0; i-- { 332 j := rand.Intn(i + 1) 333 nodes[i], nodes[j] = nodes[j], nodes[i] 334 } 335 } 336 337 // tasksUpdated does a diff between task groups to see if the 338 // tasks, their drivers, environment variables or config have updated. The 339 // inputs are the task group name to diff and two jobs to diff. 340 func tasksUpdated(jobA, jobB *structs.Job, taskGroup string) bool { 341 a := jobA.LookupTaskGroup(taskGroup) 342 b := jobB.LookupTaskGroup(taskGroup) 343 344 // If the number of tasks do not match, clearly there is an update 345 if len(a.Tasks) != len(b.Tasks) { 346 return true 347 } 348 349 // Check ephemeral disk 350 if !reflect.DeepEqual(a.EphemeralDisk, b.EphemeralDisk) { 351 return true 352 } 353 354 // Check each task 355 for _, at := range a.Tasks { 356 bt := b.LookupTask(at.Name) 357 if bt == nil { 358 return true 359 } 360 if at.Driver != bt.Driver { 361 return true 362 } 363 if at.User != bt.User { 364 return true 365 } 366 if !reflect.DeepEqual(at.Config, bt.Config) { 367 return true 368 } 369 if !reflect.DeepEqual(at.Env, bt.Env) { 370 return true 371 } 372 if !reflect.DeepEqual(at.Artifacts, bt.Artifacts) { 373 return true 374 } 375 if !reflect.DeepEqual(at.Vault, bt.Vault) { 376 return true 377 } 378 if !reflect.DeepEqual(at.Templates, bt.Templates) { 379 return true 380 } 381 382 // Check the metadata 383 if !reflect.DeepEqual( 384 jobA.CombinedTaskMeta(taskGroup, at.Name), 385 jobB.CombinedTaskMeta(taskGroup, bt.Name)) { 386 return true 387 } 388 389 // Inspect the network to see if the dynamic ports are different 390 if len(at.Resources.Networks) != len(bt.Resources.Networks) { 391 return true 392 } 393 for idx := range at.Resources.Networks { 394 an := at.Resources.Networks[idx] 395 bn := bt.Resources.Networks[idx] 396 397 if an.MBits != bn.MBits { 398 return true 399 } 400 401 aPorts, bPorts := networkPortMap(an), networkPortMap(bn) 402 if !reflect.DeepEqual(aPorts, bPorts) { 403 return true 404 } 405 } 406 407 // Inspect the non-network resources 408 if ar, br := at.Resources, bt.Resources; ar.CPU != br.CPU { 409 return true 410 } else if ar.MemoryMB != br.MemoryMB { 411 return true 412 } else if ar.IOPS != br.IOPS { 413 return true 414 } 415 } 416 return false 417 } 418 419 // networkPortMap takes a network resource and returns a map of port labels to 420 // values. The value for dynamic ports is disregarded even if it is set. This 421 // makes this function suitable for comparing two network resources for changes. 422 func networkPortMap(n *structs.NetworkResource) map[string]int { 423 m := make(map[string]int, len(n.DynamicPorts)+len(n.ReservedPorts)) 424 for _, p := range n.ReservedPorts { 425 m[p.Label] = p.Value 426 } 427 for _, p := range n.DynamicPorts { 428 m[p.Label] = -1 429 } 430 return m 431 } 432 433 // setStatus is used to update the status of the evaluation 434 func setStatus(logger *log.Logger, planner Planner, 435 eval, nextEval, spawnedBlocked *structs.Evaluation, 436 tgMetrics map[string]*structs.AllocMetric, status, desc string, 437 queuedAllocs map[string]int, deploymentID string) error { 438 439 logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status) 440 newEval := eval.Copy() 441 newEval.Status = status 442 newEval.StatusDescription = desc 443 newEval.DeploymentID = deploymentID 444 newEval.FailedTGAllocs = tgMetrics 445 if nextEval != nil { 446 newEval.NextEval = nextEval.ID 447 } 448 if spawnedBlocked != nil { 449 newEval.BlockedEval = spawnedBlocked.ID 450 } 451 if queuedAllocs != nil { 452 newEval.QueuedAllocations = queuedAllocs 453 } 454 455 return planner.UpdateEval(newEval) 456 } 457 458 // inplaceUpdate attempts to update allocations in-place where possible. It 459 // returns the allocs that couldn't be done inplace and then those that could. 460 func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job, 461 stack Stack, updates []allocTuple) (destructive, inplace []allocTuple) { 462 463 // doInplace manipulates the updates map to make the current allocation 464 // an inplace update. 465 doInplace := func(cur, last, inplaceCount *int) { 466 updates[*cur], updates[*last-1] = updates[*last-1], updates[*cur] 467 *cur-- 468 *last-- 469 *inplaceCount++ 470 } 471 472 ws := memdb.NewWatchSet() 473 n := len(updates) 474 inplaceCount := 0 475 for i := 0; i < n; i++ { 476 // Get the update 477 update := updates[i] 478 479 // Check if the task drivers or config has changed, requires 480 // a rolling upgrade since that cannot be done in-place. 481 existing := update.Alloc.Job 482 if tasksUpdated(job, existing, update.TaskGroup.Name) { 483 continue 484 } 485 486 // Terminal batch allocations are not filtered when they are completed 487 // successfully. We should avoid adding the allocation to the plan in 488 // the case that it is an in-place update to avoid both additional data 489 // in the plan and work for the clients. 490 if update.Alloc.TerminalStatus() { 491 doInplace(&i, &n, &inplaceCount) 492 continue 493 } 494 495 // Get the existing node 496 node, err := ctx.State().NodeByID(ws, update.Alloc.NodeID) 497 if err != nil { 498 ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v", 499 eval, update.Alloc.NodeID, err) 500 continue 501 } 502 if node == nil { 503 continue 504 } 505 506 // Set the existing node as the base set 507 stack.SetNodes([]*structs.Node{node}) 508 509 // Stage an eviction of the current allocation. This is done so that 510 // the current allocation is discounted when checking for feasibility. 511 // Otherwise we would be trying to fit the tasks current resources and 512 // updated resources. After select is called we can remove the evict. 513 ctx.Plan().AppendUpdate(update.Alloc, structs.AllocDesiredStatusStop, 514 allocInPlace, "") 515 516 // Attempt to match the task group 517 option, _ := stack.Select(update.TaskGroup, nil) // This select only looks at one node so we don't pass selectOptions 518 519 // Pop the allocation 520 ctx.Plan().PopUpdate(update.Alloc) 521 522 // Skip if we could not do an in-place update 523 if option == nil { 524 continue 525 } 526 527 // Restore the network offers from the existing allocation. 528 // We do not allow network resources (reserved/dynamic ports) 529 // to be updated. This is guarded in taskUpdated, so we can 530 // safely restore those here. 531 for task, resources := range option.TaskResources { 532 existing := update.Alloc.TaskResources[task] 533 resources.Networks = existing.Networks 534 } 535 536 // Create a shallow copy 537 newAlloc := new(structs.Allocation) 538 *newAlloc = *update.Alloc 539 540 // Update the allocation 541 newAlloc.EvalID = eval.ID 542 newAlloc.Job = nil // Use the Job in the Plan 543 newAlloc.Resources = nil // Computed in Plan Apply 544 newAlloc.TaskResources = option.TaskResources 545 newAlloc.Metrics = ctx.Metrics() 546 ctx.Plan().AppendAlloc(newAlloc) 547 548 // Remove this allocation from the slice 549 doInplace(&i, &n, &inplaceCount) 550 } 551 552 if len(updates) > 0 { 553 ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplaceCount, len(updates)) 554 } 555 return updates[:n], updates[n:] 556 } 557 558 // evictAndPlace is used to mark allocations for evicts and add them to the 559 // placement queue. evictAndPlace modifies both the diffResult and the 560 // limit. It returns true if the limit has been reached. 561 func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool { 562 n := len(allocs) 563 for i := 0; i < n && i < *limit; i++ { 564 a := allocs[i] 565 ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc, "") 566 diff.place = append(diff.place, a) 567 } 568 if n <= *limit { 569 *limit -= n 570 return false 571 } 572 *limit = 0 573 return true 574 } 575 576 // tgConstrainTuple is used to store the total constraints of a task group. 577 type tgConstrainTuple struct { 578 // Holds the combined constraints of the task group and all it's sub-tasks. 579 constraints []*structs.Constraint 580 581 // The set of required drivers within the task group. 582 drivers map[string]struct{} 583 584 // The combined resources of all tasks within the task group. 585 size *structs.Resources 586 } 587 588 // taskGroupConstraints collects the constraints, drivers and resources required by each 589 // sub-task to aggregate the TaskGroup totals 590 func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple { 591 c := tgConstrainTuple{ 592 constraints: make([]*structs.Constraint, 0, len(tg.Constraints)), 593 drivers: make(map[string]struct{}), 594 size: &structs.Resources{DiskMB: tg.EphemeralDisk.SizeMB}, 595 } 596 597 c.constraints = append(c.constraints, tg.Constraints...) 598 for _, task := range tg.Tasks { 599 c.drivers[task.Driver] = struct{}{} 600 c.constraints = append(c.constraints, task.Constraints...) 601 c.size.Add(task.Resources) 602 } 603 604 return c 605 } 606 607 // desiredUpdates takes the diffResult as well as the set of inplace and 608 // destructive updates and returns a map of task groups to their set of desired 609 // updates. 610 func desiredUpdates(diff *diffResult, inplaceUpdates, 611 destructiveUpdates []allocTuple) map[string]*structs.DesiredUpdates { 612 desiredTgs := make(map[string]*structs.DesiredUpdates) 613 614 for _, tuple := range diff.place { 615 name := tuple.TaskGroup.Name 616 des, ok := desiredTgs[name] 617 if !ok { 618 des = &structs.DesiredUpdates{} 619 desiredTgs[name] = des 620 } 621 622 des.Place++ 623 } 624 625 for _, tuple := range diff.stop { 626 name := tuple.Alloc.TaskGroup 627 des, ok := desiredTgs[name] 628 if !ok { 629 des = &structs.DesiredUpdates{} 630 desiredTgs[name] = des 631 } 632 633 des.Stop++ 634 } 635 636 for _, tuple := range diff.ignore { 637 name := tuple.TaskGroup.Name 638 des, ok := desiredTgs[name] 639 if !ok { 640 des = &structs.DesiredUpdates{} 641 desiredTgs[name] = des 642 } 643 644 des.Ignore++ 645 } 646 647 for _, tuple := range diff.migrate { 648 name := tuple.TaskGroup.Name 649 des, ok := desiredTgs[name] 650 if !ok { 651 des = &structs.DesiredUpdates{} 652 desiredTgs[name] = des 653 } 654 655 des.Migrate++ 656 } 657 658 for _, tuple := range inplaceUpdates { 659 name := tuple.TaskGroup.Name 660 des, ok := desiredTgs[name] 661 if !ok { 662 des = &structs.DesiredUpdates{} 663 desiredTgs[name] = des 664 } 665 666 des.InPlaceUpdate++ 667 } 668 669 for _, tuple := range destructiveUpdates { 670 name := tuple.TaskGroup.Name 671 des, ok := desiredTgs[name] 672 if !ok { 673 des = &structs.DesiredUpdates{} 674 desiredTgs[name] = des 675 } 676 677 des.DestructiveUpdate++ 678 } 679 680 return desiredTgs 681 } 682 683 // adjustQueuedAllocations decrements the number of allocations pending per task 684 // group based on the number of allocations successfully placed 685 func adjustQueuedAllocations(logger *log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) { 686 if result == nil { 687 return 688 } 689 690 for _, allocations := range result.NodeAllocation { 691 for _, allocation := range allocations { 692 // Ensure that the allocation is newly created. We check that 693 // the CreateIndex is equal to the ModifyIndex in order to check 694 // that the allocation was just created. We do not check that 695 // the CreateIndex is equal to the results AllocIndex because 696 // the allocations we get back have gone through the planner's 697 // optimistic snapshot and thus their indexes may not be 698 // correct, but they will be consistent. 699 if allocation.CreateIndex != allocation.ModifyIndex { 700 continue 701 } 702 703 if _, ok := queuedAllocs[allocation.TaskGroup]; ok { 704 queuedAllocs[allocation.TaskGroup]-- 705 } else { 706 logger.Printf("[ERR] sched: allocation %q placed but not in list of unplaced allocations", allocation.TaskGroup) 707 } 708 } 709 } 710 } 711 712 // updateNonTerminalAllocsToLost updates the allocations which are in pending/running state on tainted node 713 // to lost 714 func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*structs.Node, allocs []*structs.Allocation) { 715 for _, alloc := range allocs { 716 node, ok := tainted[alloc.NodeID] 717 if !ok { 718 continue 719 } 720 721 // Only handle down nodes or nodes that are gone (node == nil) 722 if node != nil && node.Status != structs.NodeStatusDown { 723 continue 724 } 725 726 // If the scheduler has marked it as stop already but the alloc wasn't 727 // terminal on the client change the status to lost. 728 if alloc.DesiredStatus == structs.AllocDesiredStatusStop && 729 (alloc.ClientStatus == structs.AllocClientStatusRunning || 730 alloc.ClientStatus == structs.AllocClientStatusPending) { 731 plan.AppendUpdate(alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost) 732 } 733 } 734 } 735 736 // genericAllocUpdateFn is a factory for the scheduler to create an allocUpdateType 737 // function to be passed into the reconciler. The factory takes objects that 738 // exist only in the scheduler context and returns a function that can be used 739 // by the reconciler to make decisions about how to update an allocation. The 740 // factory allows the reconciler to be unaware of how to determine the type of 741 // update necessary and can minimize the set of objects it is exposed to. 742 func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateType { 743 return func(existing *structs.Allocation, newJob *structs.Job, newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) { 744 // Same index, so nothing to do 745 if existing.Job.JobModifyIndex == newJob.JobModifyIndex { 746 return true, false, nil 747 } 748 749 // Check if the task drivers or config has changed, requires 750 // a destructive upgrade since that cannot be done in-place. 751 if tasksUpdated(newJob, existing.Job, newTG.Name) { 752 return false, true, nil 753 } 754 755 // Terminal batch allocations are not filtered when they are completed 756 // successfully. We should avoid adding the allocation to the plan in 757 // the case that it is an in-place update to avoid both additional data 758 // in the plan and work for the clients. 759 if existing.TerminalStatus() { 760 return true, false, nil 761 } 762 763 // Get the existing node 764 ws := memdb.NewWatchSet() 765 node, err := ctx.State().NodeByID(ws, existing.NodeID) 766 if err != nil { 767 ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v", evalID, existing.NodeID, err) 768 return true, false, nil 769 } 770 if node == nil { 771 return false, true, nil 772 } 773 774 // Set the existing node as the base set 775 stack.SetNodes([]*structs.Node{node}) 776 777 // Stage an eviction of the current allocation. This is done so that 778 // the current allocation is discounted when checking for feasibility. 779 // Otherwise we would be trying to fit the tasks current resources and 780 // updated resources. After select is called we can remove the evict. 781 ctx.Plan().AppendUpdate(existing, structs.AllocDesiredStatusStop, allocInPlace, "") 782 783 // Attempt to match the task group 784 option, _ := stack.Select(newTG, nil) // This select only looks at one node so we don't pass selectOptions 785 786 // Pop the allocation 787 ctx.Plan().PopUpdate(existing) 788 789 // Require destructive if we could not do an in-place update 790 if option == nil { 791 return false, true, nil 792 } 793 794 // Restore the network offers from the existing allocation. 795 // We do not allow network resources (reserved/dynamic ports) 796 // to be updated. This is guarded in taskUpdated, so we can 797 // safely restore those here. 798 for task, resources := range option.TaskResources { 799 existingResources := existing.TaskResources[task] 800 resources.Networks = existingResources.Networks 801 } 802 803 // Create a shallow copy 804 newAlloc := new(structs.Allocation) 805 *newAlloc = *existing 806 807 // Update the allocation 808 newAlloc.EvalID = evalID 809 newAlloc.Job = nil // Use the Job in the Plan 810 newAlloc.Resources = nil // Computed in Plan Apply 811 newAlloc.TaskResources = option.TaskResources 812 newAlloc.Metrics = ctx.Metrics() 813 return false, false, newAlloc 814 } 815 }