github.com/jrxfive/nomad@v0.6.1-0.20170802162750-1fef470e89bf/scheduler/util.go (about) 1 package scheduler 2 3 import ( 4 "fmt" 5 "log" 6 "math/rand" 7 "reflect" 8 9 memdb "github.com/hashicorp/go-memdb" 10 "github.com/hashicorp/nomad/nomad/structs" 11 ) 12 13 // allocTuple is a tuple of the allocation name and potential alloc ID 14 type allocTuple struct { 15 Name string 16 TaskGroup *structs.TaskGroup 17 Alloc *structs.Allocation 18 } 19 20 // materializeTaskGroups is used to materialize all the task groups 21 // a job requires. This is used to do the count expansion. 22 func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup { 23 out := make(map[string]*structs.TaskGroup) 24 if job.Stopped() { 25 return out 26 } 27 28 for _, tg := range job.TaskGroups { 29 for i := 0; i < tg.Count; i++ { 30 name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i) 31 out[name] = tg 32 } 33 } 34 return out 35 } 36 37 // diffResult is used to return the sets that result from the diff 38 type diffResult struct { 39 place, update, migrate, stop, ignore, lost []allocTuple 40 } 41 42 func (d *diffResult) GoString() string { 43 return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d) (lost %d)", 44 len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore), len(d.lost)) 45 } 46 47 func (d *diffResult) Append(other *diffResult) { 48 d.place = append(d.place, other.place...) 49 d.update = append(d.update, other.update...) 50 d.migrate = append(d.migrate, other.migrate...) 51 d.stop = append(d.stop, other.stop...) 52 d.ignore = append(d.ignore, other.ignore...) 53 d.lost = append(d.lost, other.lost...) 54 } 55 56 // diffAllocs is used to do a set difference between the target allocations 57 // and the existing allocations. This returns 6 sets of results, the list of 58 // named task groups that need to be placed (no existing allocation), the 59 // allocations that need to be updated (job definition is newer), allocs that 60 // need to be migrated (node is draining), the allocs that need to be evicted 61 // (no longer required), those that should be ignored and those that are lost 62 // that need to be replaced (running on a lost node). 63 // 64 // job is the job whose allocs is going to be diff-ed. 65 // taintedNodes is an index of the nodes which are either down or in drain mode 66 // by name. 67 // required is a set of allocations that must exist. 68 // allocs is a list of non terminal allocations. 69 // terminalAllocs is an index of the latest terminal allocations by name. 70 func diffAllocs(job *structs.Job, taintedNodes map[string]*structs.Node, 71 required map[string]*structs.TaskGroup, allocs []*structs.Allocation, 72 terminalAllocs map[string]*structs.Allocation) *diffResult { 73 result := &diffResult{} 74 75 // Scan the existing updates 76 existing := make(map[string]struct{}) 77 for _, exist := range allocs { 78 // Index the existing node 79 name := exist.Name 80 existing[name] = struct{}{} 81 82 // Check for the definition in the required set 83 tg, ok := required[name] 84 85 // If not required, we stop the alloc 86 if !ok { 87 result.stop = append(result.stop, allocTuple{ 88 Name: name, 89 TaskGroup: tg, 90 Alloc: exist, 91 }) 92 continue 93 } 94 95 // If we are on a tainted node, we must migrate if we are a service or 96 // if the batch allocation did not finish 97 if node, ok := taintedNodes[exist.NodeID]; ok { 98 // If the job is batch and finished successfully, the fact that the 99 // node is tainted does not mean it should be migrated or marked as 100 // lost as the work was already successfully finished. However for 101 // service/system jobs, tasks should never complete. The check of 102 // batch type, defends against client bugs. 103 if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() { 104 goto IGNORE 105 } 106 107 if node == nil || node.TerminalStatus() { 108 result.lost = append(result.lost, allocTuple{ 109 Name: name, 110 TaskGroup: tg, 111 Alloc: exist, 112 }) 113 } else { 114 // This is the drain case 115 result.migrate = append(result.migrate, allocTuple{ 116 Name: name, 117 TaskGroup: tg, 118 Alloc: exist, 119 }) 120 } 121 continue 122 } 123 124 // If the definition is updated we need to update 125 if job.JobModifyIndex != exist.Job.JobModifyIndex { 126 result.update = append(result.update, allocTuple{ 127 Name: name, 128 TaskGroup: tg, 129 Alloc: exist, 130 }) 131 continue 132 } 133 134 // Everything is up-to-date 135 IGNORE: 136 result.ignore = append(result.ignore, allocTuple{ 137 Name: name, 138 TaskGroup: tg, 139 Alloc: exist, 140 }) 141 } 142 143 // Scan the required groups 144 for name, tg := range required { 145 // Check for an existing allocation 146 _, ok := existing[name] 147 148 // Require a placement if no existing allocation. If there 149 // is an existing allocation, we would have checked for a potential 150 // update or ignore above. 151 if !ok { 152 result.place = append(result.place, allocTuple{ 153 Name: name, 154 TaskGroup: tg, 155 Alloc: terminalAllocs[name], 156 }) 157 } 158 } 159 return result 160 } 161 162 // diffSystemAllocs is like diffAllocs however, the allocations in the 163 // diffResult contain the specific nodeID they should be allocated on. 164 // 165 // job is the job whose allocs is going to be diff-ed. 166 // nodes is a list of nodes in ready state. 167 // taintedNodes is an index of the nodes which are either down or in drain mode 168 // by name. 169 // allocs is a list of non terminal allocations. 170 // terminalAllocs is an index of the latest terminal allocations by name. 171 func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]*structs.Node, 172 allocs []*structs.Allocation, terminalAllocs map[string]*structs.Allocation) *diffResult { 173 174 // Build a mapping of nodes to all their allocs. 175 nodeAllocs := make(map[string][]*structs.Allocation, len(allocs)) 176 for _, alloc := range allocs { 177 nallocs := append(nodeAllocs[alloc.NodeID], alloc) 178 nodeAllocs[alloc.NodeID] = nallocs 179 } 180 181 for _, node := range nodes { 182 if _, ok := nodeAllocs[node.ID]; !ok { 183 nodeAllocs[node.ID] = nil 184 } 185 } 186 187 // Create the required task groups. 188 required := materializeTaskGroups(job) 189 190 result := &diffResult{} 191 for nodeID, allocs := range nodeAllocs { 192 diff := diffAllocs(job, taintedNodes, required, allocs, terminalAllocs) 193 194 // If the node is tainted there should be no placements made 195 if _, ok := taintedNodes[nodeID]; ok { 196 diff.place = nil 197 } else { 198 // Mark the alloc as being for a specific node. 199 for i := range diff.place { 200 alloc := &diff.place[i] 201 202 // If the new allocation isn't annotated with a previous allocation 203 // or if the previous allocation isn't from the same node then we 204 // annotate the allocTuple with a new Allocation 205 if alloc.Alloc == nil || alloc.Alloc.NodeID != nodeID { 206 alloc.Alloc = &structs.Allocation{NodeID: nodeID} 207 } 208 } 209 } 210 211 // Migrate does not apply to system jobs and instead should be marked as 212 // stop because if a node is tainted, the job is invalid on that node. 213 diff.stop = append(diff.stop, diff.migrate...) 214 diff.migrate = nil 215 216 result.Append(diff) 217 } 218 219 return result 220 } 221 222 // readyNodesInDCs returns all the ready nodes in the given datacenters and a 223 // mapping of each data center to the count of ready nodes. 224 func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) { 225 // Index the DCs 226 dcMap := make(map[string]int, len(dcs)) 227 for _, dc := range dcs { 228 dcMap[dc] = 0 229 } 230 231 // Scan the nodes 232 ws := memdb.NewWatchSet() 233 var out []*structs.Node 234 iter, err := state.Nodes(ws) 235 if err != nil { 236 return nil, nil, err 237 } 238 for { 239 raw := iter.Next() 240 if raw == nil { 241 break 242 } 243 244 // Filter on datacenter and status 245 node := raw.(*structs.Node) 246 if node.Status != structs.NodeStatusReady { 247 continue 248 } 249 if node.Drain { 250 continue 251 } 252 if _, ok := dcMap[node.Datacenter]; !ok { 253 continue 254 } 255 out = append(out, node) 256 dcMap[node.Datacenter] += 1 257 } 258 return out, dcMap, nil 259 } 260 261 // retryMax is used to retry a callback until it returns success or 262 // a maximum number of attempts is reached. An optional reset function may be 263 // passed which is called after each failed iteration. If the reset function is 264 // set and returns true, the number of attempts is reset back to max. 265 func retryMax(max int, cb func() (bool, error), reset func() bool) error { 266 attempts := 0 267 for attempts < max { 268 done, err := cb() 269 if err != nil { 270 return err 271 } 272 if done { 273 return nil 274 } 275 276 // Check if we should reset the number attempts 277 if reset != nil && reset() { 278 attempts = 0 279 } else { 280 attempts += 1 281 } 282 } 283 return &SetStatusError{ 284 Err: fmt.Errorf("maximum attempts reached (%d)", max), 285 EvalStatus: structs.EvalStatusFailed, 286 } 287 } 288 289 // progressMade checks to see if the plan result made allocations or updates. 290 // If the result is nil, false is returned. 291 func progressMade(result *structs.PlanResult) bool { 292 return result != nil && (len(result.NodeUpdate) != 0 || 293 len(result.NodeAllocation) != 0 || result.Deployment != nil || 294 len(result.DeploymentUpdates) != 0) 295 } 296 297 // taintedNodes is used to scan the allocations and then check if the 298 // underlying nodes are tainted, and should force a migration of the allocation. 299 // All the nodes returned in the map are tainted. 300 func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*structs.Node, error) { 301 out := make(map[string]*structs.Node) 302 for _, alloc := range allocs { 303 if _, ok := out[alloc.NodeID]; ok { 304 continue 305 } 306 307 ws := memdb.NewWatchSet() 308 node, err := state.NodeByID(ws, alloc.NodeID) 309 if err != nil { 310 return nil, err 311 } 312 313 // If the node does not exist, we should migrate 314 if node == nil { 315 out[alloc.NodeID] = nil 316 continue 317 } 318 if structs.ShouldDrainNode(node.Status) || node.Drain { 319 out[alloc.NodeID] = node 320 } 321 } 322 return out, nil 323 } 324 325 // shuffleNodes randomizes the slice order with the Fisher-Yates algorithm 326 func shuffleNodes(nodes []*structs.Node) { 327 n := len(nodes) 328 for i := n - 1; i > 0; i-- { 329 j := rand.Intn(i + 1) 330 nodes[i], nodes[j] = nodes[j], nodes[i] 331 } 332 } 333 334 // tasksUpdated does a diff between task groups to see if the 335 // tasks, their drivers, environment variables or config have updated. The 336 // inputs are the task group name to diff and two jobs to diff. 337 func tasksUpdated(jobA, jobB *structs.Job, taskGroup string) bool { 338 a := jobA.LookupTaskGroup(taskGroup) 339 b := jobB.LookupTaskGroup(taskGroup) 340 341 // If the number of tasks do not match, clearly there is an update 342 if len(a.Tasks) != len(b.Tasks) { 343 return true 344 } 345 346 // Check ephemeral disk 347 if !reflect.DeepEqual(a.EphemeralDisk, b.EphemeralDisk) { 348 return true 349 } 350 351 // Check each task 352 for _, at := range a.Tasks { 353 bt := b.LookupTask(at.Name) 354 if bt == nil { 355 return true 356 } 357 if at.Driver != bt.Driver { 358 return true 359 } 360 if at.User != bt.User { 361 return true 362 } 363 if !reflect.DeepEqual(at.Config, bt.Config) { 364 return true 365 } 366 if !reflect.DeepEqual(at.Env, bt.Env) { 367 return true 368 } 369 if !reflect.DeepEqual(at.Artifacts, bt.Artifacts) { 370 return true 371 } 372 if !reflect.DeepEqual(at.Vault, bt.Vault) { 373 return true 374 } 375 if !reflect.DeepEqual(at.Templates, bt.Templates) { 376 return true 377 } 378 379 // Check the metadata 380 if !reflect.DeepEqual( 381 jobA.CombinedTaskMeta(taskGroup, at.Name), 382 jobB.CombinedTaskMeta(taskGroup, bt.Name)) { 383 return true 384 } 385 386 // Inspect the network to see if the dynamic ports are different 387 if len(at.Resources.Networks) != len(bt.Resources.Networks) { 388 return true 389 } 390 for idx := range at.Resources.Networks { 391 an := at.Resources.Networks[idx] 392 bn := bt.Resources.Networks[idx] 393 394 if an.MBits != bn.MBits { 395 return true 396 } 397 398 aPorts, bPorts := networkPortMap(an), networkPortMap(bn) 399 if !reflect.DeepEqual(aPorts, bPorts) { 400 return true 401 } 402 } 403 404 // Inspect the non-network resources 405 if ar, br := at.Resources, bt.Resources; ar.CPU != br.CPU { 406 return true 407 } else if ar.MemoryMB != br.MemoryMB { 408 return true 409 } else if ar.IOPS != br.IOPS { 410 return true 411 } 412 } 413 return false 414 } 415 416 // networkPortMap takes a network resource and returns a map of port labels to 417 // values. The value for dynamic ports is disregarded even if it is set. This 418 // makes this function suitable for comparing two network resources for changes. 419 func networkPortMap(n *structs.NetworkResource) map[string]int { 420 m := make(map[string]int, len(n.DynamicPorts)+len(n.ReservedPorts)) 421 for _, p := range n.ReservedPorts { 422 m[p.Label] = p.Value 423 } 424 for _, p := range n.DynamicPorts { 425 m[p.Label] = -1 426 } 427 return m 428 } 429 430 // setStatus is used to update the status of the evaluation 431 func setStatus(logger *log.Logger, planner Planner, 432 eval, nextEval, spawnedBlocked *structs.Evaluation, 433 tgMetrics map[string]*structs.AllocMetric, status, desc string, 434 queuedAllocs map[string]int, deploymentID string) error { 435 436 logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status) 437 newEval := eval.Copy() 438 newEval.Status = status 439 newEval.StatusDescription = desc 440 newEval.DeploymentID = deploymentID 441 newEval.FailedTGAllocs = tgMetrics 442 if nextEval != nil { 443 newEval.NextEval = nextEval.ID 444 } 445 if spawnedBlocked != nil { 446 newEval.BlockedEval = spawnedBlocked.ID 447 } 448 if queuedAllocs != nil { 449 newEval.QueuedAllocations = queuedAllocs 450 } 451 452 return planner.UpdateEval(newEval) 453 } 454 455 // inplaceUpdate attempts to update allocations in-place where possible. It 456 // returns the allocs that couldn't be done inplace and then those that could. 457 func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job, 458 stack Stack, updates []allocTuple) (destructive, inplace []allocTuple) { 459 460 // doInplace manipulates the updates map to make the current allocation 461 // an inplace update. 462 doInplace := func(cur, last, inplaceCount *int) { 463 updates[*cur], updates[*last-1] = updates[*last-1], updates[*cur] 464 *cur-- 465 *last-- 466 *inplaceCount++ 467 } 468 469 ws := memdb.NewWatchSet() 470 n := len(updates) 471 inplaceCount := 0 472 for i := 0; i < n; i++ { 473 // Get the update 474 update := updates[i] 475 476 // Check if the task drivers or config has changed, requires 477 // a rolling upgrade since that cannot be done in-place. 478 existing := update.Alloc.Job 479 if tasksUpdated(job, existing, update.TaskGroup.Name) { 480 continue 481 } 482 483 // Terminal batch allocations are not filtered when they are completed 484 // successfully. We should avoid adding the allocation to the plan in 485 // the case that it is an in-place update to avoid both additional data 486 // in the plan and work for the clients. 487 if update.Alloc.TerminalStatus() { 488 doInplace(&i, &n, &inplaceCount) 489 continue 490 } 491 492 // Get the existing node 493 node, err := ctx.State().NodeByID(ws, update.Alloc.NodeID) 494 if err != nil { 495 ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v", 496 eval, update.Alloc.NodeID, err) 497 continue 498 } 499 if node == nil { 500 continue 501 } 502 503 // Set the existing node as the base set 504 stack.SetNodes([]*structs.Node{node}) 505 506 // Stage an eviction of the current allocation. This is done so that 507 // the current allocation is discounted when checking for feasability. 508 // Otherwise we would be trying to fit the tasks current resources and 509 // updated resources. After select is called we can remove the evict. 510 ctx.Plan().AppendUpdate(update.Alloc, structs.AllocDesiredStatusStop, 511 allocInPlace, "") 512 513 // Attempt to match the task group 514 option, _ := stack.Select(update.TaskGroup) 515 516 // Pop the allocation 517 ctx.Plan().PopUpdate(update.Alloc) 518 519 // Skip if we could not do an in-place update 520 if option == nil { 521 continue 522 } 523 524 // Restore the network offers from the existing allocation. 525 // We do not allow network resources (reserved/dynamic ports) 526 // to be updated. This is guarded in taskUpdated, so we can 527 // safely restore those here. 528 for task, resources := range option.TaskResources { 529 existing := update.Alloc.TaskResources[task] 530 resources.Networks = existing.Networks 531 } 532 533 // Create a shallow copy 534 newAlloc := new(structs.Allocation) 535 *newAlloc = *update.Alloc 536 537 // Update the allocation 538 newAlloc.EvalID = eval.ID 539 newAlloc.Job = nil // Use the Job in the Plan 540 newAlloc.Resources = nil // Computed in Plan Apply 541 newAlloc.TaskResources = option.TaskResources 542 newAlloc.Metrics = ctx.Metrics() 543 ctx.Plan().AppendAlloc(newAlloc) 544 545 // Remove this allocation from the slice 546 doInplace(&i, &n, &inplaceCount) 547 } 548 549 if len(updates) > 0 { 550 ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplaceCount, len(updates)) 551 } 552 return updates[:n], updates[n:] 553 } 554 555 // evictAndPlace is used to mark allocations for evicts and add them to the 556 // placement queue. evictAndPlace modifies both the diffResult and the 557 // limit. It returns true if the limit has been reached. 558 func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool { 559 n := len(allocs) 560 for i := 0; i < n && i < *limit; i++ { 561 a := allocs[i] 562 ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc, "") 563 diff.place = append(diff.place, a) 564 } 565 if n <= *limit { 566 *limit -= n 567 return false 568 } 569 *limit = 0 570 return true 571 } 572 573 // markLostAndPlace is used to mark allocations as lost and add them to the 574 // placement queue. evictAndPlace modifies both the diffResult and the 575 // limit. It returns true if the limit has been reached. 576 func markLostAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool { 577 n := len(allocs) 578 for i := 0; i < n && i < *limit; i++ { 579 a := allocs[i] 580 ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc, structs.AllocClientStatusLost) 581 diff.place = append(diff.place, a) 582 } 583 if n <= *limit { 584 *limit -= n 585 return false 586 } 587 *limit = 0 588 return true 589 } 590 591 // tgConstrainTuple is used to store the total constraints of a task group. 592 type tgConstrainTuple struct { 593 // Holds the combined constraints of the task group and all it's sub-tasks. 594 constraints []*structs.Constraint 595 596 // The set of required drivers within the task group. 597 drivers map[string]struct{} 598 599 // The combined resources of all tasks within the task group. 600 size *structs.Resources 601 } 602 603 // taskGroupConstraints collects the constraints, drivers and resources required by each 604 // sub-task to aggregate the TaskGroup totals 605 func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple { 606 c := tgConstrainTuple{ 607 constraints: make([]*structs.Constraint, 0, len(tg.Constraints)), 608 drivers: make(map[string]struct{}), 609 size: &structs.Resources{DiskMB: tg.EphemeralDisk.SizeMB}, 610 } 611 612 c.constraints = append(c.constraints, tg.Constraints...) 613 for _, task := range tg.Tasks { 614 c.drivers[task.Driver] = struct{}{} 615 c.constraints = append(c.constraints, task.Constraints...) 616 c.size.Add(task.Resources) 617 } 618 619 return c 620 } 621 622 // desiredUpdates takes the diffResult as well as the set of inplace and 623 // destructive updates and returns a map of task groups to their set of desired 624 // updates. 625 func desiredUpdates(diff *diffResult, inplaceUpdates, 626 destructiveUpdates []allocTuple) map[string]*structs.DesiredUpdates { 627 desiredTgs := make(map[string]*structs.DesiredUpdates) 628 629 for _, tuple := range diff.place { 630 name := tuple.TaskGroup.Name 631 des, ok := desiredTgs[name] 632 if !ok { 633 des = &structs.DesiredUpdates{} 634 desiredTgs[name] = des 635 } 636 637 des.Place++ 638 } 639 640 for _, tuple := range diff.stop { 641 name := tuple.Alloc.TaskGroup 642 des, ok := desiredTgs[name] 643 if !ok { 644 des = &structs.DesiredUpdates{} 645 desiredTgs[name] = des 646 } 647 648 des.Stop++ 649 } 650 651 for _, tuple := range diff.ignore { 652 name := tuple.TaskGroup.Name 653 des, ok := desiredTgs[name] 654 if !ok { 655 des = &structs.DesiredUpdates{} 656 desiredTgs[name] = des 657 } 658 659 des.Ignore++ 660 } 661 662 for _, tuple := range diff.migrate { 663 name := tuple.TaskGroup.Name 664 des, ok := desiredTgs[name] 665 if !ok { 666 des = &structs.DesiredUpdates{} 667 desiredTgs[name] = des 668 } 669 670 des.Migrate++ 671 } 672 673 for _, tuple := range inplaceUpdates { 674 name := tuple.TaskGroup.Name 675 des, ok := desiredTgs[name] 676 if !ok { 677 des = &structs.DesiredUpdates{} 678 desiredTgs[name] = des 679 } 680 681 des.InPlaceUpdate++ 682 } 683 684 for _, tuple := range destructiveUpdates { 685 name := tuple.TaskGroup.Name 686 des, ok := desiredTgs[name] 687 if !ok { 688 des = &structs.DesiredUpdates{} 689 desiredTgs[name] = des 690 } 691 692 des.DestructiveUpdate++ 693 } 694 695 return desiredTgs 696 } 697 698 // adjustQueuedAllocations decrements the number of allocations pending per task 699 // group based on the number of allocations successfully placed 700 func adjustQueuedAllocations(logger *log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) { 701 if result == nil { 702 return 703 } 704 705 for _, allocations := range result.NodeAllocation { 706 for _, allocation := range allocations { 707 // Ensure that the allocation is newly created. We check that 708 // the CreateIndex is equal to the ModifyIndex in order to check 709 // that the allocation was just created. We do not check that 710 // the CreateIndex is equal to the results AllocIndex because 711 // the allocations we get back have gone through the planner's 712 // optimistic snapshot and thus their indexes may not be 713 // correct, but they will be consistent. 714 if allocation.CreateIndex != allocation.ModifyIndex { 715 continue 716 } 717 718 if _, ok := queuedAllocs[allocation.TaskGroup]; ok { 719 queuedAllocs[allocation.TaskGroup] -= 1 720 } else { 721 logger.Printf("[ERR] sched: allocation %q placed but not in list of unplaced allocations", allocation.TaskGroup) 722 } 723 } 724 } 725 } 726 727 // updateNonTerminalAllocsToLost updates the allocations which are in pending/running state on tainted node 728 // to lost 729 func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*structs.Node, allocs []*structs.Allocation) { 730 for _, alloc := range allocs { 731 if _, ok := tainted[alloc.NodeID]; ok && 732 alloc.DesiredStatus == structs.AllocDesiredStatusStop && 733 (alloc.ClientStatus == structs.AllocClientStatusRunning || 734 alloc.ClientStatus == structs.AllocClientStatusPending) { 735 plan.AppendUpdate(alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost) 736 } 737 } 738 } 739 740 // genericAllocUpdateFn is a factory for the scheduler to create an allocUpdateType 741 // function to be passed into the reconciler. The factory takes objects that 742 // exist only in the scheduler context and returns a function that can be used 743 // by the reconciler to make decsions about how to update an allocation. The 744 // factory allows the reconciler to be unaware of how to determine the type of 745 // update necessary and can minimize the set of objects it is exposed to. 746 func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateType { 747 return func(existing *structs.Allocation, newJob *structs.Job, newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) { 748 // Same index, so nothing to do 749 if existing.Job.JobModifyIndex == newJob.JobModifyIndex { 750 return true, false, nil 751 } 752 753 // Check if the task drivers or config has changed, requires 754 // a destructive upgrade since that cannot be done in-place. 755 if tasksUpdated(newJob, existing.Job, newTG.Name) { 756 return false, true, nil 757 } 758 759 // Terminal batch allocations are not filtered when they are completed 760 // successfully. We should avoid adding the allocation to the plan in 761 // the case that it is an in-place update to avoid both additional data 762 // in the plan and work for the clients. 763 if existing.TerminalStatus() { 764 return true, false, nil 765 } 766 767 // Get the existing node 768 ws := memdb.NewWatchSet() 769 node, err := ctx.State().NodeByID(ws, existing.NodeID) 770 if err != nil { 771 ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v", evalID, existing.NodeID, err) 772 return true, false, nil 773 } 774 if node == nil { 775 return false, true, nil 776 } 777 778 // Set the existing node as the base set 779 stack.SetNodes([]*structs.Node{node}) 780 781 // Stage an eviction of the current allocation. This is done so that 782 // the current allocation is discounted when checking for feasability. 783 // Otherwise we would be trying to fit the tasks current resources and 784 // updated resources. After select is called we can remove the evict. 785 ctx.Plan().AppendUpdate(existing, structs.AllocDesiredStatusStop, allocInPlace, "") 786 787 // Attempt to match the task group 788 option, _ := stack.Select(newTG) 789 790 // Pop the allocation 791 ctx.Plan().PopUpdate(existing) 792 793 // Require destructive if we could not do an in-place update 794 if option == nil { 795 return false, true, nil 796 } 797 798 // Restore the network offers from the existing allocation. 799 // We do not allow network resources (reserved/dynamic ports) 800 // to be updated. This is guarded in taskUpdated, so we can 801 // safely restore those here. 802 for task, resources := range option.TaskResources { 803 existingResources := existing.TaskResources[task] 804 resources.Networks = existingResources.Networks 805 } 806 807 // Create a shallow copy 808 newAlloc := new(structs.Allocation) 809 *newAlloc = *existing 810 811 // Update the allocation 812 newAlloc.EvalID = evalID 813 newAlloc.Job = nil // Use the Job in the Plan 814 newAlloc.Resources = nil // Computed in Plan Apply 815 newAlloc.TaskResources = option.TaskResources 816 newAlloc.Metrics = ctx.Metrics() 817 return false, false, newAlloc 818 } 819 }