github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/scheduler/util.go

package scheduler

import (
	"fmt"
	"log"
	"math/rand"
	"reflect"

	"github.com/hashicorp/nomad/nomad/structs"
)

// allocTuple is a tuple of the allocation name, its task group and the
// existing allocation, if any.
type allocTuple struct {
	Name      string
	TaskGroup *structs.TaskGroup
	Alloc     *structs.Allocation
}

// materializeTaskGroups is used to materialize all the task groups
// a job requires. This is used to do the count expansion.
func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
	out := make(map[string]*structs.TaskGroup)
	if job == nil {
		return out
	}

	for _, tg := range job.TaskGroups {
		for i := 0; i < tg.Count; i++ {
			name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
			out[name] = tg
		}
	}
	return out
}

// diffResult is used to return the sets that result from the diff
type diffResult struct {
	place, update, migrate, stop, ignore, lost []allocTuple
}

func (d *diffResult) GoString() string {
	return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d) (lost %d)",
		len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore), len(d.lost))
}

func (d *diffResult) Append(other *diffResult) {
	d.place = append(d.place, other.place...)
	d.update = append(d.update, other.update...)
	d.migrate = append(d.migrate, other.migrate...)
	d.stop = append(d.stop, other.stop...)
	d.ignore = append(d.ignore, other.ignore...)
	d.lost = append(d.lost, other.lost...)
}
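// The following example function is an added sketch, not part of the original
// file: it shows the count expansion performed by materializeTaskGroups for a
// hypothetical job "web" with a single task group "frontend" and Count 3.
func exampleMaterializeTaskGroups() {
	job := &structs.Job{
		Name: "web",
		TaskGroups: []*structs.TaskGroup{
			{Name: "frontend", Count: 3},
		},
	}
	for name := range materializeTaskGroups(job) {
		// Prints web.frontend[0], web.frontend[1], web.frontend[2]
		// (map iteration order is random). Each key maps to the same
		// *structs.TaskGroup pointer; only the names differ.
		fmt.Println(name)
	}
}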
// diffAllocs is used to do a set difference between the target allocations
// and the existing allocations. This returns 6 sets of results: the list of
// named task groups that need to be placed (no existing allocation), the
// allocations that need to be updated (job definition is newer), allocs that
// need to be migrated (node is draining), the allocs that need to be evicted
// (no longer required), those that should be ignored, and those that are lost
// and need to be replaced (they were running on a node that is now lost).
//
// job is the job whose allocs are being diffed.
// taintedNodes is an index, keyed by node ID, of nodes that are either down
// or in drain mode.
// required is a set of allocations that must exist.
// allocs is a list of non-terminal allocations.
// terminalAllocs is an index of the latest terminal allocations by name.
func diffAllocs(job *structs.Job, taintedNodes map[string]*structs.Node,
	required map[string]*structs.TaskGroup, allocs []*structs.Allocation,
	terminalAllocs map[string]*structs.Allocation) *diffResult {
	result := &diffResult{}

	// Scan the existing allocations
	existing := make(map[string]struct{})
	for _, exist := range allocs {
		// Index the existing allocation by name
		name := exist.Name
		existing[name] = struct{}{}

		// Check for the definition in the required set
		tg, ok := required[name]

		// If not required, we stop the alloc
		if !ok {
			result.stop = append(result.stop, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If we are on a tainted node, we must migrate if we are a service or
		// if the batch allocation did not finish
		if node, ok := taintedNodes[exist.NodeID]; ok {
			// If the job is batch and finished successfully, the fact that the
			// node is tainted does not mean it should be migrated or marked as
			// lost, as the work was already successfully finished. However, for
			// service/system jobs, tasks should never complete. The check of the
			// batch type defends against client bugs.
			if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() {
				goto IGNORE
			}

			if node == nil || node.TerminalStatus() {
				result.lost = append(result.lost, allocTuple{
					Name:      name,
					TaskGroup: tg,
					Alloc:     exist,
				})
			} else {
				// This is the drain case
				result.migrate = append(result.migrate, allocTuple{
					Name:      name,
					TaskGroup: tg,
					Alloc:     exist,
				})
			}
			continue
		}

		// If the definition is updated we need to update
		if job.JobModifyIndex != exist.Job.JobModifyIndex {
			result.update = append(result.update, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// Everything is up-to-date
	IGNORE:
		result.ignore = append(result.ignore, allocTuple{
			Name:      name,
			TaskGroup: tg,
			Alloc:     exist,
		})
	}

	// Scan the required groups
	for name, tg := range required {
		// Check for an existing allocation
		_, ok := existing[name]

		// Require a placement if no existing allocation. If there
		// is an existing allocation, we would have checked for a potential
		// update or ignore above.
		if !ok {
			result.place = append(result.place, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     terminalAllocs[name],
			})
		}
	}
	return result
}
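// Added sketch (not in the original source): one plausible way a scheduler
// could consume the six buckets computed by diffAllocs. All arguments are
// hypothetical caller-owned values.
func exampleDiffAllocs(job *structs.Job, tainted map[string]*structs.Node,
	allocs []*structs.Allocation, terminal map[string]*structs.Allocation) {

	required := materializeTaskGroups(job)
	diff := diffAllocs(job, tainted, required, allocs, terminal)

	// place:   named groups with no existing allocation
	// update:  job definition is newer than the allocation's
	// migrate: allocation sits on a draining node
	// stop:    allocation is no longer required
	// ignore:  allocation is already up to date
	// lost:    allocation sits on a down or missing node
	fmt.Printf("%#v\n", diff) // renders via diffResult.GoString
}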
// diffSystemAllocs is like diffAllocs; however, the allocations in the
// diffResult contain the specific nodeID they should be allocated on.
//
// job is the job whose allocs are being diffed.
// nodes is a list of nodes in the ready state.
// taintedNodes is an index, keyed by node ID, of nodes that are either down
// or in drain mode.
// allocs is a list of non-terminal allocations.
// terminalAllocs is an index of the latest terminal allocations by name.
func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]*structs.Node,
	allocs []*structs.Allocation, terminalAllocs map[string]*structs.Allocation) *diffResult {

	// Build a mapping of nodes to all their allocs.
	nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
	for _, alloc := range allocs {
		nallocs := append(nodeAllocs[alloc.NodeID], alloc)
		nodeAllocs[alloc.NodeID] = nallocs
	}

	for _, node := range nodes {
		if _, ok := nodeAllocs[node.ID]; !ok {
			nodeAllocs[node.ID] = nil
		}
	}

	// Create the required task groups.
	required := materializeTaskGroups(job)

	result := &diffResult{}
	for nodeID, allocs := range nodeAllocs {
		diff := diffAllocs(job, taintedNodes, required, allocs, terminalAllocs)

		// If the node is tainted there should be no placements made
		if _, ok := taintedNodes[nodeID]; ok {
			diff.place = nil
		} else {
			// Mark the alloc as being for a specific node.
			for i := range diff.place {
				alloc := &diff.place[i]

				// If the new allocation isn't annotated with a previous allocation
				// or if the previous allocation isn't from the same node then we
				// annotate the allocTuple with a new Allocation
				if alloc.Alloc == nil || alloc.Alloc.NodeID != nodeID {
					alloc.Alloc = &structs.Allocation{NodeID: nodeID}
				}
			}
		}

		// Migrate does not apply to system jobs and instead should be marked as
		// stop because if a node is tainted, the job is invalid on that node.
		diff.stop = append(diff.stop, diff.migrate...)
		diff.migrate = nil

		result.Append(diff)
	}

	return result
}

// readyNodesInDCs returns all the ready nodes in the given datacenters and a
// mapping of each datacenter to the count of ready nodes.
func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) {
	// Index the DCs
	dcMap := make(map[string]int, len(dcs))
	for _, dc := range dcs {
		dcMap[dc] = 0
	}

	// Scan the nodes
	var out []*structs.Node
	iter, err := state.Nodes()
	if err != nil {
		return nil, nil, err
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		// Filter on datacenter and status
		node := raw.(*structs.Node)
		if node.Status != structs.NodeStatusReady {
			continue
		}
		if node.Drain {
			continue
		}
		if _, ok := dcMap[node.Datacenter]; !ok {
			continue
		}
		out = append(out, node)
		dcMap[node.Datacenter] += 1
	}
	return out, dcMap, nil
}

// retryMax is used to retry a callback until it returns success or
// a maximum number of attempts is reached. An optional reset function may be
// passed which is called after each failed iteration. If the reset function is
// set and returns true, the number of attempts is reset back to max.
func retryMax(max int, cb func() (bool, error), reset func() bool) error {
	attempts := 0
	for attempts < max {
		done, err := cb()
		if err != nil {
			return err
		}
		if done {
			return nil
		}

		// Check if we should reset the number of attempts
		if reset != nil && reset() {
			attempts = 0
		} else {
			attempts += 1
		}
	}
	return &SetStatusError{
		Err:        fmt.Errorf("maximum attempts reached (%d)", max),
		EvalStatus: structs.EvalStatusFailed,
	}
}
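// Added usage sketch (not in the original source): retryMax runs the callback
// until it reports done, fails, or the attempt budget is spent. Passing nil
// for reset keeps the budget fixed.
func exampleRetryMax() error {
	attempt := 0
	return retryMax(3, func() (bool, error) {
		attempt++
		// Succeed on the third try; retryMax returns nil as soon as the
		// callback reports (true, nil).
		return attempt == 3, nil
	}, nil)
}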
// progressMade checks to see if the plan result made allocations or updates.
// If the result is nil, false is returned.
func progressMade(result *structs.PlanResult) bool {
	return result != nil && (len(result.NodeUpdate) != 0 ||
		len(result.NodeAllocation) != 0)
}

// taintedNodes is used to scan the allocations and then check if the
// underlying nodes are tainted and should force a migration of the allocation.
// All the nodes returned in the map are tainted.
func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*structs.Node, error) {
	out := make(map[string]*structs.Node)
	for _, alloc := range allocs {
		if _, ok := out[alloc.NodeID]; ok {
			continue
		}

		node, err := state.NodeByID(alloc.NodeID)
		if err != nil {
			return nil, err
		}

		// If the node does not exist, we should migrate
		if node == nil {
			out[alloc.NodeID] = nil
			continue
		}
		if structs.ShouldDrainNode(node.Status) || node.Drain {
			out[alloc.NodeID] = node
		}
	}
	return out, nil
}

// shuffleNodes randomizes the slice order with the Fisher-Yates algorithm
func shuffleNodes(nodes []*structs.Node) {
	n := len(nodes)
	for i := n - 1; i > 0; i-- {
		j := rand.Intn(i + 1)
		nodes[i], nodes[j] = nodes[j], nodes[i]
	}
}

// tasksUpdated does a diff between task groups to see if the
// tasks, their drivers, environment variables or config have been updated.
func tasksUpdated(a, b *structs.TaskGroup) bool {
	// If the number of tasks does not match, clearly there is an update
	if len(a.Tasks) != len(b.Tasks) {
		return true
	}

	// Check ephemeral disk
	if !reflect.DeepEqual(a.EphemeralDisk, b.EphemeralDisk) {
		return true
	}

	// Check each task
	for _, at := range a.Tasks {
		bt := b.LookupTask(at.Name)
		if bt == nil {
			return true
		}
		if at.Driver != bt.Driver {
			return true
		}
		if at.User != bt.User {
			return true
		}
		if !reflect.DeepEqual(at.Config, bt.Config) {
			return true
		}
		if !reflect.DeepEqual(at.Env, bt.Env) {
			return true
		}
		if !reflect.DeepEqual(at.Meta, bt.Meta) {
			return true
		}
		if !reflect.DeepEqual(at.Artifacts, bt.Artifacts) {
			return true
		}
		if !reflect.DeepEqual(at.Vault, bt.Vault) {
			return true
		}
		if !reflect.DeepEqual(at.Templates, bt.Templates) {
			return true
		}

		// Inspect the network to see if the dynamic ports are different
		if len(at.Resources.Networks) != len(bt.Resources.Networks) {
			return true
		}
		for idx := range at.Resources.Networks {
			an := at.Resources.Networks[idx]
			bn := bt.Resources.Networks[idx]

			if an.MBits != bn.MBits {
				return true
			}

			aPorts, bPorts := networkPortMap(an), networkPortMap(bn)
			if !reflect.DeepEqual(aPorts, bPorts) {
				return true
			}
		}

		// Inspect the non-network resources
		if ar, br := at.Resources, bt.Resources; ar.CPU != br.CPU {
			return true
		} else if ar.MemoryMB != br.MemoryMB {
			return true
		} else if ar.IOPS != br.IOPS {
			return true
		}
	}
	return false
}
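// Added sketch (not in the original source): two otherwise identical task
// groups whose only difference is a task's environment are reported as
// updated, which rules out an in-place update in inplaceUpdate below.
func exampleTasksUpdated() {
	a := &structs.TaskGroup{Tasks: []*structs.Task{
		{Name: "server", Driver: "docker", Env: map[string]string{"PORT": "8080"}},
	}}
	b := &structs.TaskGroup{Tasks: []*structs.Task{
		{Name: "server", Driver: "docker", Env: map[string]string{"PORT": "9090"}},
	}}
	fmt.Println(tasksUpdated(a, b)) // true: Env differs
}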
// networkPortMap takes a network resource and returns a map of port labels to
// values. The value for dynamic ports is disregarded even if it is set. This
// makes this function suitable for comparing two network resources for changes.
func networkPortMap(n *structs.NetworkResource) map[string]int {
	m := make(map[string]int, len(n.DynamicPorts)+len(n.ReservedPorts))
	for _, p := range n.ReservedPorts {
		m[p.Label] = p.Value
	}
	for _, p := range n.DynamicPorts {
		m[p.Label] = -1
	}
	return m
}

// setStatus is used to update the status of the evaluation
func setStatus(logger *log.Logger, planner Planner,
	eval, nextEval, spawnedBlocked *structs.Evaluation,
	tgMetrics map[string]*structs.AllocMetric, status, desc string,
	queuedAllocs map[string]int) error {

	logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status)
	newEval := eval.Copy()
	newEval.Status = status
	newEval.StatusDescription = desc
	newEval.FailedTGAllocs = tgMetrics
	if nextEval != nil {
		newEval.NextEval = nextEval.ID
	}
	if spawnedBlocked != nil {
		newEval.BlockedEval = spawnedBlocked.ID
	}
	if queuedAllocs != nil {
		newEval.QueuedAllocations = queuedAllocs
	}

	return planner.UpdateEval(newEval)
}
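// Added sketch (not in the original source): a typical way a scheduler might
// finalize an evaluation through setStatus once placement has been attempted.
// planner, eval and queued are hypothetical caller-owned values, and the
// status constant is assumed from the structs package.
func exampleSetStatus(logger *log.Logger, planner Planner,
	eval *structs.Evaluation, queued map[string]int) error {
	// No follow-up or blocked eval to link, no failed task-group metrics.
	return setStatus(logger, planner, eval, nil, nil, nil,
		structs.EvalStatusComplete, "", queued)
}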
// inplaceUpdate attempts to update allocations in-place where possible. It
// returns the allocs that couldn't be updated in-place followed by those
// that could.
func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
	stack Stack, updates []allocTuple) (destructive, inplace []allocTuple) {

	n := len(updates)
	inplaceCount := 0
	for i := 0; i < n; i++ {
		// Get the update
		update := updates[i]

		// Check if the task drivers or config have changed; that requires
		// a rolling upgrade since it cannot be done in-place.
		existing := update.Alloc.Job.LookupTaskGroup(update.TaskGroup.Name)
		if tasksUpdated(update.TaskGroup, existing) {
			continue
		}

		// Get the existing node
		node, err := ctx.State().NodeByID(update.Alloc.NodeID)
		if err != nil {
			ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v",
				eval, update.Alloc.NodeID, err)
			continue
		}
		if node == nil {
			continue
		}

		// Set the existing node as the base set
		stack.SetNodes([]*structs.Node{node})

		// Stage an eviction of the current allocation. This is done so that
		// the current allocation is discounted when checking for feasibility.
		// Otherwise we would be trying to fit both the task's current resources
		// and its updated resources. After select is called we can remove the
		// evict.
		ctx.Plan().AppendUpdate(update.Alloc, structs.AllocDesiredStatusStop,
			allocInPlace, "")

		// Attempt to match the task group
		option, _ := stack.Select(update.TaskGroup)

		// Pop the allocation
		ctx.Plan().PopUpdate(update.Alloc)

		// Skip if we could not do an in-place update
		if option == nil {
			continue
		}

		// Restore the network offers from the existing allocation.
		// We do not allow network resources (reserved/dynamic ports)
		// to be updated. This is guarded in tasksUpdated, so we can
		// safely restore those here.
		for task, resources := range option.TaskResources {
			existing := update.Alloc.TaskResources[task]
			resources.Networks = existing.Networks
		}

		// Create a shallow copy
		newAlloc := new(structs.Allocation)
		*newAlloc = *update.Alloc

		// Update the allocation
		newAlloc.EvalID = eval.ID
		newAlloc.Job = nil       // Use the Job in the Plan
		newAlloc.Resources = nil // Computed in Plan Apply
		newAlloc.TaskResources = option.TaskResources
		newAlloc.Metrics = ctx.Metrics()
		ctx.Plan().AppendAlloc(newAlloc)

		// Remove this allocation from the slice
		updates[i], updates[n-1] = updates[n-1], updates[i]
		i--
		n--
		inplaceCount++
	}
	if len(updates) > 0 {
		ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplaceCount, len(updates))
	}
	return updates[:n], updates[n:]
}

// evictAndPlace is used to mark allocations for eviction and add them to the
// placement queue. evictAndPlace modifies both the diffResult and the
// limit. It returns true if the limit has been reached.
func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
	n := len(allocs)
	for i := 0; i < n && i < *limit; i++ {
		a := allocs[i]
		ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc, "")
		diff.place = append(diff.place, a)
	}
	if n <= *limit {
		*limit -= n
		return false
	}
	*limit = 0
	return true
}

// markLostAndPlace is used to mark allocations as lost and add them to the
// placement queue. markLostAndPlace modifies both the diffResult and the
// limit. It returns true if the limit has been reached.
func markLostAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
	n := len(allocs)
	for i := 0; i < n && i < *limit; i++ {
		a := allocs[i]
		ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc, structs.AllocClientStatusLost)
		diff.place = append(diff.place, a)
	}
	if n <= *limit {
		*limit -= n
		return false
	}
	*limit = 0
	return true
}

// tgConstrainTuple is used to store the total constraints of a task group.
type tgConstrainTuple struct {
	// Holds the combined constraints of the task group and all its sub-tasks.
	constraints []*structs.Constraint

	// The set of required drivers within the task group.
	drivers map[string]struct{}

	// The combined resources of all tasks within the task group.
	size *structs.Resources
}

// taskGroupConstraints collects the constraints, drivers and resources required
// by each sub-task to aggregate the TaskGroup totals
func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple {
	c := tgConstrainTuple{
		constraints: make([]*structs.Constraint, 0, len(tg.Constraints)),
		drivers:     make(map[string]struct{}),
		size:        &structs.Resources{DiskMB: tg.EphemeralDisk.SizeMB},
	}

	c.constraints = append(c.constraints, tg.Constraints...)
	for _, task := range tg.Tasks {
		c.drivers[task.Driver] = struct{}{}
		c.constraints = append(c.constraints, task.Constraints...)
		c.size.Add(task.Resources)
	}

	return c
}
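// Added sketch (not in the original source): taskGroupConstraints flattens the
// group's constraints with each task's, collects the required drivers, and
// sums task resources on top of the group's ephemeral disk size. The expected
// outputs in the comments assume structs.Resources.Add sums CPU and MemoryMB.
func exampleTaskGroupConstraints() {
	tg := &structs.TaskGroup{
		EphemeralDisk: &structs.EphemeralDisk{SizeMB: 150},
		Tasks: []*structs.Task{
			{Driver: "docker", Resources: &structs.Resources{CPU: 500, MemoryMB: 256}},
			{Driver: "exec", Resources: &structs.Resources{CPU: 250, MemoryMB: 128}},
		},
	}
	c := taskGroupConstraints(tg)
	fmt.Println(len(c.drivers))              // 2 (docker, exec)
	fmt.Println(c.size.CPU, c.size.MemoryMB) // 750 384
	fmt.Println(c.size.DiskMB)               // 150 (from the ephemeral disk)
}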
// desiredUpdates takes the diffResult as well as the set of inplace and
// destructive updates and returns a map of task groups to their set of
// desired updates.
func desiredUpdates(diff *diffResult, inplaceUpdates,
	destructiveUpdates []allocTuple) map[string]*structs.DesiredUpdates {
	desiredTgs := make(map[string]*structs.DesiredUpdates)

	for _, tuple := range diff.place {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Place++
	}

	for _, tuple := range diff.stop {
		name := tuple.Alloc.TaskGroup
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Stop++
	}

	for _, tuple := range diff.ignore {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Ignore++
	}

	for _, tuple := range diff.migrate {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Migrate++
	}

	for _, tuple := range inplaceUpdates {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.InPlaceUpdate++
	}

	for _, tuple := range destructiveUpdates {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.DestructiveUpdate++
	}

	return desiredTgs
}

// adjustQueuedAllocations decrements the number of allocations pending per task
// group based on the number of allocations successfully placed
func adjustQueuedAllocations(logger *log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) {
	if result != nil {
		for _, allocations := range result.NodeAllocation {
			for _, allocation := range allocations {
				// Ensure that the allocation is newly created
				if allocation.CreateIndex != result.AllocIndex {
					continue
				}

				if _, ok := queuedAllocs[allocation.TaskGroup]; ok {
					queuedAllocs[allocation.TaskGroup] -= 1
				} else {
					logger.Printf("[ERR] sched: allocation %q placed but not in list of unplaced allocations", allocation.TaskGroup)
				}
			}
		}
	}
}

// updateNonTerminalAllocsToLost updates allocations on tainted nodes that are
// still in the pending or running state but desired to be stopped, marking
// them as lost.
func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*structs.Node, allocs []*structs.Allocation) {
	for _, alloc := range allocs {
		if _, ok := tainted[alloc.NodeID]; ok &&
			alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
			(alloc.ClientStatus == structs.AllocClientStatusRunning ||
				alloc.ClientStatus == structs.AllocClientStatusPending) {
			plan.AppendUpdate(alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost)
		}
	}
}
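// Added sketch (not in the original source): evictAndPlace and markLostAndPlace
// share the same limit contract. With limit = 2 and more than two candidate
// allocations, only the first two are stopped and queued for placement, the
// limit drops to zero, and the function reports the limit as reached. The
// desc string here is a placeholder, not one of the package's constants.
func exampleEvictAndPlace(ctx Context, diff *diffResult, updates []allocTuple) bool {
	limit := 2
	// Returns true whenever len(updates) > 2; the caller would typically
	// handle the remainder in a later pass.
	return evictAndPlace(ctx, diff, updates, "example update", &limit)
}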