github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/scheduler/util.go

package scheduler

import (
	"fmt"
	"log"
	"math/rand"
	"reflect"

	"github.com/hashicorp/nomad/nomad/structs"
)

// allocTuple is a tuple of the allocation name and potential alloc ID
type allocTuple struct {
	Name      string
	TaskGroup *structs.TaskGroup
	Alloc     *structs.Allocation
}

// materializeTaskGroups is used to materialize all the task groups
// a job requires. This is used to do the count expansion.
func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
	out := make(map[string]*structs.TaskGroup)
	if job == nil {
		return out
	}

	for _, tg := range job.TaskGroups {
		for i := 0; i < tg.Count; i++ {
			name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
			out[name] = tg
		}
	}
	return out
}
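// Illustrative sketch (not part of the original source): for a job named
// "example" with a single task group "cache" whose Count is 2, the expansion
// keys each materialized instance by "<job>.<group>[<index>]":
//
//	tg := &structs.TaskGroup{Name: "cache", Count: 2}
//	job := &structs.Job{Name: "example", TaskGroups: []*structs.TaskGroup{tg}}
//	groups := materializeTaskGroups(job)
//	// groups now holds "example.cache[0]" and "example.cache[1]", both mapping to tg.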
// diffResult is used to return the sets that result from the diff
type diffResult struct {
	place, update, migrate, stop, ignore []allocTuple
}

func (d *diffResult) GoString() string {
	return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d)",
		len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore))
}

func (d *diffResult) Append(other *diffResult) {
	d.place = append(d.place, other.place...)
	d.update = append(d.update, other.update...)
	d.migrate = append(d.migrate, other.migrate...)
	d.stop = append(d.stop, other.stop...)
	d.ignore = append(d.ignore, other.ignore...)
}

// diffAllocs is used to do a set difference between the target allocations
// and the existing allocations. This returns 5 sets of results: the list of
// named task groups that need to be placed (no existing allocation), the
// allocations that need to be updated (job definition is newer), allocs that
// need to be migrated (node is draining), the allocs that need to be evicted
// (no longer required), and those that should be ignored.
func diffAllocs(job *structs.Job, taintedNodes map[string]bool,
	required map[string]*structs.TaskGroup, allocs []*structs.Allocation) *diffResult {
	result := &diffResult{}

	// Scan the existing updates
	existing := make(map[string]struct{})
	for _, exist := range allocs {
		// Index the existing node
		name := exist.Name
		existing[name] = struct{}{}

		// Check for the definition in the required set
		tg, ok := required[name]

		// If not required, we stop the alloc
		if !ok {
			result.stop = append(result.stop, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If we are on a tainted node, we must migrate if we are a service or
		// if the batch allocation did not finish
		if taintedNodes[exist.NodeID] {
			// If the job is batch and finished successfully, the fact that the
			// node is tainted does not mean it should be migrated as the work
			// was already successfully finished. However, for service/system
			// jobs, tasks should never complete. The check of the batch type
			// defends against client bugs.
			if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() {
				goto IGNORE
			}
			result.migrate = append(result.migrate, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If the definition is updated we need to update
		if job.JobModifyIndex != exist.Job.JobModifyIndex {
			result.update = append(result.update, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// Everything is up-to-date
	IGNORE:
		result.ignore = append(result.ignore, allocTuple{
			Name:      name,
			TaskGroup: tg,
			Alloc:     exist,
		})
	}

	// Scan the required groups
	for name, tg := range required {
		// Check for an existing allocation
		_, ok := existing[name]

		// Require a placement if no existing allocation. If there
		// is an existing allocation, we would have checked for a potential
		// update or ignore above.
		if !ok {
			result.place = append(result.place, allocTuple{
				Name:      name,
				TaskGroup: tg,
			})
		}
	}
	return result
}
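// Illustrative sketch (not part of the original source), assuming job, state
// and allocs are in scope: the required set comes from materializeTaskGroups,
// the tainted set from taintedNodes, and diffAllocs partitions the work:
//
//	required := materializeTaskGroups(job)
//	tainted, err := taintedNodes(state, allocs)
//	if err == nil {
//		diff := diffAllocs(job, tainted, required, allocs)
//		// diff.place: required names with no existing allocation
//		// diff.stop: existing allocations whose name is no longer required
//		// diff.update / diff.ignore: existing allocations with a newer / unchanged job definition
//		// diff.migrate: allocations sitting on tainted nodes
//	}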
// diffSystemAllocs is like diffAllocs; however, the allocations in the
// diffResult contain the specific nodeID they should be allocated on.
func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]bool,
	allocs []*structs.Allocation) *diffResult {

	// Build a mapping of nodes to all their allocs.
	nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
	for _, alloc := range allocs {
		nallocs := append(nodeAllocs[alloc.NodeID], alloc)
		nodeAllocs[alloc.NodeID] = nallocs
	}

	for _, node := range nodes {
		if _, ok := nodeAllocs[node.ID]; !ok {
			nodeAllocs[node.ID] = nil
		}
	}

	// Create the required task groups.
	required := materializeTaskGroups(job)

	result := &diffResult{}
	for nodeID, allocs := range nodeAllocs {
		diff := diffAllocs(job, taintedNodes, required, allocs)

		// Mark the alloc as being for a specific node.
		for i := range diff.place {
			alloc := &diff.place[i]
			alloc.Alloc = &structs.Allocation{NodeID: nodeID}
		}

		// Migrate does not apply to system jobs and instead should be marked as
		// stop because if a node is tainted, the job is invalid on that node.
		diff.stop = append(diff.stop, diff.migrate...)
		diff.migrate = nil

		result.Append(diff)
	}

	return result
}

// readyNodesInDCs returns all the ready nodes in the given datacenters and a
// mapping of each datacenter to the count of ready nodes.
func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) {
	// Index the DCs
	dcMap := make(map[string]int, len(dcs))
	for _, dc := range dcs {
		dcMap[dc] = 0
	}

	// Scan the nodes
	var out []*structs.Node
	iter, err := state.Nodes()
	if err != nil {
		return nil, nil, err
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		// Filter on datacenter and status
		node := raw.(*structs.Node)
		if node.Status != structs.NodeStatusReady {
			continue
		}
		if node.Drain {
			continue
		}
		if _, ok := dcMap[node.Datacenter]; !ok {
			continue
		}
		out = append(out, node)
		dcMap[node.Datacenter] += 1
	}
	return out, dcMap, nil
}

// retryMax is used to retry a callback until it returns success or a maximum
// number of attempts is reached. An optional reset function may be passed,
// which is called after each failed iteration. If the reset function is set
// and returns true, the number of remaining attempts is reset back to max.
func retryMax(max int, cb func() (bool, error), reset func() bool) error {
	attempts := 0
	for attempts < max {
		done, err := cb()
		if err != nil {
			return err
		}
		if done {
			return nil
		}

		// Check if we should reset the number of attempts
		if reset != nil && reset() {
			attempts = 0
		} else {
			attempts += 1
		}
	}
	return &SetStatusError{
		Err:        fmt.Errorf("maximum attempts reached (%d)", max),
		EvalStatus: structs.EvalStatusFailed,
	}
}
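// Illustrative sketch (not part of the original source): retrying a submission
// up to five times and resetting the attempt counter whenever the previous plan
// result made progress. submitPlan is a hypothetical helper standing in for the
// scheduler's actual plan submission.
//
//	var last *structs.PlanResult
//	err := retryMax(5, func() (bool, error) {
//		result, done, err := submitPlan() // hypothetical helper
//		last = result
//		return done, err
//	}, func() bool { return progressMade(last) })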
// progressMade checks to see if the plan result made allocations or updates.
// If the result is nil, false is returned.
func progressMade(result *structs.PlanResult) bool {
	return result != nil && (len(result.NodeUpdate) != 0 ||
		len(result.NodeAllocation) != 0)
}

// taintedNodes is used to scan the allocations and then check if the
// underlying nodes are tainted, and should force a migration of the allocation.
func taintedNodes(state State, allocs []*structs.Allocation) (map[string]bool, error) {
	out := make(map[string]bool)
	for _, alloc := range allocs {
		if _, ok := out[alloc.NodeID]; ok {
			continue
		}

		node, err := state.NodeByID(alloc.NodeID)
		if err != nil {
			return nil, err
		}

		// If the node does not exist, we should migrate
		if node == nil {
			out[alloc.NodeID] = true
			continue
		}

		out[alloc.NodeID] = structs.ShouldDrainNode(node.Status) || node.Drain
	}
	return out, nil
}

// shuffleNodes randomizes the slice order with the Fisher-Yates algorithm
func shuffleNodes(nodes []*structs.Node) {
	n := len(nodes)
	for i := n - 1; i > 0; i-- {
		j := rand.Intn(i + 1)
		nodes[i], nodes[j] = nodes[j], nodes[i]
	}
}

// tasksUpdated does a diff between task groups to see if the tasks, their
// drivers, environment variables or config have been updated.
func tasksUpdated(a, b *structs.TaskGroup) bool {
	// If the number of tasks does not match, clearly there is an update
	if len(a.Tasks) != len(b.Tasks) {
		return true
	}

	// Check each task
	for _, at := range a.Tasks {
		bt := b.LookupTask(at.Name)
		if bt == nil {
			return true
		}
		if at.Driver != bt.Driver {
			return true
		}
		if at.User != bt.User {
			return true
		}
		if !reflect.DeepEqual(at.Config, bt.Config) {
			return true
		}
		if !reflect.DeepEqual(at.Env, bt.Env) {
			return true
		}
		if !reflect.DeepEqual(at.Meta, bt.Meta) {
			return true
		}
		if !reflect.DeepEqual(at.Artifacts, bt.Artifacts) {
			return true
		}

		// Inspect the network to see if the dynamic ports are different
		if len(at.Resources.Networks) != len(bt.Resources.Networks) {
			return true
		}
		for idx := range at.Resources.Networks {
			an := at.Resources.Networks[idx]
			bn := bt.Resources.Networks[idx]

			if an.MBits != bn.MBits {
				return true
			}

			aPorts, bPorts := networkPortMap(an), networkPortMap(bn)
			if !reflect.DeepEqual(aPorts, bPorts) {
				return true
			}
		}

		// Inspect the non-network resources
		if ar, br := at.Resources, bt.Resources; ar.CPU != br.CPU {
			return true
		} else if ar.MemoryMB != br.MemoryMB {
			return true
		} else if ar.DiskMB != br.DiskMB {
			return true
		} else if ar.IOPS != br.IOPS {
			return true
		}
	}
	return false
}
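// Illustrative sketch (not part of the original source), assuming alloc and tg
// are in scope: inplaceUpdate below applies this check by comparing the desired
// task group against the group recorded on the existing allocation's job:
//
//	existing := alloc.Job.LookupTaskGroup(tg.Name)
//	if tasksUpdated(tg, existing) {
//		// drivers, config, env, artifacts or resources differ; the change
//		// cannot be applied in place and a new allocation must be created.
//	}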
// networkPortMap takes a network resource and returns a map of port labels to
// values. The value for dynamic ports is disregarded even if it is set. This
// makes this function suitable for comparing two network resources for changes.
func networkPortMap(n *structs.NetworkResource) map[string]int {
	m := make(map[string]int, len(n.DynamicPorts)+len(n.ReservedPorts))
	for _, p := range n.ReservedPorts {
		m[p.Label] = p.Value
	}
	for _, p := range n.DynamicPorts {
		m[p.Label] = -1
	}
	return m
}

// setStatus is used to update the status of the evaluation
func setStatus(logger *log.Logger, planner Planner, eval, nextEval, spawnedBlocked *structs.Evaluation, status, desc string) error {
	logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status)
	newEval := eval.Copy()
	newEval.Status = status
	newEval.StatusDescription = desc
	if nextEval != nil {
		newEval.NextEval = nextEval.ID
	}
	if spawnedBlocked != nil {
		newEval.BlockedEval = spawnedBlocked.ID
	}
	return planner.UpdateEval(newEval)
}

// inplaceUpdate attempts to update allocations in-place where possible. It
// returns the allocs that couldn't be updated in-place and then those that could.
func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
	stack Stack, updates []allocTuple) (destructive, inplace []allocTuple) {

	n := len(updates)
	inplaceCount := 0
	for i := 0; i < n; i++ {
		// Get the update
		update := updates[i]

		// Check if the task drivers or config have changed; if so, a rolling
		// upgrade is required since that cannot be done in-place.
		existing := update.Alloc.Job.LookupTaskGroup(update.TaskGroup.Name)
		if tasksUpdated(update.TaskGroup, existing) {
			continue
		}

		// Get the existing node
		node, err := ctx.State().NodeByID(update.Alloc.NodeID)
		if err != nil {
			ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v",
				eval, update.Alloc.NodeID, err)
			continue
		}
		if node == nil {
			continue
		}

		// Set the existing node as the base set
		stack.SetNodes([]*structs.Node{node})

		// Stage an eviction of the current allocation. This is done so that
		// the current allocation is discounted when checking for feasibility.
		// Otherwise we would be trying to fit the task's current resources and
		// updated resources. After select is called we can remove the evict.
		ctx.Plan().AppendUpdate(update.Alloc, structs.AllocDesiredStatusStop,
			allocInPlace)

		// Attempt to match the task group
		option, _ := stack.Select(update.TaskGroup)

		// Pop the allocation
		ctx.Plan().PopUpdate(update.Alloc)

		// Skip if we could not do an in-place update
		if option == nil {
			continue
		}

		// Restore the network offers from the existing allocation.
		// We do not allow network resources (reserved/dynamic ports)
		// to be updated. This is guarded in tasksUpdated, so we can
		// safely restore those here.
		for task, resources := range option.TaskResources {
			existing := update.Alloc.TaskResources[task]
			resources.Networks = existing.Networks
		}

		// Create a shallow copy
		newAlloc := new(structs.Allocation)
		*newAlloc = *update.Alloc

		// Update the allocation
		newAlloc.EvalID = eval.ID
		newAlloc.Job = nil       // Use the Job in the Plan
		newAlloc.Resources = nil // Computed in Plan Apply
		newAlloc.TaskResources = option.TaskResources
		newAlloc.Metrics = ctx.Metrics()
		newAlloc.DesiredStatus = structs.AllocDesiredStatusRun
		newAlloc.ClientStatus = structs.AllocClientStatusPending
		newAlloc.PopulateServiceIDs(update.TaskGroup)
		ctx.Plan().AppendAlloc(newAlloc)

		// Remove this allocation from the slice
		updates[i], updates[n-1] = updates[n-1], updates[i]
		i--
		n--
		inplaceCount++
	}
	if len(updates) > 0 {
		ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplaceCount, len(updates))
	}
	return updates[:n], updates[n:]
}

// evictAndPlace is used to mark allocations for eviction and add them to the
// placement queue. evictAndPlace modifies both the diffResult and the
// limit. It returns true if the limit has been reached.
func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
	n := len(allocs)
	for i := 0; i < n && i < *limit; i++ {
		a := allocs[i]
		ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc)
		diff.place = append(diff.place, a)
	}
	if n <= *limit {
		*limit -= n
		return false
	}
	*limit = 0
	return true
}
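// Illustrative sketch (not part of the original source), assuming ctx, diff and
// a slice of destructive updates are in scope: allocations are stopped and
// re-queued for placement until the rolling-update limit is exhausted.
//
//	limit := 2
//	if evictAndPlace(ctx, diff, destructive, "rolling update", &limit) {
//		// the limit was reached; the remaining updates are left for a
//		// follow-up evaluation.
//	}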
// tgConstrainTuple is used to store the total constraints of a task group.
type tgConstrainTuple struct {
	// Holds the combined constraints of the task group and all its sub-tasks.
	constraints []*structs.Constraint

	// The set of required drivers within the task group.
	drivers map[string]struct{}

	// The combined resources of all tasks within the task group.
	size *structs.Resources
}

// taskGroupConstraints collects the constraints, drivers and resources required
// by each sub-task to aggregate the TaskGroup totals.
func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple {
	c := tgConstrainTuple{
		constraints: make([]*structs.Constraint, 0, len(tg.Constraints)),
		drivers:     make(map[string]struct{}),
		size:        new(structs.Resources),
	}

	c.constraints = append(c.constraints, tg.Constraints...)
	for _, task := range tg.Tasks {
		c.drivers[task.Driver] = struct{}{}
		c.constraints = append(c.constraints, task.Constraints...)
		c.size.Add(task.Resources)
	}

	return c
}

// desiredUpdates takes the diffResult as well as the set of inplace and
// destructive updates and returns a map of task groups to their set of desired
// updates.
func desiredUpdates(diff *diffResult, inplaceUpdates,
	destructiveUpdates []allocTuple) map[string]*structs.DesiredUpdates {
	desiredTgs := make(map[string]*structs.DesiredUpdates)

	for _, tuple := range diff.place {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Place++
	}

	for _, tuple := range diff.stop {
		name := tuple.Alloc.TaskGroup
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Stop++
	}

	for _, tuple := range diff.ignore {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Ignore++
	}

	for _, tuple := range diff.migrate {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Migrate++
	}

	for _, tuple := range inplaceUpdates {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.InPlaceUpdate++
	}

	for _, tuple := range destructiveUpdates {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.DestructiveUpdate++
	}

	return desiredTgs
}
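// Illustrative sketch (not part of the original source), assuming ctx, eval,
// job, stack and diff are in scope: summarising a diff and the split returned
// by inplaceUpdate into per-task-group counts.
//
//	destructive, inplace := inplaceUpdate(ctx, eval, job, stack, diff.update)
//	desired := desiredUpdates(diff, inplace, destructive)
//	for tg, u := range desired {
//		// u.Place, u.Stop, u.Migrate, u.Ignore, u.InPlaceUpdate and
//		// u.DestructiveUpdate describe the planned work for task group tg.
//	}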