github.com/ranjib/nomad@v0.1.1-0.20160225204057-97751b02f70b/scheduler/util.go

package scheduler

import (
	"fmt"
	"log"
	"math/rand"
	"reflect"

	"github.com/hashicorp/nomad/nomad/structs"
)

// allocTuple is a tuple of the allocation name, its task group, and the
// existing allocation, if any.
type allocTuple struct {
	Name      string
	TaskGroup *structs.TaskGroup
	Alloc     *structs.Allocation
}

// materializeTaskGroups is used to materialize all the task groups
// a job requires. This is used to do the count expansion.
func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
	out := make(map[string]*structs.TaskGroup)
	if job == nil {
		return out
	}

	for _, tg := range job.TaskGroups {
		for i := 0; i < tg.Count; i++ {
			name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
			out[name] = tg
		}
	}
	return out
}
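
// For illustration, a hypothetical job named "web" with a task group
// "frontend" whose Count is 3 materializes to three map keys:
//
//	web.frontend[0], web.frontend[1], web.frontend[2]
//
// each mapping to the same *structs.TaskGroup.
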
// diffResult is used to return the sets that result from the diff
type diffResult struct {
	place, update, migrate, stop, ignore []allocTuple
}

func (d *diffResult) GoString() string {
	return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d)",
		len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore))
}

func (d *diffResult) Append(other *diffResult) {
	d.place = append(d.place, other.place...)
	d.update = append(d.update, other.update...)
	d.migrate = append(d.migrate, other.migrate...)
	d.stop = append(d.stop, other.stop...)
	d.ignore = append(d.ignore, other.ignore...)
}

// diffAllocs is used to do a set difference between the target allocations
// and the existing allocations. This returns five sets of results: the list of
// named task groups that need to be placed (no existing allocation), the
// allocations that need to be updated (job definition is newer), allocs that
// need to be migrated (node is draining), the allocs that need to be stopped
// (no longer required), and those that should be ignored.
func diffAllocs(job *structs.Job, taintedNodes map[string]bool,
	required map[string]*structs.TaskGroup, allocs []*structs.Allocation) *diffResult {
	result := &diffResult{}

	// Scan the existing allocations
	existing := make(map[string]struct{})
	for _, exist := range allocs {
		// Index the allocation by name
		name := exist.Name
		existing[name] = struct{}{}

		// Check for the definition in the required set
		tg, ok := required[name]

		// If not required, we stop the alloc
		if !ok {
			result.stop = append(result.stop, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If we are on a tainted node, we must migrate
		if taintedNodes[exist.NodeID] {
			result.migrate = append(result.migrate, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If the definition is updated we need to update
		if job.JobModifyIndex != exist.Job.JobModifyIndex {
			result.update = append(result.update, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// Everything is up-to-date
		result.ignore = append(result.ignore, allocTuple{
			Name:      name,
			TaskGroup: tg,
			Alloc:     exist,
		})
	}

	// Scan the required groups
	for name, tg := range required {
		// Check for an existing allocation
		_, ok := existing[name]

		// Require a placement if no existing allocation. If there
		// is an existing allocation, we would have checked for a potential
		// update or ignore above.
		if !ok {
			result.place = append(result.place, allocTuple{
				Name:      name,
				TaskGroup: tg,
			})
		}
	}
	return result
}
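
// To summarize the classification above (first match wins): an existing
// allocation whose name is not in the required set is stopped; one on a
// tainted node is migrated; one whose Job.JobModifyIndex differs from the
// job's is updated; anything else is ignored. Required names with no
// existing allocation are placed.
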
// diffSystemAllocs is like diffAllocs, except that the allocations in the
// diffResult contain the specific node ID they should be allocated on.
func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]bool,
	allocs []*structs.Allocation) *diffResult {

	// Build a mapping of nodes to all their allocs.
	nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
	for _, alloc := range allocs {
		nallocs := append(nodeAllocs[alloc.NodeID], alloc)
		nodeAllocs[alloc.NodeID] = nallocs
	}

	for _, node := range nodes {
		if _, ok := nodeAllocs[node.ID]; !ok {
			nodeAllocs[node.ID] = nil
		}
	}

	// Create the required task groups.
	required := materializeTaskGroups(job)

	result := &diffResult{}
	for nodeID, allocs := range nodeAllocs {
		diff := diffAllocs(job, taintedNodes, required, allocs)

		// Mark the alloc as being for a specific node.
		for i := range diff.place {
			alloc := &diff.place[i]
			alloc.Alloc = &structs.Allocation{NodeID: nodeID}
		}

		// Migrate does not apply to system jobs and instead should be marked as
		// stop because if a node is tainted, the job is invalid on that node.
		diff.stop = append(diff.stop, diff.migrate...)
		diff.migrate = nil

		result.Append(diff)
	}

	return result
}

// readyNodesInDCs returns all the ready nodes in the given datacenters and a
// mapping of each datacenter to the count of ready nodes.
func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) {
	// Index the DCs
	dcMap := make(map[string]int, len(dcs))
	for _, dc := range dcs {
		dcMap[dc] = 0
	}

	// Scan the nodes
	var out []*structs.Node
	iter, err := state.Nodes()
	if err != nil {
		return nil, nil, err
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		// Filter on datacenter and status
		node := raw.(*structs.Node)
		if node.Status != structs.NodeStatusReady {
			continue
		}
		if node.Drain {
			continue
		}
		if _, ok := dcMap[node.Datacenter]; !ok {
			continue
		}
		out = append(out, node)
		dcMap[node.Datacenter]++
	}
	return out, dcMap, nil
}

// retryMax is used to retry a callback until it returns success or
// a maximum number of attempts is reached. An optional reset function may be
// passed which is called after each failed iteration. If the reset function is
// set and returns true, the attempt counter is reset to zero, allowing up to
// max further attempts.
func retryMax(max int, cb func() (bool, error), reset func() bool) error {
	attempts := 0
	for attempts < max {
		done, err := cb()
		if err != nil {
			return err
		}
		if done {
			return nil
		}

		// Check if we should reset the number of attempts
		if reset != nil && reset() {
			attempts = 0
		} else {
			attempts++
		}
	}
	return &SetStatusError{
		Err:        fmt.Errorf("maximum attempts reached (%d)", max),
		EvalStatus: structs.EvalStatusFailed,
	}
}
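
// A minimal usage sketch of retryMax, with a hypothetical submitPlan helper
// that is not part of this package:
//
//	err := retryMax(5, func() (bool, error) {
//		return submitPlan() // done=true stops retrying; a non-nil error aborts
//	}, nil)
//
// Passing nil for reset means the attempt counter is never reset, so the
// callback runs at most five times here.
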
// progressMade checks to see if the plan result made allocations or updates.
// If the result is nil, false is returned.
func progressMade(result *structs.PlanResult) bool {
	return result != nil && (len(result.NodeUpdate) != 0 ||
		len(result.NodeAllocation) != 0)
}

// taintedNodes is used to scan the allocations and then check if the
// underlying nodes are tainted and should force a migration of the allocation.
func taintedNodes(state State, allocs []*structs.Allocation) (map[string]bool, error) {
	out := make(map[string]bool)
	for _, alloc := range allocs {
		if _, ok := out[alloc.NodeID]; ok {
			continue
		}

		node, err := state.NodeByID(alloc.NodeID)
		if err != nil {
			return nil, err
		}

		// If the node does not exist, we should migrate
		if node == nil {
			out[alloc.NodeID] = true
			continue
		}

		out[alloc.NodeID] = structs.ShouldDrainNode(node.Status) || node.Drain
	}
	return out, nil
}

// shuffleNodes randomizes the slice order with the Fisher-Yates algorithm
func shuffleNodes(nodes []*structs.Node) {
	n := len(nodes)
	for i := n - 1; i > 0; i-- {
		j := rand.Intn(i + 1)
		nodes[i], nodes[j] = nodes[j], nodes[i]
	}
}

// tasksUpdated does a diff between task groups to see if the
// tasks, their drivers, environment variables or config have been updated.
func tasksUpdated(a, b *structs.TaskGroup) bool {
	// If the number of tasks does not match, clearly there is an update
	if len(a.Tasks) != len(b.Tasks) {
		return true
	}

	// Check each task
	for _, at := range a.Tasks {
		bt := b.LookupTask(at.Name)
		if bt == nil {
			return true
		}
		if at.Driver != bt.Driver {
			return true
		}
		if !reflect.DeepEqual(at.Config, bt.Config) {
			return true
		}
		if !reflect.DeepEqual(at.Env, bt.Env) {
			return true
		}

		// Inspect the network to see if the dynamic ports are different
		if len(at.Resources.Networks) != len(bt.Resources.Networks) {
			return true
		}
		for idx := range at.Resources.Networks {
			an := at.Resources.Networks[idx]
			bn := bt.Resources.Networks[idx]
			if len(an.DynamicPorts) != len(bn.DynamicPorts) {
				return true
			}
		}
	}
	return false
}

// setStatus is used to update the status of the evaluation
func setStatus(logger *log.Logger, planner Planner, eval, nextEval *structs.Evaluation, status, desc string) error {
	logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status)
	newEval := eval.Copy()
	newEval.Status = status
	newEval.StatusDescription = desc
	if nextEval != nil {
		newEval.NextEval = nextEval.ID
	}
	return planner.UpdateEval(newEval)
}

// inplaceUpdate attempts to update allocations in-place where possible.
func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
	stack Stack, updates []allocTuple) []allocTuple {

	n := len(updates)
	inplace := 0
	for i := 0; i < n; i++ {
		// Get the update
		update := updates[i]

		// Check if the task drivers or config have changed; if so, this
		// requires a rolling upgrade since it cannot be done in-place.
		existing := update.Alloc.Job.LookupTaskGroup(update.TaskGroup.Name)
		if tasksUpdated(update.TaskGroup, existing) {
			continue
		}

		// Get the existing node
		node, err := ctx.State().NodeByID(update.Alloc.NodeID)
		if err != nil {
			ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v",
				eval, update.Alloc.NodeID, err)
			continue
		}
		if node == nil {
			continue
		}

		// Set the existing node as the base set
		stack.SetNodes([]*structs.Node{node})

		// Stage an eviction of the current allocation. This is done so that
		// the current allocation is discounted when checking for feasibility.
		// Otherwise we would be trying to fit the task's current resources
		// plus its updated resources. After Select is called we can remove
		// the evict.
		ctx.Plan().AppendUpdate(update.Alloc, structs.AllocDesiredStatusStop,
			allocInPlace)

		// Attempt to match the task group
		option, size := stack.Select(update.TaskGroup)

		// Pop the allocation
		ctx.Plan().PopUpdate(update.Alloc)

		// Skip if we could not do an in-place update
		if option == nil {
			continue
		}

		// Restore the network offers from the existing allocation.
		// We do not allow network resources (reserved/dynamic ports)
		// to be updated. This is guarded in tasksUpdated, so we can
		// safely restore those here.
		for task, resources := range option.TaskResources {
			existing := update.Alloc.TaskResources[task]
			resources.Networks = existing.Networks
		}

		// Create a shallow copy
		newAlloc := new(structs.Allocation)
		*newAlloc = *update.Alloc

		// Update the allocation
		newAlloc.EvalID = eval.ID
		newAlloc.Job = nil // Use the Job in the Plan
		newAlloc.Resources = size
		newAlloc.TaskResources = option.TaskResources
		newAlloc.Metrics = ctx.Metrics()
		newAlloc.DesiredStatus = structs.AllocDesiredStatusRun
		newAlloc.ClientStatus = structs.AllocClientStatusPending
		newAlloc.PopulateServiceIDs(update.TaskGroup)
		ctx.Plan().AppendAlloc(newAlloc)

		// Remove this allocation from the slice by swapping it with the
		// last live element and shrinking the live window.
		updates[i] = updates[n-1]
		i--
		n--
		inplace++
	}
	if len(updates) > 0 {
		ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplace, len(updates))
	}
	return updates[:n]
}
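
// The removal in the loop above is the swap-with-last filtering idiom; the
// returned updates[:n] holds only the allocations that still need a
// destructive (rolling) update. A minimal standalone sketch of the same
// pattern, with a hypothetical keep predicate:
//
//	n := len(items)
//	for i := 0; i < n; i++ {
//		if !keep(items[i]) {
//			items[i] = items[n-1]
//			i--
//			n--
//		}
//	}
//	items = items[:n]
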
// evictAndPlace is used to mark allocations for eviction and add them to the
// placement queue. evictAndPlace modifies both the diffResult and the
// limit. It returns true if the limit has been reached.
func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
	n := len(allocs)
	for i := 0; i < n && i < *limit; i++ {
		a := allocs[i]
		ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc)
		diff.place = append(diff.place, a)
	}
	if n <= *limit {
		*limit -= n
		return false
	}
	*limit = 0
	return true
}

// tgConstrainTuple is used to store the total constraints of a task group.
type tgConstrainTuple struct {
	// Holds the combined constraints of the task group and all its sub-tasks.
	constraints []*structs.Constraint

	// The set of required drivers within the task group.
	drivers map[string]struct{}

	// The combined resources of all tasks within the task group.
	size *structs.Resources
}

// taskGroupConstraints collects the constraints, drivers and resources
// required by each sub-task to aggregate the TaskGroup totals.
func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple {
	c := tgConstrainTuple{
		constraints: make([]*structs.Constraint, 0, len(tg.Constraints)),
		drivers:     make(map[string]struct{}),
		size:        new(structs.Resources),
	}

	c.constraints = append(c.constraints, tg.Constraints...)
	for _, task := range tg.Tasks {
		c.drivers[task.Driver] = struct{}{}
		c.constraints = append(c.constraints, task.Constraints...)
		c.size.Add(task.Resources)
	}

	return c
}

// initTaskState returns a map of task name to a TaskState initialized to the
// given state for every task in the group.
func initTaskState(tg *structs.TaskGroup, state string) map[string]*structs.TaskState {
	states := make(map[string]*structs.TaskState, len(tg.Tasks))
	for _, task := range tg.Tasks {
		states[task.Name] = &structs.TaskState{State: state}
	}
	return states
}
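
// A minimal usage sketch of initTaskState (task names hypothetical): for a
// task group tg containing tasks "web" and "sidecar",
//
//	states := initTaskState(tg, "pending")
//
// yields a map with keys "web" and "sidecar", each pointing at a
// *structs.TaskState whose State is "pending".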