github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/scheduler/util.go

package scheduler

import (
	"fmt"
	"log"
	"math/rand"
	"reflect"

	"github.com/hashicorp/nomad/nomad/structs"
)

// allocTuple is a tuple of the allocation name, its task group, and a
// potential existing allocation
type allocTuple struct {
	Name      string
	TaskGroup *structs.TaskGroup
	Alloc     *structs.Allocation
}

// materializeTaskGroups is used to materialize all the task groups
// a job requires. This is used to do the count expansion.
func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
	out := make(map[string]*structs.TaskGroup)
	if job == nil {
		return out
	}

	for _, tg := range job.TaskGroups {
		for i := 0; i < tg.Count; i++ {
			name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
			out[name] = tg
		}
	}
	return out
}

// diffResult is used to return the sets that result from the diff
type diffResult struct {
	place, update, migrate, stop, ignore []allocTuple
}

func (d *diffResult) GoString() string {
	return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d)",
		len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore))
}

func (d *diffResult) Append(other *diffResult) {
	d.place = append(d.place, other.place...)
	d.update = append(d.update, other.update...)
	d.migrate = append(d.migrate, other.migrate...)
	d.stop = append(d.stop, other.stop...)
	d.ignore = append(d.ignore, other.ignore...)
}

// diffAllocs is used to do a set difference between the target allocations
// and the existing allocations. This returns 5 sets of results: the list of
// named task groups that need to be placed (no existing allocation), the
// allocations that need to be updated (job definition is newer), allocs that
// need to be migrated (node is draining), the allocs that need to be evicted
// (no longer required), and those that should be ignored.
func diffAllocs(job *structs.Job, taintedNodes map[string]bool,
	required map[string]*structs.TaskGroup, allocs []*structs.Allocation) *diffResult {
	result := &diffResult{}

	// Scan the existing allocations
	existing := make(map[string]struct{})
	for _, exist := range allocs {
		// Index the existing allocation by name
		name := exist.Name
		existing[name] = struct{}{}

		// Check for the definition in the required set
		tg, ok := required[name]

		// If not required, we stop the alloc
		if !ok {
			result.stop = append(result.stop, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If we are on a tainted node, we must migrate
		if taintedNodes[exist.NodeID] {
			result.migrate = append(result.migrate, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If the definition is updated we need to update
		if job.JobModifyIndex != exist.Job.JobModifyIndex {
			result.update = append(result.update, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// Everything is up-to-date
		result.ignore = append(result.ignore, allocTuple{
			Name:      name,
			TaskGroup: tg,
			Alloc:     exist,
		})
	}

	// Scan the required groups
	for name, tg := range required {
		// Check for an existing allocation
		_, ok := existing[name]

		// Require a placement if no existing allocation. If there
		// is an existing allocation, we would have checked for a potential
		// update or ignore above.
		if !ok {
			result.place = append(result.place, allocTuple{
				Name:      name,
				TaskGroup: tg,
			})
		}
	}
	return result
}
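// exampleMaterializedNames is an illustrative sketch, not part of the original
// scheduler package, showing the naming convention produced by the count
// expansion above and consumed by diffAllocs: a hypothetical job "web" with a
// task group "frontend" and Count: 2 materializes the keys "web.frontend[0]"
// and "web.frontend[1]", which diffAllocs matches against the Name field of
// the existing allocations.
func exampleMaterializedNames() []string {
	job := &structs.Job{
		Name: "web",
		TaskGroups: []*structs.TaskGroup{
			{Name: "frontend", Count: 2},
		},
	}

	var names []string
	for name := range materializeTaskGroups(job) {
		names = append(names, name)
	}
	return names
}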
// diffSystemAllocs is like diffAllocs, except that the allocations in the
// diffResult contain the specific nodeID they should be allocated on.
func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]bool,
	allocs []*structs.Allocation) *diffResult {

	// Build a mapping of nodes to all their allocs.
	nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
	for _, alloc := range allocs {
		nallocs := append(nodeAllocs[alloc.NodeID], alloc)
		nodeAllocs[alloc.NodeID] = nallocs
	}

	for _, node := range nodes {
		if _, ok := nodeAllocs[node.ID]; !ok {
			nodeAllocs[node.ID] = nil
		}
	}

	// Create the required task groups.
	required := materializeTaskGroups(job)

	result := &diffResult{}
	for nodeID, allocs := range nodeAllocs {
		diff := diffAllocs(job, taintedNodes, required, allocs)

		// Mark the alloc as being for a specific node.
		for i := range diff.place {
			alloc := &diff.place[i]
			alloc.Alloc = &structs.Allocation{NodeID: nodeID}
		}

		// Migrate does not apply to system jobs and instead should be marked as
		// stop because if a node is tainted, the job is invalid on that node.
		diff.stop = append(diff.stop, diff.migrate...)
		diff.migrate = nil

		result.Append(diff)
	}

	return result
}

// readyNodesInDCs returns all the ready nodes in the given datacenters and a
// mapping of each datacenter to the count of ready nodes.
func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) {
	// Index the DCs
	dcMap := make(map[string]int, len(dcs))
	for _, dc := range dcs {
		dcMap[dc] = 0
	}

	// Scan the nodes
	var out []*structs.Node
	iter, err := state.Nodes()
	if err != nil {
		return nil, nil, err
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		// Filter on datacenter and status
		node := raw.(*structs.Node)
		if node.Status != structs.NodeStatusReady {
			continue
		}
		if node.Drain {
			continue
		}
		if _, ok := dcMap[node.Datacenter]; !ok {
			continue
		}
		out = append(out, node)
		dcMap[node.Datacenter] += 1
	}
	return out, dcMap, nil
}

// retryMax is used to retry a callback until it returns success or
// a maximum number of attempts is reached
func retryMax(max int, cb func() (bool, error)) error {
	attempts := 0
	for attempts < max {
		done, err := cb()
		if err != nil {
			return err
		}
		if done {
			return nil
		}
		attempts += 1
	}
	return &SetStatusError{
		Err:        fmt.Errorf("maximum attempts reached (%d)", max),
		EvalStatus: structs.EvalStatusFailed,
	}
}
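// exampleRetryMax is an illustrative sketch, not part of the original file, of
// how retryMax drives a callback: the callback reports (done, err), and
// retryMax returns the first error, returns nil once done is true, or gives up
// after max attempts and returns a SetStatusError carrying
// structs.EvalStatusFailed. The simulated work here is made up.
func exampleRetryMax() error {
	attempts := 0
	return retryMax(3, func() (bool, error) {
		attempts++
		// Pretend the work only succeeds on the second attempt.
		return attempts >= 2, nil
	})
}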
// taintedNodes is used to scan the allocations and check whether the
// underlying nodes are tainted and should force a migration of the allocation.
func taintedNodes(state State, allocs []*structs.Allocation) (map[string]bool, error) {
	out := make(map[string]bool)
	for _, alloc := range allocs {
		if _, ok := out[alloc.NodeID]; ok {
			continue
		}

		node, err := state.NodeByID(alloc.NodeID)
		if err != nil {
			return nil, err
		}

		// If the node does not exist, we should migrate
		if node == nil {
			out[alloc.NodeID] = true
			continue
		}

		out[alloc.NodeID] = structs.ShouldDrainNode(node.Status) || node.Drain
	}
	return out, nil
}

// shuffleNodes randomizes the slice order with the Fisher-Yates algorithm
func shuffleNodes(nodes []*structs.Node) {
	n := len(nodes)
	for i := n - 1; i > 0; i-- {
		j := rand.Intn(i + 1)
		nodes[i], nodes[j] = nodes[j], nodes[i]
	}
}

// tasksUpdated does a diff between task groups to see if the
// tasks, their drivers, environment variables or config have updated.
func tasksUpdated(a, b *structs.TaskGroup) bool {
	// If the number of tasks does not match, clearly there is an update
	if len(a.Tasks) != len(b.Tasks) {
		return true
	}

	// Check each task
	for _, at := range a.Tasks {
		bt := b.LookupTask(at.Name)
		if bt == nil {
			return true
		}
		if at.Driver != bt.Driver {
			return true
		}
		if !reflect.DeepEqual(at.Config, bt.Config) {
			return true
		}
		if !reflect.DeepEqual(at.Env, bt.Env) {
			return true
		}

		// Inspect the network to see if the dynamic ports are different
		if len(at.Resources.Networks) != len(bt.Resources.Networks) {
			return true
		}
		for idx := range at.Resources.Networks {
			an := at.Resources.Networks[idx]
			bn := bt.Resources.Networks[idx]
			if len(an.DynamicPorts) != len(bn.DynamicPorts) {
				return true
			}
		}
	}
	return false
}
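// exampleTasksUpdated is an illustrative sketch, not part of the original
// file, of what tasksUpdated treats as a change: two otherwise identical task
// groups whose single task differs only in its Driver. The group and task
// names here are made up.
func exampleTasksUpdated() bool {
	a := &structs.TaskGroup{
		Name: "cache",
		Tasks: []*structs.Task{
			{Name: "redis", Driver: "docker", Resources: &structs.Resources{}},
		},
	}
	b := &structs.TaskGroup{
		Name: "cache",
		Tasks: []*structs.Task{
			{Name: "redis", Driver: "exec", Resources: &structs.Resources{}},
		},
	}
	return tasksUpdated(a, b) // true: the drivers differ
}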
// setStatus is used to update the status of the evaluation
func setStatus(logger *log.Logger, planner Planner, eval, nextEval *structs.Evaluation, status, desc string) error {
	logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status)
	newEval := eval.Copy()
	newEval.Status = status
	newEval.StatusDescription = desc
	if nextEval != nil {
		newEval.NextEval = nextEval.ID
	}
	return planner.UpdateEval(newEval)
}

// inplaceUpdate attempts to update allocations in-place where possible.
func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
	stack Stack, updates []allocTuple) []allocTuple {

	n := len(updates)
	inplace := 0
	for i := 0; i < n; i++ {
		// Get the update
		update := updates[i]

		// Check if the task drivers or config have changed; that requires
		// a rolling upgrade since it cannot be done in-place.
		existing := update.Alloc.Job.LookupTaskGroup(update.TaskGroup.Name)
		if tasksUpdated(update.TaskGroup, existing) {
			continue
		}

		// Get the existing node
		node, err := ctx.State().NodeByID(update.Alloc.NodeID)
		if err != nil {
			ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v",
				eval, update.Alloc.NodeID, err)
			continue
		}
		if node == nil {
			continue
		}

		// Set the existing node as the base set
		stack.SetNodes([]*structs.Node{node})

		// Stage an eviction of the current allocation. This is done so that
		// the current allocation is discounted when checking for feasibility.
		// Otherwise we would be trying to fit both the task group's current
		// resources and its updated resources. After Select is called we can
		// remove the evict.
		ctx.Plan().AppendUpdate(update.Alloc, structs.AllocDesiredStatusStop,
			allocInPlace)

		// Attempt to match the task group
		option, size := stack.Select(update.TaskGroup)

		// Pop the allocation
		ctx.Plan().PopUpdate(update.Alloc)

		// Skip if we could not do an in-place update
		if option == nil {
			continue
		}

		// Restore the network offers from the existing allocation.
		// We do not allow network resources (reserved/dynamic ports)
		// to be updated. This is guarded in tasksUpdated, so we can
		// safely restore those here.
		for task, resources := range option.TaskResources {
			existing := update.Alloc.TaskResources[task]
			resources.Networks = existing.Networks
		}

		// Create a shallow copy
		newAlloc := new(structs.Allocation)
		*newAlloc = *update.Alloc

		// Update the allocation
		newAlloc.EvalID = eval.ID
		newAlloc.Job = job
		newAlloc.Resources = size
		newAlloc.TaskResources = option.TaskResources
		newAlloc.Metrics = ctx.Metrics()
		newAlloc.DesiredStatus = structs.AllocDesiredStatusRun
		newAlloc.ClientStatus = structs.AllocClientStatusPending
		newAlloc.PopulateServiceIDs()
		ctx.Plan().AppendAlloc(newAlloc)

		// Remove this allocation from the slice
		updates[i] = updates[n-1]
		i--
		n--
		inplace++
	}
	if len(updates) > 0 {
		ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplace, len(updates))
	}
	return updates[:n]
}

// evictAndPlace is used to mark allocations for evicts and add them to the
// placement queue. evictAndPlace modifies both the diffResult and the
// limit. It returns true if the limit has been reached.
func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
	n := len(allocs)
	for i := 0; i < n && i < *limit; i++ {
		a := allocs[i]
		ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc)
		diff.place = append(diff.place, a)
	}
	if n <= *limit {
		*limit -= n
		return false
	}
	*limit = 0
	return true
}
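// exampleEvictLimit is an illustrative sketch, not part of the original file,
// that mirrors only the limit bookkeeping of evictAndPlace: given n candidate
// allocations and a rolling-update limit, it reports how many may be evicted
// now and whether the limit was exhausted (signalling that the caller should
// handle the remainder in a follow-up evaluation). The real evictAndPlace
// additionally appends the stop updates to the plan and the evicted tuples to
// diff.place.
func exampleEvictLimit(n int, limit *int) (evicted int, limited bool) {
	if n <= *limit {
		*limit -= n
		return n, false
	}
	evicted = *limit
	*limit = 0
	return evicted, true
}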
// tgConstrainTuple is used to store the total constraints of a task group.
type tgConstrainTuple struct {
	// Holds the combined constraints of the task group and all its sub-tasks.
	constraints []*structs.Constraint

	// The set of required drivers within the task group.
	drivers map[string]struct{}

	// The combined resources of all tasks within the task group.
	size *structs.Resources
}

// taskGroupConstraints collects the constraints, drivers and resources required by each
// sub-task to aggregate the TaskGroup totals
func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple {
	c := tgConstrainTuple{
		constraints: make([]*structs.Constraint, 0, len(tg.Constraints)),
		drivers:     make(map[string]struct{}),
		size:        new(structs.Resources),
	}

	c.constraints = append(c.constraints, tg.Constraints...)
	for _, task := range tg.Tasks {
		c.drivers[task.Driver] = struct{}{}
		c.constraints = append(c.constraints, task.Constraints...)
		c.size.Add(task.Resources)
	}

	return c
}

// initTaskState returns the initial task-state map for a task group, setting
// every task in the group to the given state.
func initTaskState(tg *structs.TaskGroup, state string) map[string]*structs.TaskState {
	states := make(map[string]*structs.TaskState, len(tg.Tasks))
	for _, task := range tg.Tasks {
		states[task.Name] = &structs.TaskState{State: state}
	}
	return states
}
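// exampleTaskGroupConstraints is an illustrative sketch, not part of the
// original file, of what taskGroupConstraints aggregates for a hypothetical
// two-task group: the union of the group- and task-level constraints, the set
// of required drivers ("docker" and "exec" here), and the summed resources of
// every task. The task names and resource figures are made up.
func exampleTaskGroupConstraints() tgConstrainTuple {
	tg := &structs.TaskGroup{
		Name: "example",
		Tasks: []*structs.Task{
			{Name: "app", Driver: "docker", Resources: &structs.Resources{CPU: 500, MemoryMB: 256}},
			{Name: "sidecar", Driver: "exec", Resources: &structs.Resources{CPU: 100, MemoryMB: 64}},
		},
	}
	return taskGroupConstraints(tg)
}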