github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/scheduler/util.go (about) 1 package scheduler 2 3 import ( 4 "fmt" 5 "log" 6 "math/rand" 7 "reflect" 8 9 "github.com/hashicorp/nomad/nomad/structs" 10 ) 11 12 // allocTuple is a tuple of the allocation name and potential alloc ID 13 type allocTuple struct { 14 Name string 15 TaskGroup *structs.TaskGroup 16 Alloc *structs.Allocation 17 } 18 19 // materializeTaskGroups is used to materialize all the task groups 20 // a job requires. This is used to do the count expansion. 21 func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup { 22 out := make(map[string]*structs.TaskGroup) 23 if job == nil { 24 return out 25 } 26 27 for _, tg := range job.TaskGroups { 28 for i := 0; i < tg.Count; i++ { 29 name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i) 30 out[name] = tg 31 } 32 } 33 return out 34 } 35 36 // diffResult is used to return the sets that result from the diff 37 type diffResult struct { 38 place, update, migrate, stop, ignore []allocTuple 39 } 40 41 func (d *diffResult) GoString() string { 42 return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d)", 43 len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore)) 44 } 45 46 func (d *diffResult) Append(other *diffResult) { 47 d.place = append(d.place, other.place...) 48 d.update = append(d.update, other.update...) 49 d.migrate = append(d.migrate, other.migrate...) 50 d.stop = append(d.stop, other.stop...) 51 d.ignore = append(d.ignore, other.ignore...) 52 } 53 54 // diffAllocs is used to do a set difference between the target allocations 55 // and the existing allocations. This returns 5 sets of results, the list of 56 // named task groups that need to be placed (no existing allocation), the 57 // allocations that need to be updated (job definition is newer), allocs that 58 // need to be migrated (node is draining), the allocs that need to be evicted 59 // (no longer required), and those that should be ignored. 60 func diffAllocs(job *structs.Job, taintedNodes map[string]bool, 61 required map[string]*structs.TaskGroup, allocs []*structs.Allocation) *diffResult { 62 result := &diffResult{} 63 64 // Scan the existing updates 65 existing := make(map[string]struct{}) 66 for _, exist := range allocs { 67 // Index the existing node 68 name := exist.Name 69 existing[name] = struct{}{} 70 71 // Check for the definition in the required set 72 tg, ok := required[name] 73 74 // If not required, we stop the alloc 75 if !ok { 76 result.stop = append(result.stop, allocTuple{ 77 Name: name, 78 TaskGroup: tg, 79 Alloc: exist, 80 }) 81 continue 82 } 83 84 // If we are on a tainted node, we must migrate 85 if taintedNodes[exist.NodeID] { 86 result.migrate = append(result.migrate, allocTuple{ 87 Name: name, 88 TaskGroup: tg, 89 Alloc: exist, 90 }) 91 continue 92 } 93 94 // If the definition is updated we need to update 95 // XXX: This is an extremely conservative approach. We can check 96 // if the job definition has changed in a way that affects 97 // this allocation and potentially ignore it. 98 if job.ModifyIndex != exist.Job.ModifyIndex { 99 result.update = append(result.update, allocTuple{ 100 Name: name, 101 TaskGroup: tg, 102 Alloc: exist, 103 }) 104 continue 105 } 106 107 // Everything is up-to-date 108 result.ignore = append(result.ignore, allocTuple{ 109 Name: name, 110 TaskGroup: tg, 111 Alloc: exist, 112 }) 113 } 114 115 // Scan the required groups 116 for name, tg := range required { 117 // Check for an existing allocation 118 _, ok := existing[name] 119 120 // Require a placement if no existing allocation. If there 121 // is an existing allocation, we would have checked for a potential 122 // update or ignore above. 123 if !ok { 124 result.place = append(result.place, allocTuple{ 125 Name: name, 126 TaskGroup: tg, 127 }) 128 } 129 } 130 return result 131 } 132 133 // diffSystemAllocs is like diffAllocs however, the allocations in the 134 // diffResult contain the specific nodeID they should be allocated on. 135 func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]bool, 136 allocs []*structs.Allocation) *diffResult { 137 138 // Build a mapping of nodes to all their allocs. 139 nodeAllocs := make(map[string][]*structs.Allocation, len(allocs)) 140 for _, alloc := range allocs { 141 nallocs := append(nodeAllocs[alloc.NodeID], alloc) 142 nodeAllocs[alloc.NodeID] = nallocs 143 } 144 145 for _, node := range nodes { 146 if _, ok := nodeAllocs[node.ID]; !ok { 147 nodeAllocs[node.ID] = nil 148 } 149 } 150 151 // Create the required task groups. 152 required := materializeTaskGroups(job) 153 154 result := &diffResult{} 155 for nodeID, allocs := range nodeAllocs { 156 diff := diffAllocs(job, taintedNodes, required, allocs) 157 158 // Mark the alloc as being for a specific node. 159 for i := range diff.place { 160 alloc := &diff.place[i] 161 alloc.Alloc = &structs.Allocation{NodeID: nodeID} 162 } 163 164 // Migrate does not apply to system jobs and instead should be marked as 165 // stop because if a node is tainted, the job is invalid on that node. 166 diff.stop = append(diff.stop, diff.migrate...) 167 diff.migrate = nil 168 169 result.Append(diff) 170 } 171 172 return result 173 } 174 175 // readyNodesInDCs returns all the ready nodes in the given datacenters 176 func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, error) { 177 // Index the DCs 178 dcMap := make(map[string]struct{}, len(dcs)) 179 for _, dc := range dcs { 180 dcMap[dc] = struct{}{} 181 } 182 183 // Scan the nodes 184 var out []*structs.Node 185 iter, err := state.Nodes() 186 if err != nil { 187 return nil, err 188 } 189 for { 190 raw := iter.Next() 191 if raw == nil { 192 break 193 } 194 195 // Filter on datacenter and status 196 node := raw.(*structs.Node) 197 if node.Status != structs.NodeStatusReady { 198 continue 199 } 200 if node.Drain { 201 continue 202 } 203 if _, ok := dcMap[node.Datacenter]; !ok { 204 continue 205 } 206 out = append(out, node) 207 } 208 return out, nil 209 } 210 211 // retryMax is used to retry a callback until it returns success or 212 // a maximum number of attempts is reached 213 func retryMax(max int, cb func() (bool, error)) error { 214 attempts := 0 215 for attempts < max { 216 done, err := cb() 217 if err != nil { 218 return err 219 } 220 if done { 221 return nil 222 } 223 attempts += 1 224 } 225 return &SetStatusError{ 226 Err: fmt.Errorf("maximum attempts reached (%d)", max), 227 EvalStatus: structs.EvalStatusFailed, 228 } 229 } 230 231 // taintedNodes is used to scan the allocations and then check if the 232 // underlying nodes are tainted, and should force a migration of the allocation. 233 func taintedNodes(state State, allocs []*structs.Allocation) (map[string]bool, error) { 234 out := make(map[string]bool) 235 for _, alloc := range allocs { 236 if _, ok := out[alloc.NodeID]; ok { 237 continue 238 } 239 240 node, err := state.NodeByID(alloc.NodeID) 241 if err != nil { 242 return nil, err 243 } 244 245 // If the node does not exist, we should migrate 246 if node == nil { 247 out[alloc.NodeID] = true 248 continue 249 } 250 251 out[alloc.NodeID] = structs.ShouldDrainNode(node.Status) || node.Drain 252 } 253 return out, nil 254 } 255 256 // shuffleNodes randomizes the slice order with the Fisher-Yates algorithm 257 func shuffleNodes(nodes []*structs.Node) { 258 n := len(nodes) 259 for i := n - 1; i > 0; i-- { 260 j := rand.Intn(i + 1) 261 nodes[i], nodes[j] = nodes[j], nodes[i] 262 } 263 } 264 265 // tasksUpdated does a diff between task groups to see if the 266 // tasks, their drivers or config have updated. 267 func tasksUpdated(a, b *structs.TaskGroup) bool { 268 // If the number of tasks do not match, clearly there is an update 269 if len(a.Tasks) != len(b.Tasks) { 270 return true 271 } 272 273 // Check each task 274 for _, at := range a.Tasks { 275 bt := b.LookupTask(at.Name) 276 if bt == nil { 277 return true 278 } 279 if at.Driver != bt.Driver { 280 return true 281 } 282 if !reflect.DeepEqual(at.Config, bt.Config) { 283 return true 284 } 285 286 // Inspect the network to see if the dynamic ports are different 287 if len(at.Resources.Networks) != len(bt.Resources.Networks) { 288 return true 289 } 290 for idx := range at.Resources.Networks { 291 an := at.Resources.Networks[idx] 292 bn := bt.Resources.Networks[idx] 293 if len(an.DynamicPorts) != len(bn.DynamicPorts) { 294 return true 295 } 296 } 297 } 298 return false 299 } 300 301 // setStatus is used to update the status of the evaluation 302 func setStatus(logger *log.Logger, planner Planner, eval, nextEval *structs.Evaluation, status, desc string) error { 303 logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status) 304 newEval := eval.Copy() 305 newEval.Status = status 306 newEval.StatusDescription = desc 307 if nextEval != nil { 308 newEval.NextEval = nextEval.ID 309 } 310 return planner.UpdateEval(newEval) 311 } 312 313 // inplaceUpdate attempts to update allocations in-place where possible. 314 func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job, 315 stack Stack, updates []allocTuple) []allocTuple { 316 317 n := len(updates) 318 inplace := 0 319 for i := 0; i < n; i++ { 320 // Get the update 321 update := updates[i] 322 323 // Check if the task drivers or config has changed, requires 324 // a rolling upgrade since that cannot be done in-place. 325 existing := update.Alloc.Job.LookupTaskGroup(update.TaskGroup.Name) 326 if tasksUpdated(update.TaskGroup, existing) { 327 continue 328 } 329 330 // Get the existing node 331 node, err := ctx.State().NodeByID(update.Alloc.NodeID) 332 if err != nil { 333 ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v", 334 eval, update.Alloc.NodeID, err) 335 continue 336 } 337 if node == nil { 338 continue 339 } 340 341 // Set the existing node as the base set 342 stack.SetNodes([]*structs.Node{node}) 343 344 // Stage an eviction of the current allocation. This is done so that 345 // the current allocation is discounted when checking for feasability. 346 // Otherwise we would be trying to fit the tasks current resources and 347 // updated resources. After select is called we can remove the evict. 348 ctx.Plan().AppendUpdate(update.Alloc, structs.AllocDesiredStatusStop, 349 allocInPlace) 350 351 // Attempt to match the task group 352 option, size := stack.Select(update.TaskGroup) 353 354 // Pop the allocation 355 ctx.Plan().PopUpdate(update.Alloc) 356 357 // Skip if we could not do an in-place update 358 if option == nil { 359 continue 360 } 361 362 // Restore the network offers from the existing allocation. 363 // We do not allow network resources (reserved/dynamic ports) 364 // to be updated. This is guarded in taskUpdated, so we can 365 // safely restore those here. 366 for task, resources := range option.TaskResources { 367 existing := update.Alloc.TaskResources[task] 368 resources.Networks = existing.Networks 369 } 370 371 // Create a shallow copy 372 newAlloc := new(structs.Allocation) 373 *newAlloc = *update.Alloc 374 375 // Update the allocation 376 newAlloc.EvalID = eval.ID 377 newAlloc.Job = job 378 newAlloc.Resources = size 379 newAlloc.TaskResources = option.TaskResources 380 newAlloc.Metrics = ctx.Metrics() 381 newAlloc.DesiredStatus = structs.AllocDesiredStatusRun 382 newAlloc.ClientStatus = structs.AllocClientStatusPending 383 ctx.Plan().AppendAlloc(newAlloc) 384 385 // Remove this allocation from the slice 386 updates[i] = updates[n-1] 387 i-- 388 n-- 389 inplace++ 390 } 391 if len(updates) > 0 { 392 ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplace, len(updates)) 393 } 394 return updates[:n] 395 } 396 397 // evictAndPlace is used to mark allocations for evicts and add them to the 398 // placement queue. evictAndPlace modifies both the the diffResult and the 399 // limit. It returns true if the limit has been reached. 400 func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool { 401 n := len(allocs) 402 for i := 0; i < n && i < *limit; i++ { 403 a := allocs[i] 404 ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc) 405 diff.place = append(diff.place, a) 406 } 407 if n <= *limit { 408 *limit -= n 409 return false 410 } 411 *limit = 0 412 return true 413 } 414 415 // tgConstrainTuple is used to store the total constraints of a task group. 416 type tgConstrainTuple struct { 417 // Holds the combined constraints of the task group and all it's sub-tasks. 418 constraints []*structs.Constraint 419 420 // The set of required drivers within the task group. 421 drivers map[string]struct{} 422 423 // The combined resources of all tasks within the task group. 424 size *structs.Resources 425 } 426 427 // taskGroupConstraints collects the constraints, drivers and resources required by each 428 // sub-task to aggregate the TaskGroup totals 429 func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple { 430 c := tgConstrainTuple{ 431 constraints: make([]*structs.Constraint, 0, len(tg.Constraints)), 432 drivers: make(map[string]struct{}), 433 size: new(structs.Resources), 434 } 435 436 c.constraints = append(c.constraints, tg.Constraints...) 437 for _, task := range tg.Tasks { 438 c.drivers[task.Driver] = struct{}{} 439 c.constraints = append(c.constraints, task.Constraints...) 440 c.size.Add(task.Resources) 441 } 442 443 return c 444 }