github.com/rohankumardubey/nomad@v0.11.8/scheduler/preemption.go

package scheduler

import (
    "math"
    "sort"

    "github.com/hashicorp/nomad/nomad/structs"
)

// maxParallelPenalty is a score penalty applied to allocations to mitigate against
// too many allocations of the same job being preempted. This penalty is applied after
// the number of allocations being preempted exceeds the max_parallel value in the
// job's migrate stanza.
const maxParallelPenalty = 50.0

// groupedAllocs is a set of allocations that share the same job priority
type groupedAllocs struct {
    priority int
    allocs   []*structs.Allocation
}

// allocInfo holds precomputed details used when scoring an allocation for preemption
type allocInfo struct {
    maxParallel int
    resources   *structs.ComparableResources
}

// PreemptionResource interface is implemented by different
// types of resources.
type PreemptionResource interface {
    // MeetsRequirements returns true if the available resources match needed resources
    MeetsRequirements() bool

    // Distance returns values in the range [0, MaxFloat], lower is better
    Distance() float64
}

// NetworkPreemptionResource implements PreemptionResource for network assignments.
// It only looks at the MBits needed.
type NetworkPreemptionResource struct {
    availableResources *structs.NetworkResource
    resourceNeeded     *structs.NetworkResource
}

func (n *NetworkPreemptionResource) MeetsRequirements() bool {
    mbitsAvailable := n.availableResources.MBits
    mbitsNeeded := n.resourceNeeded.MBits
    if mbitsAvailable == 0 || mbitsNeeded == 0 {
        return false
    }
    return mbitsAvailable >= mbitsNeeded
}

func (n *NetworkPreemptionResource) Distance() float64 {
    return networkResourceDistance(n.availableResources, n.resourceNeeded)
}

// BasePreemptionResource implements PreemptionResource for CPU/Memory/Disk
type BasePreemptionResource struct {
    availableResources *structs.ComparableResources
    resourceNeeded     *structs.ComparableResources
}

func (b *BasePreemptionResource) MeetsRequirements() bool {
    super, _ := b.availableResources.Superset(b.resourceNeeded)
    return super
}

func (b *BasePreemptionResource) Distance() float64 {
    return basicResourceDistance(b.resourceNeeded, b.availableResources)
}

// PreemptionResourceFactory returns a new PreemptionResource
type PreemptionResourceFactory func(availableResources *structs.ComparableResources, resourceAsk *structs.ComparableResources) PreemptionResource

// GetNetworkPreemptionResourceFactory returns a preemption resource factory for network assignments
func GetNetworkPreemptionResourceFactory() PreemptionResourceFactory {
    return func(availableResources *structs.ComparableResources, resourceNeeded *structs.ComparableResources) PreemptionResource {
        available := availableResources.Flattened.Networks[0]
        return &NetworkPreemptionResource{
            availableResources: available,
            resourceNeeded:     resourceNeeded.Flattened.Networks[0],
        }
    }
}

// GetBasePreemptionResourceFactory returns a preemption resource factory for CPU/Memory/Disk
func GetBasePreemptionResourceFactory() PreemptionResourceFactory {
    return func(availableResources *structs.ComparableResources, resourceNeeded *structs.ComparableResources) PreemptionResource {
        return &BasePreemptionResource{
            availableResources: availableResources,
            resourceNeeded:     resourceNeeded,
        }
    }
}
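// exampleBaseResourceCheck is an illustrative sketch (not part of the original
// scheduler flow) of how the base factory is used: construct a PreemptionResource
// from hypothetical available/needed resources, then ask whether the requirement
// is met and how far apart the two are. All numbers are made up; with these
// values the check returns true and the distance is sqrt(3) ~= 1.73, showing
// that a candidate much larger than the ask sits "farther" from it.
func exampleBaseResourceCheck() (bool, float64) {
    available := &structs.ComparableResources{
        Flattened: structs.AllocatedTaskResources{
            Cpu:    structs.AllocatedCpuResources{CpuShares: 1000},
            Memory: structs.AllocatedMemoryResources{MemoryMB: 2048},
        },
        Shared: structs.AllocatedSharedResources{DiskMB: 500},
    }
    needed := &structs.ComparableResources{
        Flattened: structs.AllocatedTaskResources{
            Cpu:    structs.AllocatedCpuResources{CpuShares: 500},
            Memory: structs.AllocatedMemoryResources{MemoryMB: 1024},
        },
        Shared: structs.AllocatedSharedResources{DiskMB: 250},
    }
    pr := GetBasePreemptionResourceFactory()(available, needed)
    return pr.MeetsRequirements(), pr.Distance()
}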
// Preemptor is used to track existing allocations
// and find suitable allocations to preempt
type Preemptor struct {

    // currentPreemptions is a map computed when SetPreemptions is called.
    // It tracks the number of preempted allocations per job/taskgroup.
    currentPreemptions map[structs.NamespacedID]map[string]int

    // allocDetails is a map computed when SetCandidates is called.
    // It stores some precomputed details about each candidate allocation
    // needed when scoring it for preemption.
    allocDetails map[string]*allocInfo

    // jobPriority is the priority of the job being placed
    jobPriority int

    // jobID is the ID of the job being placed
    jobID *structs.NamespacedID

    // nodeRemainingResources tracks available resources on the node after
    // accounting for running allocations
    nodeRemainingResources *structs.ComparableResources

    // currentAllocs is the candidate set used to find preemptible allocations
    currentAllocs []*structs.Allocation

    // ctx is the context from the scheduler stack
    ctx Context
}

func NewPreemptor(jobPriority int, ctx Context, jobID *structs.NamespacedID) *Preemptor {
    return &Preemptor{
        currentPreemptions: make(map[structs.NamespacedID]map[string]int),
        jobPriority:        jobPriority,
        jobID:              jobID,
        allocDetails:       make(map[string]*allocInfo),
        ctx:                ctx,
    }
}

// SetNode sets the node and computes its remaining resources after
// subtracting the node's reserved resources
func (p *Preemptor) SetNode(node *structs.Node) {
    nodeRemainingResources := node.ComparableResources()

    // Subtract the reserved resources of the node
    if c := node.ComparableReservedResources(); c != nil {
        nodeRemainingResources.Subtract(c)
    }
    p.nodeRemainingResources = nodeRemainingResources
}

// SetCandidates initializes the candidate set from which preemptions are chosen
func (p *Preemptor) SetCandidates(allocs []*structs.Allocation) {
    // Reset the candidate set
    p.currentAllocs = []*structs.Allocation{}
    for _, alloc := range allocs {
        // Ignore any allocations of the job being placed.
        // This filters out any previous allocs of the job, and any new allocs in the plan.
        if alloc.JobID == p.jobID.ID && alloc.Namespace == p.jobID.Namespace {
            continue
        }

        maxParallel := 0
        tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
        if tg != nil && tg.Migrate != nil {
            maxParallel = tg.Migrate.MaxParallel
        }
        p.allocDetails[alloc.ID] = &allocInfo{maxParallel: maxParallel, resources: alloc.ComparableResources()}
        p.currentAllocs = append(p.currentAllocs, alloc)
    }
}

// SetPreemptions initializes a map tracking existing counts of preempted allocations
// per job/task group. This is used while scoring preemption options.
func (p *Preemptor) SetPreemptions(allocs []*structs.Allocation) {

    // Clear out existing values since this can be called more than once
    p.currentPreemptions = make(map[structs.NamespacedID]map[string]int)

    // Initialize counts
    for _, alloc := range allocs {
        id := structs.NewNamespacedID(alloc.JobID, alloc.Namespace)
        countMap, ok := p.currentPreemptions[id]
        if !ok {
            countMap = make(map[string]int)
            p.currentPreemptions[id] = countMap
        }
        countMap[alloc.TaskGroup]++
    }
}
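// examplePreemptorSetup is an illustrative sketch of the expected calling
// sequence for a Preemptor during scheduling. The job ID, priority, node,
// candidate allocs, and planned preemptions here are hypothetical
// placeholders; in practice they come from the scheduler stack and the
// current plan.
func examplePreemptorSetup(ctx Context, node *structs.Node, running, plannedPreemptions []*structs.Allocation) *Preemptor {
    jobID := structs.NewNamespacedID("example-job", "default") // hypothetical job
    p := NewPreemptor(70, ctx, &jobID)

    p.SetNode(node)                      // establish remaining node capacity
    p.SetCandidates(running)             // allocs that may be preempted
    p.SetPreemptions(plannedPreemptions) // preemptions already in the plan
    return p
}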
// getNumPreemptions counts the number of other allocations being preempted that
// match the job and task group of the alloc under consideration. This is used as
// a scoring factor to avoid preempting too many allocs of the same job at once.
func (p *Preemptor) getNumPreemptions(alloc *structs.Allocation) int {
    c, ok := p.currentPreemptions[structs.NewNamespacedID(alloc.JobID, alloc.Namespace)][alloc.TaskGroup]
    if !ok {
        return 0
    }
    return c
}

// PreemptForTaskGroup computes a list of allocations to preempt to accommodate
// the resources asked for. Only allocations from jobs whose priority is at least
// 10 below jobPriority are considered. This method is meant only for finding
// preemptible allocations based on CPU/Memory/Disk.
func (p *Preemptor) PreemptForTaskGroup(resourceAsk *structs.AllocatedResources) []*structs.Allocation {
    resourcesNeeded := resourceAsk.Comparable()

    // Subtract current allocations
    for _, alloc := range p.currentAllocs {
        allocResources := p.allocDetails[alloc.ID].resources
        p.nodeRemainingResources.Subtract(allocResources)
    }

    // Group candidates by priority, filter out ineligible allocs
    allocsByPriority := filterAndGroupPreemptibleAllocs(p.jobPriority, p.currentAllocs)

    var bestAllocs []*structs.Allocation
    allRequirementsMet := false

    // Initialize a variable to track resources as they become available from preemption
    availableResources := p.nodeRemainingResources.Copy()

    resourcesAsked := resourceAsk.Comparable()
    // Iterate over allocations grouped by priority to find preemptible allocations
    for _, allocGrp := range allocsByPriority {
        for len(allocGrp.allocs) > 0 && !allRequirementsMet {
            closestAllocIndex := -1
            bestDistance := math.MaxFloat64
            // Find the alloc with the closest distance
            for index, alloc := range allocGrp.allocs {
                currentPreemptionCount := p.getNumPreemptions(alloc)
                allocDetails := p.allocDetails[alloc.ID]
                maxParallel := allocDetails.maxParallel
                distance := scoreForTaskGroup(resourcesNeeded, allocDetails.resources, maxParallel, currentPreemptionCount)
                if distance < bestDistance {
                    bestDistance = distance
                    closestAllocIndex = index
                }
            }
            closestAlloc := allocGrp.allocs[closestAllocIndex]
            closestResources := p.allocDetails[closestAlloc.ID].resources
            availableResources.Add(closestResources)

            // This step needs the original resources asked for as the second arg;
            // we can't use the running total
            allRequirementsMet, _ = availableResources.Superset(resourcesAsked)

            bestAllocs = append(bestAllocs, closestAlloc)

            // Remove the chosen alloc from the candidate group
            allocGrp.allocs[closestAllocIndex] = allocGrp.allocs[len(allocGrp.allocs)-1]
            allocGrp.allocs = allocGrp.allocs[:len(allocGrp.allocs)-1]

            // This is the remaining total of resources needed
            resourcesNeeded.Subtract(closestResources)
        }
        if allRequirementsMet {
            break
        }
    }

    // Early return if all allocs were examined and requirements were not met
    if !allRequirementsMet {
        return nil
    }

    // We do another pass to eliminate unnecessary preemptions.
    // This filters out allocs whose resources are already covered by another alloc.
    basePreemptionResource := GetBasePreemptionResourceFactory()
    resourcesNeeded = resourceAsk.Comparable()
    filteredBestAllocs := p.filterSuperset(bestAllocs, p.nodeRemainingResources, resourcesNeeded, basePreemptionResource)
    return filteredBestAllocs
}
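// examplePreemptForTaskGroup is an illustrative sketch of invoking
// PreemptForTaskGroup once a node is otherwise infeasible. The task name
// and resource figures are hypothetical; a nil result means preemption
// cannot make the node feasible for this ask.
func examplePreemptForTaskGroup(p *Preemptor) []*structs.Allocation {
    ask := &structs.AllocatedResources{
        Tasks: map[string]*structs.AllocatedTaskResources{
            "web": { // hypothetical task
                Cpu:    structs.AllocatedCpuResources{CpuShares: 500},
                Memory: structs.AllocatedMemoryResources{MemoryMB: 256},
            },
        },
        Shared: structs.AllocatedSharedResources{DiskMB: 300},
    }
    return p.PreemptForTaskGroup(ask)
}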
// PreemptForNetwork tries to find allocations to preempt to meet network resources.
// This is called once per task when assigning a network to the task. While finding
// allocations to preempt, it only considers allocations that share the same network device.
func (p *Preemptor) PreemptForNetwork(networkResourceAsk *structs.NetworkResource, netIdx *structs.NetworkIndex) []*structs.Allocation {

    // Early return if there are no current allocs
    if len(p.currentAllocs) == 0 {
        return nil
    }

    deviceToAllocs := make(map[string][]*structs.Allocation)
    mbitsNeeded := networkResourceAsk.MBits
    reservedPortsNeeded := networkResourceAsk.ReservedPorts

    // Build a map of the needed reserved ports for fast access
    reservedPorts := make(map[int]struct{})
    for _, port := range reservedPortsNeeded {
        reservedPorts[port.Value] = struct{}{}
    }

    // filteredReservedPorts tracks reserved ports that are
    // currently used by higher priority allocations that can't
    // be preempted
    filteredReservedPorts := make(map[string]map[int]struct{})

    // Create a map from each device to allocs.
    // We can only preempt within allocations that
    // are using the same device.
    for _, alloc := range p.currentAllocs {
        if alloc.Job == nil {
            continue
        }

        allocResources := p.allocDetails[alloc.ID].resources
        networks := allocResources.Flattened.Networks
        if len(networks) == 0 {
            continue
        }

        // Only the first network is considered (TODO: verify whether this is
        // safe for allocs with multiple networks)
        net := networks[0]

        // Filter out allocs that are ineligible due to priority
        if p.jobPriority-alloc.Job.Priority < 10 {
            // Track any reserved ports used by
            // this allocation that cannot be preempted
            for _, port := range net.ReservedPorts {
                portMap, ok := filteredReservedPorts[net.Device]
                if !ok {
                    portMap = make(map[int]struct{})
                    filteredReservedPorts[net.Device] = portMap
                }
                portMap[port.Value] = struct{}{}
            }
            continue
        }

        // Only include the alloc if it has a network device
        device := networks[0].Device
        allocsForDevice := deviceToAllocs[device]
        allocsForDevice = append(allocsForDevice, alloc)
        deviceToAllocs[device] = allocsForDevice
    }

    // If no existing allocations use network resources, return early
    if len(deviceToAllocs) == 0 {
        return nil
    }

    var allocsToPreempt []*structs.Allocation
    met := false
    freeBandwidth := 0
    preemptedDevice := ""

OUTER:
    for device, currentAllocs := range deviceToAllocs {
        preemptedDevice = device
        totalBandwidth := netIdx.AvailBandwidth[device]

        // If the device doesn't have enough total available bandwidth, skip it
        if totalBandwidth < mbitsNeeded {
            continue
        }

        // Track how much existing free bandwidth we have before preemption
        freeBandwidth = totalBandwidth - netIdx.UsedBandwidth[device]

        preemptedBandwidth := 0

        // Reset allocsToPreempt since we don't want to preempt across devices for the same task
        allocsToPreempt = nil

        // usedPortToAlloc tracks ports used by allocs on this device
        usedPortToAlloc := make(map[int]*structs.Allocation)

        // First try to satisfy needed reserved ports
        if len(reservedPortsNeeded) > 0 {

            // Populate the used-port map
            for _, alloc := range currentAllocs {
                allocResources := p.allocDetails[alloc.ID].resources
                for _, n := range allocResources.Flattened.Networks {
                    for _, port := range n.ReservedPorts {
                        usedPortToAlloc[port.Value] = alloc
                    }
                }
            }

            // Look for allocs that are using the needed reserved ports
            for _, port := range reservedPortsNeeded {
                alloc, ok := usedPortToAlloc[port.Value]
                if ok {
                    allocResources := p.allocDetails[alloc.ID].resources
                    preemptedBandwidth += allocResources.Flattened.Networks[0].MBits
                    allocsToPreempt = append(allocsToPreempt, alloc)
                } else {
                    // Check if a higher priority allocation is using this port.
                    // It can't be preempted, so we skip to the next device.
                    _, ok := filteredReservedPorts[device][port.Value]
                    if ok {
                        continue OUTER
                    }
                }
            }

            // Remove allocs that were preempted to satisfy reserved ports
            currentAllocs = structs.RemoveAllocs(currentAllocs, allocsToPreempt)
        }

        // If bandwidth requirements have been met, stop
        if preemptedBandwidth+freeBandwidth >= mbitsNeeded {
            met = true
            break OUTER
        }

        // Split by priority
        allocsByPriority := filterAndGroupPreemptibleAllocs(p.jobPriority, currentAllocs)

        for _, allocsGrp := range allocsByPriority {
            allocs := allocsGrp.allocs

            // Sort by the distance function
            sort.Slice(allocs, func(i, j int) bool {
                return p.distanceComparatorForNetwork(allocs, networkResourceAsk, i, j)
            })

            // Iterate over allocs until the end or until requirements have been met
            for _, alloc := range allocs {
                allocResources := p.allocDetails[alloc.ID].resources
                preemptedBandwidth += allocResources.Flattened.Networks[0].MBits
                allocsToPreempt = append(allocsToPreempt, alloc)
                if preemptedBandwidth+freeBandwidth >= mbitsNeeded {
                    met = true
                    break OUTER
                }
            }
        }
    }

    // Early return if we could not meet resource needs after examining all allocs
    if !met {
        return nil
    }

    // Build a resource object with just the network MBits filled in
    nodeRemainingResources := &structs.ComparableResources{
        Flattened: structs.AllocatedTaskResources{
            Networks: []*structs.NetworkResource{
                {
                    Device: preemptedDevice,
                    MBits:  freeBandwidth,
                },
            },
        },
    }

    // Do a final pass to eliminate any superset allocations
    preemptionResourceFactory := GetNetworkPreemptionResourceFactory()
    resourcesNeeded := &structs.ComparableResources{
        Flattened: structs.AllocatedTaskResources{
            Networks: []*structs.NetworkResource{networkResourceAsk},
        },
    }
    filteredBestAllocs := p.filterSuperset(allocsToPreempt, nodeRemainingResources, resourcesNeeded, preemptionResourceFactory)
    return filteredBestAllocs
}
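// examplePreemptForNetwork is an illustrative sketch of a network preemption
// query: ask for hypothetical bandwidth plus a reserved port on whatever
// devices the node's network index tracks. The port label and numbers are
// made up for illustration.
func examplePreemptForNetwork(p *Preemptor, netIdx *structs.NetworkIndex) []*structs.Allocation {
    ask := &structs.NetworkResource{
        MBits:         200,
        ReservedPorts: []structs.Port{{Label: "https", Value: 443}}, // hypothetical port
    }
    return p.PreemptForNetwork(ask, netIdx)
}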
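// examplePreemptForDevice is an illustrative sketch of a device preemption
// query for a hypothetical GPU ask. The device name and count are made up;
// the deviceAllocator comes from the scheduler's device tracking for the
// node under consideration.
func examplePreemptForDevice(p *Preemptor, devAlloc *deviceAllocator) []*structs.Allocation {
    ask := &structs.RequestedDevice{
        Name:  "nvidia/gpu", // hypothetical device
        Count: 2,
    }
    return p.PreemptForDevice(ask, devAlloc)
}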
// deviceGroupAllocs represents a group of allocs that share a device
type deviceGroupAllocs struct {
    allocs []*structs.Allocation

    // deviceInstances tracks the number of device instances used per alloc
    deviceInstances map[string]int
}

func newAllocDeviceGroup() *deviceGroupAllocs {
    return &deviceGroupAllocs{
        deviceInstances: make(map[string]int),
    }
}

// PreemptForDevice tries to find allocations to preempt to meet the number of
// device instances needed. This is called once per device request when assigning
// devices to the task.
func (p *Preemptor) PreemptForDevice(ask *structs.RequestedDevice, devAlloc *deviceAllocator) []*structs.Allocation {

    // Group allocations by device, tracking the number of
    // instances used in each device by alloc id
    deviceToAllocs := make(map[structs.DeviceIdTuple]*deviceGroupAllocs)
    for _, alloc := range p.currentAllocs {
        for _, tr := range alloc.AllocatedResources.Tasks {
            // Ignore allocs that don't use devices
            if len(tr.Devices) == 0 {
                continue
            }

            // Go through each assigned device group
            for _, device := range tr.Devices {
                // Look up the device instance from the device allocator
                deviceIdTuple := *device.ID()
                devInst := devAlloc.Devices[deviceIdTuple]

                // devInst can be nil if the device is no longer healthy
                if devInst == nil {
                    continue
                }

                // Ignore if the device doesn't match the ask
                if !nodeDeviceMatches(p.ctx, devInst.Device, ask) {
                    continue
                }

                // Store both the alloc and the number of instances used
                // in our tracking map
                allocDeviceGrp := deviceToAllocs[deviceIdTuple]
                if allocDeviceGrp == nil {
                    allocDeviceGrp = newAllocDeviceGroup()
                    deviceToAllocs[deviceIdTuple] = allocDeviceGrp
                }
                allocDeviceGrp.allocs = append(allocDeviceGrp.allocs, alloc)
                allocDeviceGrp.deviceInstances[alloc.ID] += len(device.DeviceIDs)
            }
        }
    }

    neededCount := ask.Count

    var preemptionOptions []*deviceGroupAllocs
    // Examine matching allocs by device
OUTER:
    for deviceIDTuple, allocsGrp := range deviceToAllocs {
        // First group and sort allocations using this device by priority
        allocsByPriority := filterAndGroupPreemptibleAllocs(p.jobPriority, allocsGrp.allocs)

        // Reset the preempted count for this device
        preemptedCount := 0

        // Initialize a slice of preempted allocations
        var preemptedAllocs []*structs.Allocation

        for _, grpAllocs := range allocsByPriority {
            for _, alloc := range grpAllocs.allocs {
                // Look up the device instance from the device allocator
                devInst := devAlloc.Devices[deviceIDTuple]

                // Add to the preemption list because this device matches
                preemptedCount += allocsGrp.deviceInstances[alloc.ID]
                preemptedAllocs = append(preemptedAllocs, alloc)

                // Check if we have met the needed count
                if preemptedCount+devInst.FreeCount() >= int(neededCount) {
                    preemptionOptions = append(preemptionOptions, &deviceGroupAllocs{
                        allocs:          preemptedAllocs,
                        deviceInstances: allocsGrp.deviceInstances,
                    })
                    continue OUTER
                }
            }
        }
    }

    // Find the combination of allocs with the lowest net priority
    if len(preemptionOptions) > 0 {
        return selectBestAllocs(preemptionOptions, int(neededCount))
    }

    return nil
}
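// exampleNetPriority is an illustrative re-statement of the net-priority rule
// used by selectBestAllocs: each distinct job priority in a set is counted
// once. For hypothetical inputs of two allocs from a priority-10 job and one
// from a priority-20 job, the net priority is 10+20 = 30, not 10+10+20.
func exampleNetPriority(allocs []*structs.Allocation) int {
    seen := map[int]struct{}{}
    net := 0
    for _, alloc := range allocs {
        if _, ok := seen[alloc.Job.Priority]; !ok {
            seen[alloc.Job.Priority] = struct{}{}
            net += alloc.Job.Priority
        }
    }
    return net
}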
// selectBestAllocs finds the best allocations based on the minimal net priority amongst
// all options. The net priority is the sum of unique priorities in each option.
func selectBestAllocs(preemptionOptions []*deviceGroupAllocs, neededCount int) []*structs.Allocation {
    bestPriority := math.MaxInt32
    var bestAllocs []*structs.Allocation

    // We iterate over allocations in priority order, so it's possible
    // that we have more allocations than needed to meet the needed count.
    // e.g. if we need 4 instances, and we get 3 from a priority 10 alloc and
    // 4 from a priority 20 alloc, we should filter out the priority 10 alloc.
    // This loop does that filtering and chooses the set with the smallest net priority.
    for _, allocGrp := range preemptionOptions {
        // Find unique priorities and add them up to calculate the net priority
        priorities := map[int]struct{}{}
        netPriority := 0

        devInst := allocGrp.deviceInstances
        var filteredAllocs []*structs.Allocation

        // Sort by the number of device instances used, descending
        sort.Slice(allocGrp.allocs, func(i, j int) bool {
            instanceCount1 := devInst[allocGrp.allocs[i].ID]
            instanceCount2 := devInst[allocGrp.allocs[j].ID]
            return instanceCount1 > instanceCount2
        })

        // Filter and calculate the net priority
        preemptedInstanceCount := 0
        for _, alloc := range allocGrp.allocs {
            if preemptedInstanceCount >= neededCount {
                break
            }
            instanceCount := devInst[alloc.ID]
            preemptedInstanceCount += instanceCount
            filteredAllocs = append(filteredAllocs, alloc)
            _, ok := priorities[alloc.Job.Priority]
            if !ok {
                priorities[alloc.Job.Priority] = struct{}{}
                netPriority += alloc.Job.Priority
            }
        }
        if netPriority < bestPriority {
            bestPriority = netPriority
            bestAllocs = filteredAllocs
        }
    }
    return bestAllocs
}
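// exampleTaskGroupScore works the scoring formula with hypothetical numbers:
// an ask of 1000 MHz / 1024 MB against a candidate using 500 MHz / 512 MB
// (no disk ask) gives coordinates (0.5, 0.5, 0) and a distance of ~0.707.
// With max_parallel=2 and two allocs of the same group already preempted,
// a penalty of (2+1-2)*maxParallelPenalty = 50 is added, for ~50.707 total.
func exampleTaskGroupScore() float64 {
    ask := &structs.ComparableResources{
        Flattened: structs.AllocatedTaskResources{
            Cpu:    structs.AllocatedCpuResources{CpuShares: 1000},
            Memory: structs.AllocatedMemoryResources{MemoryMB: 1024},
        },
    }
    used := &structs.ComparableResources{
        Flattened: structs.AllocatedTaskResources{
            Cpu:    structs.AllocatedCpuResources{CpuShares: 500},
            Memory: structs.AllocatedMemoryResources{MemoryMB: 512},
        },
    }
    return scoreForTaskGroup(ask, used, 2, 2) // ~0.707 + 50 penalty
}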
// basicResourceDistance computes a distance using a coordinate system. It compares
// resource fields like CPU, memory, and disk. Values emitted are in the range [0, MaxFloat].
func basicResourceDistance(resourceAsk *structs.ComparableResources, resourceUsed *structs.ComparableResources) float64 {
    memoryCoord, cpuCoord, diskMBCoord := 0.0, 0.0, 0.0
    if resourceAsk.Flattened.Memory.MemoryMB > 0 {
        memoryCoord = (float64(resourceAsk.Flattened.Memory.MemoryMB) - float64(resourceUsed.Flattened.Memory.MemoryMB)) / float64(resourceAsk.Flattened.Memory.MemoryMB)
    }
    if resourceAsk.Flattened.Cpu.CpuShares > 0 {
        cpuCoord = (float64(resourceAsk.Flattened.Cpu.CpuShares) - float64(resourceUsed.Flattened.Cpu.CpuShares)) / float64(resourceAsk.Flattened.Cpu.CpuShares)
    }
    if resourceAsk.Shared.DiskMB > 0 {
        diskMBCoord = (float64(resourceAsk.Shared.DiskMB) - float64(resourceUsed.Shared.DiskMB)) / float64(resourceAsk.Shared.DiskMB)
    }
    originDist := math.Sqrt(
        math.Pow(memoryCoord, 2) +
            math.Pow(cpuCoord, 2) +
            math.Pow(diskMBCoord, 2))
    return originDist
}

// networkResourceDistance returns a distance based only on network megabits
func networkResourceDistance(resourceUsed *structs.NetworkResource, resourceNeeded *structs.NetworkResource) float64 {
    networkCoord := math.MaxFloat64
    if resourceUsed != nil && resourceNeeded != nil {
        networkCoord = float64(resourceNeeded.MBits-resourceUsed.MBits) / float64(resourceNeeded.MBits)
    }

    originDist := math.Abs(networkCoord)
    return originDist
}

// scoreForTaskGroup is used to calculate a score (lower is better) based on the distance
// between the resources needed and the candidate's resources. A penalty is added when the
// candidate's job already has allocations in the plan that are being preempted.
func scoreForTaskGroup(resourceAsk *structs.ComparableResources, resourceUsed *structs.ComparableResources, maxParallel int, numPreemptedAllocs int) float64 {
    maxParallelScorePenalty := 0.0
    if maxParallel > 0 && numPreemptedAllocs >= maxParallel {
        maxParallelScorePenalty = float64((numPreemptedAllocs+1)-maxParallel) * maxParallelPenalty
    }
    return basicResourceDistance(resourceAsk, resourceUsed) + maxParallelScorePenalty
}
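// exampleGroupByPriority is an illustrative sketch of the grouping rule in
// filterAndGroupPreemptibleAllocs, using hypothetical jobs: with a placing-job
// priority of 70, allocs at priorities 20 and 50 are kept and returned as two
// groups (priority 20 first), while the alloc at priority 65 is filtered out
// because 70-65 < 10.
func exampleGroupByPriority() []*groupedAllocs {
    mkAlloc := func(priority int) *structs.Allocation {
        return &structs.Allocation{Job: &structs.Job{Priority: priority}}
    }
    current := []*structs.Allocation{mkAlloc(50), mkAlloc(20), mkAlloc(65)}
    return filterAndGroupPreemptibleAllocs(70, current)
}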
// scoreForNetwork is similar to scoreForTaskGroup
// but only uses network MBits to calculate a preemption score
func scoreForNetwork(resourceUsed *structs.NetworkResource, resourceNeeded *structs.NetworkResource, maxParallel int, numPreemptedAllocs int) float64 {
    if resourceUsed == nil || resourceNeeded == nil {
        return math.MaxFloat64
    }
    maxParallelScorePenalty := 0.0
    if maxParallel > 0 && numPreemptedAllocs >= maxParallel {
        maxParallelScorePenalty = float64((numPreemptedAllocs+1)-maxParallel) * maxParallelPenalty
    }
    return networkResourceDistance(resourceUsed, resourceNeeded) + maxParallelScorePenalty
}

// filterAndGroupPreemptibleAllocs groups allocations by priority after filtering out
// allocs that are not preemptible given the jobPriority arg
func filterAndGroupPreemptibleAllocs(jobPriority int, current []*structs.Allocation) []*groupedAllocs {
    allocsByPriority := make(map[int][]*structs.Allocation)
    for _, alloc := range current {
        if alloc.Job == nil {
            continue
        }

        // Skip allocs whose priority is within a delta of 10.
        // This also skips any allocs of the job for which
        // we are attempting preemption.
        if jobPriority-alloc.Job.Priority < 10 {
            continue
        }
        grpAllocs, ok := allocsByPriority[alloc.Job.Priority]
        if !ok {
            grpAllocs = make([]*structs.Allocation, 0)
        }
        grpAllocs = append(grpAllocs, alloc)
        allocsByPriority[alloc.Job.Priority] = grpAllocs
    }

    var groupedSortedAllocs []*groupedAllocs
    for priority, allocs := range allocsByPriority {
        groupedSortedAllocs = append(groupedSortedAllocs, &groupedAllocs{
            priority: priority,
            allocs:   allocs})
    }

    // Sort by priority
    sort.Slice(groupedSortedAllocs, func(i, j int) bool {
        return groupedSortedAllocs[i].priority < groupedSortedAllocs[j].priority
    })

    return groupedSortedAllocs
}

// filterSuperset is used as a final step to remove allocations that are
// unnecessary to preempt because the requirements are already covered by
// other allocations in the set
func (p *Preemptor) filterSuperset(bestAllocs []*structs.Allocation,
    nodeRemainingResources *structs.ComparableResources,
    resourceAsk *structs.ComparableResources,
    preemptionResourceFactory PreemptionResourceFactory) []*structs.Allocation {

    // Sort bestAllocs by distance descending (without penalty)
    sort.Slice(bestAllocs, func(i, j int) bool {
        a1Resources := p.allocDetails[bestAllocs[i].ID].resources
        a2Resources := p.allocDetails[bestAllocs[j].ID].resources
        distance1 := preemptionResourceFactory(a1Resources, resourceAsk).Distance()
        distance2 := preemptionResourceFactory(a2Resources, resourceAsk).Distance()
        return distance1 > distance2
    })

    availableResources := nodeRemainingResources.Copy()
    var filteredBestAllocs []*structs.Allocation

    // Keep adding allocations until the requirements are met; anything
    // beyond that point is redundant and gets dropped
    for _, alloc := range bestAllocs {
        filteredBestAllocs = append(filteredBestAllocs, alloc)
        allocResources := p.allocDetails[alloc.ID].resources
        availableResources.Add(allocResources)

        preemptionResource := preemptionResourceFactory(availableResources, resourceAsk)
        requirementsMet := preemptionResource.MeetsRequirements()
        if requirementsMet {
            break
        }
    }
    return filteredBestAllocs
}

// distanceComparatorForNetwork is used as the sorting function when finding allocations
// to preempt. It uses both a coordinate distance function based on the MBits needed, and
// a penalty if the allocation under consideration belongs to a job that already has more
// preempted allocations.
func (p *Preemptor) distanceComparatorForNetwork(allocs []*structs.Allocation, networkResourceAsk *structs.NetworkResource, i int, j int) bool {
    firstAlloc := allocs[i]
    currentPreemptionCount1 := p.getNumPreemptions(firstAlloc)

    // Look up the configured maxParallel value for these allocations' task groups
    var maxParallel1, maxParallel2 int
    tg1 := firstAlloc.Job.LookupTaskGroup(firstAlloc.TaskGroup)
    if tg1 != nil && tg1.Migrate != nil {
        maxParallel1 = tg1.Migrate.MaxParallel
    }

    // Dereference the network usage on the first alloc if it's there
    firstAllocResources := p.allocDetails[firstAlloc.ID].resources
    firstAllocNetworks := firstAllocResources.Flattened.Networks
    var firstAllocNetResourceUsed *structs.NetworkResource
    if len(firstAllocNetworks) > 0 {
        firstAllocNetResourceUsed = firstAllocNetworks[0]
    }

    distance1 := scoreForNetwork(firstAllocNetResourceUsed, networkResourceAsk, maxParallel1, currentPreemptionCount1)

    secondAlloc := allocs[j]
    currentPreemptionCount2 := p.getNumPreemptions(secondAlloc)
    tg2 := secondAlloc.Job.LookupTaskGroup(secondAlloc.TaskGroup)
    if tg2 != nil && tg2.Migrate != nil {
        maxParallel2 = tg2.Migrate.MaxParallel
    }

    // Dereference the network usage on the second alloc if it's there
    secondAllocResources := p.allocDetails[secondAlloc.ID].resources
    secondAllocNetworks := secondAllocResources.Flattened.Networks
    var secondAllocNetResourceUsed *structs.NetworkResource
    if len(secondAllocNetworks) > 0 {
        secondAllocNetResourceUsed = secondAllocNetworks[0]
    }

    distance2 := scoreForNetwork(secondAllocNetResourceUsed, networkResourceAsk, maxParallel2, currentPreemptionCount2)
    return distance1 < distance2
}
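// exampleNetworkScore works the network scoring formula with hypothetical
// numbers: a candidate using 100 MBits against an ask of 400 MBits has
// distance |(400-100)/400| = 0.75. With max_parallel=1 and one alloc of the
// same group already preempted, the 50-point penalty applies, giving 50.75.
func exampleNetworkScore() float64 {
    used := &structs.NetworkResource{MBits: 100}
    ask := &structs.NetworkResource{MBits: 400}
    return scoreForNetwork(used, ask, 1, 1) // 0.75 + 50 penalty
}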