github.com/banmanh482/nomad@v0.11.8/scheduler/preemption.go

     1  package scheduler
     2  
     3  import (
     4  	"math"
     5  	"sort"
     6  
     7  	"github.com/hashicorp/nomad/nomad/structs"
     8  )
     9  
    10  // maxParallelPenalty is a score penalty applied to an allocation to avoid
    11  // preempting too many allocations of the same job at once. The penalty is applied once the
    12  // number of allocations being preempted exceeds the max_parallel value in the job's migrate stanza.
    13  const maxParallelPenalty = 50.0
    14  
    15  type groupedAllocs struct {
    16  	priority int
    17  	allocs   []*structs.Allocation
    18  }
    19  
    20  type allocInfo struct {
    21  	maxParallel int
    22  	resources   *structs.ComparableResources
    23  }
    24  
    25  // PreemptionResource interface is implemented by different
    26  // types of resources.
    27  type PreemptionResource interface {
    28  	// MeetsRequirements returns true if the available resources match needed resources
    29  	MeetsRequirements() bool
    30  
    31  	// Distance returns values in the range [0, MaxFloat], lower is better
    32  	Distance() float64
    33  }
    34  
    35  // NetworkPreemptionResource implements PreemptionResource for network assignments
    36  // It only looks at MBits needed
    37  type NetworkPreemptionResource struct {
    38  	availableResources *structs.NetworkResource
    39  	resourceNeeded     *structs.NetworkResource
    40  }
    41  
    42  func (n *NetworkPreemptionResource) MeetsRequirements() bool {
    43  	mbitsAvailable := n.availableResources.MBits
    44  	mbitsNeeded := n.resourceNeeded.MBits
    45  	if mbitsAvailable == 0 || mbitsNeeded == 0 {
    46  		return false
    47  	}
    48  	return mbitsAvailable >= mbitsNeeded
    49  }
    50  
    51  func (n *NetworkPreemptionResource) Distance() float64 {
    52  	return networkResourceDistance(n.availableResources, n.resourceNeeded)
    53  }
    54  
    55  // BasePreemptionResource implements PreemptionResource for CPU/Memory/Disk
    56  type BasePreemptionResource struct {
    57  	availableResources *structs.ComparableResources
    58  	resourceNeeded     *structs.ComparableResources
    59  }
    60  
    61  func (b *BasePreemptionResource) MeetsRequirements() bool {
    62  	super, _ := b.availableResources.Superset(b.resourceNeeded)
    63  	return super
    64  }
    65  
    66  func (b *BasePreemptionResource) Distance() float64 {
    67  	return basicResourceDistance(b.resourceNeeded, b.availableResources)
    68  }
    69  
    70  // PreemptionResourceFactory returns a new PreemptionResource
    71  type PreemptionResourceFactory func(availableResources *structs.ComparableResources, resourceAsk *structs.ComparableResources) PreemptionResource
    72  
    73  // GetNetworkPreemptionResourceFactory returns a preemption resource factory for network assignments
    74  func GetNetworkPreemptionResourceFactory() PreemptionResourceFactory {
    75  	return func(availableResources *structs.ComparableResources, resourceNeeded *structs.ComparableResources) PreemptionResource {
    76  		available := availableResources.Flattened.Networks[0]
    77  		return &NetworkPreemptionResource{
    78  			availableResources: available,
    79  			resourceNeeded:     resourceNeeded.Flattened.Networks[0],
    80  		}
    81  	}
    82  }
    83  
    84  // GetBasePreemptionResourceFactory returns a preemption resource factory for CPU/Memory/Disk
    85  func GetBasePreemptionResourceFactory() PreemptionResourceFactory {
    86  	return func(availableResources *structs.ComparableResources, resourceNeeded *structs.ComparableResources) PreemptionResource {
    87  		return &BasePreemptionResource{
    88  			availableResources: availableResources,
    89  			resourceNeeded:     resourceNeeded,
    90  		}
    91  	}
    92  }
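
        // A minimal sketch of how a factory is used (hypothetical avail/ask
        // values, both *structs.ComparableResources computed elsewhere):
        //
        //	pr := GetBasePreemptionResourceFactory()(avail, ask)
        //	met := pr.MeetsRequirements() // true if avail is a superset of ask
        //	score := pr.Distance()        // lower is better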
    93  
    94  // Preemptor is used to track existing allocations
    95  // and find suitable allocations to preempt
    96  type Preemptor struct {
    97  
    98  	// currentPreemptions is a map computed when SetPreemptions is called
    99  	// it tracks the number of preempted allocations per job/taskgroup
   100  	currentPreemptions map[structs.NamespacedID]map[string]int
   101  
   102  	// allocDetails is a map computed when SetCandidates is called
   103  	// it stores some precomputed details about the allocation needed
   104  	// when scoring it for preemption
   105  	allocDetails map[string]*allocInfo
   106  
   107  	// jobPriority is the priority of the job being placed
   108  	jobPriority int
   109  
   110  	// jobID is the ID of the job being placed
   111  	jobID *structs.NamespacedID
   112  
   113  	// nodeRemainingResources tracks available resources on the node after
   114  	// accounting for running allocations
   115  	nodeRemainingResources *structs.ComparableResources
   116  
   117  	// currentAllocs is the candidate set used to find preemptible allocations
   118  	currentAllocs []*structs.Allocation
   119  
   120  	// ctx is the context from the scheduler stack
   121  	ctx Context
   122  }
   123  
   124  func NewPreemptor(jobPriority int, ctx Context, jobID *structs.NamespacedID) *Preemptor {
   125  	return &Preemptor{
   126  		currentPreemptions: make(map[structs.NamespacedID]map[string]int),
   127  		jobPriority:        jobPriority,
   128  		jobID:              jobID,
   129  		allocDetails:       make(map[string]*allocInfo),
   130  		ctx:                ctx,
   131  	}
   132  }
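
        // A rough usage sketch (hypothetical inputs; the scheduler stack wires
        // these up from the node and plan under evaluation):
        //
        //	p := NewPreemptor(jobPriority, ctx, jobID)
        //	p.SetNode(node)                  // node being scored
        //	p.SetCandidates(nodeAllocs)      // allocs running on that node
        //	p.SetPreemptions(plannedAllocs)  // preemptions already in the plan
        //	preempted := p.PreemptForTaskGroup(resourceAsk)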
   133  
   134  // SetNode sets the node and computes its remaining resources after subtracting reserved resources
   135  func (p *Preemptor) SetNode(node *structs.Node) {
   136  	nodeRemainingResources := node.ComparableResources()
   137  
   138  	// Subtract the reserved resources of the node
   139  	if c := node.ComparableReservedResources(); c != nil {
   140  		nodeRemainingResources.Subtract(c)
   141  	}
   142  	p.nodeRemainingResources = nodeRemainingResources
   143  }
   144  
   145  // SetCandidates initializes the candidate set from which preemptions are chosen
   146  func (p *Preemptor) SetCandidates(allocs []*structs.Allocation) {
   147  	// Reset candidate set
   148  	p.currentAllocs = []*structs.Allocation{}
   149  	for _, alloc := range allocs {
   150  		// Ignore any allocations of the job being placed
   151  		// This filters out any previous allocs of the job, and any new allocs in the plan
   152  		if alloc.JobID == p.jobID.ID && alloc.Namespace == p.jobID.Namespace {
   153  			continue
   154  		}
   155  
   156  		maxParallel := 0
   157  		tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
   158  		if tg != nil && tg.Migrate != nil {
   159  			maxParallel = tg.Migrate.MaxParallel
   160  		}
   161  		p.allocDetails[alloc.ID] = &allocInfo{maxParallel: maxParallel, resources: alloc.ComparableResources()}
   162  		p.currentAllocs = append(p.currentAllocs, alloc)
   163  	}
   164  }
   165  
   166  // SetPreemptions initializes a map tracking existing counts of preempted allocations
   167  // per job/task group. This is used while scoring preemption options
   168  func (p *Preemptor) SetPreemptions(allocs []*structs.Allocation) {
   169  
   170  	// Clear out existing values since this can be called more than once
   171  	p.currentPreemptions = make(map[structs.NamespacedID]map[string]int)
   172  
   173  	// Initialize counts
   174  	for _, alloc := range allocs {
   175  		id := structs.NewNamespacedID(alloc.JobID, alloc.Namespace)
   176  		countMap, ok := p.currentPreemptions[id]
   177  		if !ok {
   178  			countMap = make(map[string]int)
   179  			p.currentPreemptions[id] = countMap
   180  		}
   181  		countMap[alloc.TaskGroup]++
   182  	}
   183  }
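
        // For example (illustrative values): two planned preemptions of job
        // "web", task group "frontend", in namespace "default" leave
        // currentPreemptions with a count of 2 under that job/task group key,
        // which getNumPreemptions then reports for any alloc of that group.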
   184  
   185  // getNumPreemptions counts the number of other allocations being preempted that match the job and task group of
   186  // the alloc under consideration. This is used as a scoring factor to avoid preempting too many allocs of the same job at once
   187  func (p *Preemptor) getNumPreemptions(alloc *structs.Allocation) int {
   188  	c, ok := p.currentPreemptions[structs.NewNamespacedID(alloc.JobID, alloc.Namespace)][alloc.TaskGroup]
   189  	if !ok {
   190  		return 0
   191  	}
   192  	return c
   193  }
   194  
   195  // PreemptForTaskGroup computes a list of allocations to preempt to accommodate
   196  // the resources asked for. Only allocs from jobs whose priority is at least 10 below jobPriority are considered.
   197  // This method is meant only for finding preemptible allocations based on CPU/Memory/Disk.
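        // At a high level: candidate allocs are grouped by job priority (lowest
        // first); within each group, the alloc with the best scoreForTaskGroup
        // distance is repeatedly picked until the ask can be met, and a final
        // filterSuperset pass trims unnecessary preemptions.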
   198  func (p *Preemptor) PreemptForTaskGroup(resourceAsk *structs.AllocatedResources) []*structs.Allocation {
   199  	resourcesNeeded := resourceAsk.Comparable()
   200  
   201  	// Subtract current allocations
   202  	for _, alloc := range p.currentAllocs {
   203  		allocResources := p.allocDetails[alloc.ID].resources
   204  		p.nodeRemainingResources.Subtract(allocResources)
   205  	}
   206  
   207  	// Group candidates by priority, filter out ineligible allocs
   208  	allocsByPriority := filterAndGroupPreemptibleAllocs(p.jobPriority, p.currentAllocs)
   209  
   210  	var bestAllocs []*structs.Allocation
   211  	allRequirementsMet := false
   212  
   213  	// Initialize variable to track resources as they become available from preemption
   214  	availableResources := p.nodeRemainingResources.Copy()
   215  
   216  	resourcesAsked := resourceAsk.Comparable()
   217  	// Iterate over allocations grouped by priority to find preemptible allocations
   218  	for _, allocGrp := range allocsByPriority {
   219  		for len(allocGrp.allocs) > 0 && !allRequirementsMet {
   220  			closestAllocIndex := -1
   221  			bestDistance := math.MaxFloat64
   222  			// Find the alloc with the closest distance
   223  			for index, alloc := range allocGrp.allocs {
   224  				currentPreemptionCount := p.getNumPreemptions(alloc)
   225  				allocDetails := p.allocDetails[alloc.ID]
   226  				maxParallel := allocDetails.maxParallel
   227  				distance := scoreForTaskGroup(resourcesNeeded, allocDetails.resources, maxParallel, currentPreemptionCount)
   228  				if distance < bestDistance {
   229  					bestDistance = distance
   230  					closestAllocIndex = index
   231  				}
   232  			}
   233  			closestAlloc := allocGrp.allocs[closestAllocIndex]
   234  			closestResources := p.allocDetails[closestAlloc.ID].resources
   235  			availableResources.Add(closestResources)
   236  
   237  			// This step needs the original resources asked for as the second arg, can't use the running total
   238  			allRequirementsMet, _ = availableResources.Superset(resourcesAsked)
   239  
   240  			bestAllocs = append(bestAllocs, closestAlloc)
   241  
   242  			allocGrp.allocs[closestAllocIndex] = allocGrp.allocs[len(allocGrp.allocs)-1]
   243  			allocGrp.allocs = allocGrp.allocs[:len(allocGrp.allocs)-1]
   244  
   245  			// This is the remaining total of resources needed
   246  			resourcesNeeded.Subtract(closestResources)
   247  		}
   248  		if allRequirementsMet {
   249  			break
   250  		}
   251  	}
   252  
   253  	// Early return if all allocs examined and requirements were not met
   254  	if !allRequirementsMet {
   255  		return nil
   256  	}
   257  
   258  	// We do another pass to eliminate unnecessary preemptions
   259  	// This filters out allocs whose resources are already covered by another alloc
   260  	basePreemptionResource := GetBasePreemptionResourceFactory()
   261  	resourcesNeeded = resourceAsk.Comparable()
   262  	filteredBestAllocs := p.filterSuperset(bestAllocs, p.nodeRemainingResources, resourcesNeeded, basePreemptionResource)
   263  	return filteredBestAllocs
   264  
   265  }
   266  
   267  // PreemptForNetwork tries to find allocations to preempt to meet network resources.
   268  // This is called once per task when assigning a network to the task. While finding allocations
   269  // to preempt, this only considers allocations that share the same network device
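        // In outline: candidate allocs are bucketed by network device; for each
        // device, allocs holding needed reserved ports are preempted first, then
        // remaining allocs in priority order until enough bandwidth is freed,
        // and a final filterSuperset pass trims unnecessary preemptions.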
   270  func (p *Preemptor) PreemptForNetwork(networkResourceAsk *structs.NetworkResource, netIdx *structs.NetworkIndex) []*structs.Allocation {
   271  
   272  	// Early return if there are no current allocs
   273  	if len(p.currentAllocs) == 0 {
   274  		return nil
   275  	}
   276  
   277  	deviceToAllocs := make(map[string][]*structs.Allocation)
   278  	mbitsNeeded := networkResourceAsk.MBits
   279  	reservedPortsNeeded := networkResourceAsk.ReservedPorts
   280  
   281  	// Build map of reserved ports needed for fast access
   282  	reservedPorts := make(map[int]struct{})
   283  	for _, port := range reservedPortsNeeded {
   284  		reservedPorts[port.Value] = struct{}{}
   285  	}
   286  
   287  	// filteredReservedPorts tracks reserved ports that are
   288  	// currently used by higher priority allocations that can't
   289  	// be preempted
   290  	filteredReservedPorts := make(map[string]map[int]struct{})
   291  
   292  	// Create a map from each device to allocs
   293  	// We can only preempt within allocations that
   294  	// are using the same device
   295  	for _, alloc := range p.currentAllocs {
   296  		if alloc.Job == nil {
   297  			continue
   298  		}
   299  
   300  		allocResources := p.allocDetails[alloc.ID].resources
   301  		networks := allocResources.Flattened.Networks
   302  		if len(networks) == 0 {
   303  			continue
   304  		}
   305  
   306  		// We only check the first network resource (TODO: handle multiple networks)
   307  		net := networks[0]
   308  
   309  		// Filter out alloc that's ineligible due to priority
   310  		if p.jobPriority-alloc.Job.Priority < 10 {
   311  			// Populate any reserved ports used by
   312  			// this allocation that cannot be preempted
   313  			for _, port := range net.ReservedPorts {
   314  				portMap, ok := filteredReservedPorts[net.Device]
   315  				if !ok {
   316  					portMap = make(map[int]struct{})
   317  					filteredReservedPorts[net.Device] = portMap
   318  				}
   319  				portMap[port.Value] = struct{}{}
   320  			}
   321  			continue
   322  		}
   323  
   324  		// Group the alloc under the device of its first network
   325  		device := networks[0].Device
   326  		allocsForDevice := deviceToAllocs[device]
   327  		allocsForDevice = append(allocsForDevice, alloc)
   328  		deviceToAllocs[device] = allocsForDevice
   329  	}
   330  
   331  	// If no existing allocations use network resources, return early
   332  	if len(deviceToAllocs) == 0 {
   333  		return nil
   334  	}
   335  
   336  	var allocsToPreempt []*structs.Allocation
   337  	met := false
   338  	freeBandwidth := 0
   339  	preemptedDevice := ""
   340  
   341  OUTER:
   342  	for device, currentAllocs := range deviceToAllocs {
   343  		preemptedDevice = device
   344  		totalBandwidth := netIdx.AvailBandwidth[device]
   345  
   346  		// If the device doesn't have enough total available bandwidth, skip
   347  		if totalBandwidth < mbitsNeeded {
   348  			continue
   349  		}
   350  
   351  		// Track how much existing free bandwidth we have before preemption
   352  		freeBandwidth = totalBandwidth - netIdx.UsedBandwidth[device]
   353  
   354  		preemptedBandwidth := 0
   355  
   356  		// Reset allocsToPreempt since we don't want to preempt across devices for the same task
   357  		allocsToPreempt = nil
   358  
   359  		// usedPortToAlloc tracks used ports by allocs in this device
   360  		usedPortToAlloc := make(map[int]*structs.Allocation)
   361  
   362  		// First try to satisfy needed reserved ports
   363  		if len(reservedPortsNeeded) > 0 {
   364  
   365  			// Populate usedPort map
   366  			for _, alloc := range currentAllocs {
   367  				allocResources := p.allocDetails[alloc.ID].resources
   368  				for _, n := range allocResources.Flattened.Networks {
   369  					ports := n.ReservedPorts
   370  					for _, port := range ports {
   371  						usedPortToAlloc[port.Value] = alloc
   372  					}
   373  				}
   374  			}
   375  			// Look for allocs that are using reserved ports needed
   376  			for _, port := range reservedPortsNeeded {
   377  				alloc, ok := usedPortToAlloc[port.Value]
   378  				if ok {
   379  					allocResources := p.allocDetails[alloc.ID].resources
   380  					preemptedBandwidth += allocResources.Flattened.Networks[0].MBits
   381  					allocsToPreempt = append(allocsToPreempt, alloc)
   382  				} else {
   383  					// Check if a higher priority allocation is using this port
   384  					// It can't be preempted, so we skip to the next device
   385  					_, ok := filteredReservedPorts[device][port.Value]
   386  					if ok {
   387  						continue OUTER
   388  					}
   389  				}
   390  			}
   391  
   392  			// Remove allocs that were preempted to satisfy reserved ports
   393  			currentAllocs = structs.RemoveAllocs(currentAllocs, allocsToPreempt)
   394  		}
   395  
   396  		// If bandwidth requirements have been met, stop
   397  		if preemptedBandwidth+freeBandwidth >= mbitsNeeded {
   398  			met = true
   399  			break OUTER
   400  		}
   401  
   402  		// Split by priority
   403  		allocsByPriority := filterAndGroupPreemptibleAllocs(p.jobPriority, currentAllocs)
   404  
   405  		for _, allocsGrp := range allocsByPriority {
   406  			allocs := allocsGrp.allocs
   407  
   408  			// Sort by distance function
   409  			sort.Slice(allocs, func(i, j int) bool {
   410  				return p.distanceComparatorForNetwork(allocs, networkResourceAsk, i, j)
   411  			})
   412  
   413  			// Iterate over allocs until the requirements have been met or we run out of allocs
   414  			for _, alloc := range allocs {
   415  				allocResources := p.allocDetails[alloc.ID].resources
   416  				preemptedBandwidth += allocResources.Flattened.Networks[0].MBits
   417  				allocsToPreempt = append(allocsToPreempt, alloc)
   418  				if preemptedBandwidth+freeBandwidth >= mbitsNeeded {
   419  					met = true
   420  					break OUTER
   421  				}
   422  			}
   423  
   424  		}
   425  
   426  	}
   427  
   428  	// Early return if we could not meet resource needs after examining allocs
   429  	if !met {
   430  		return nil
   431  	}
   432  
   433  	// Build a resource object with just the network Mbits filled in
   434  	nodeRemainingResources := &structs.ComparableResources{
   435  		Flattened: structs.AllocatedTaskResources{
   436  			Networks: []*structs.NetworkResource{
   437  				{
   438  					Device: preemptedDevice,
   439  					MBits:  freeBandwidth,
   440  				},
   441  			},
   442  		},
   443  	}
   444  
   445  	// Do a final pass to eliminate any superset allocations
   446  	preemptionResourceFactory := GetNetworkPreemptionResourceFactory()
   447  	resourcesNeeded := &structs.ComparableResources{
   448  		Flattened: structs.AllocatedTaskResources{
   449  			Networks: []*structs.NetworkResource{networkResourceAsk},
   450  		},
   451  	}
   452  	filteredBestAllocs := p.filterSuperset(allocsToPreempt, nodeRemainingResources, resourcesNeeded, preemptionResourceFactory)
   453  	return filteredBestAllocs
   454  }
   455  
   456  // deviceGroupAllocs represents a group of allocs that share a device
   457  type deviceGroupAllocs struct {
   458  	allocs []*structs.Allocation
   459  
   460  	// deviceInstances tracks the number of instances used per alloc
   461  	deviceInstances map[string]int
   462  }
   463  
   464  func newAllocDeviceGroup() *deviceGroupAllocs {
   465  	return &deviceGroupAllocs{
   466  		deviceInstances: make(map[string]int),
   467  	}
   468  }
   469  
   470  // PreemptForDevice tries to find allocations to preempt to meet devices needed
   471  // This is called once per device request when assigning devices to the task
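        // In outline: allocs are grouped by matching device instance; for each
        // instance, allocs are preempted in priority order until enough device
        // instances would be free, and selectBestAllocs then picks the option
        // with the lowest net priority across instances.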
   472  func (p *Preemptor) PreemptForDevice(ask *structs.RequestedDevice, devAlloc *deviceAllocator) []*structs.Allocation {
   473  
   474  	// Group allocations by device, tracking the number of
   475  	// instances used in each device by alloc id
   476  	deviceToAllocs := make(map[structs.DeviceIdTuple]*deviceGroupAllocs)
   477  	for _, alloc := range p.currentAllocs {
   478  		for _, tr := range alloc.AllocatedResources.Tasks {
   479  			// Ignore allocs that don't use devices
   480  			if len(tr.Devices) == 0 {
   481  				continue
   482  			}
   483  
   484  			// Go through each assigned device group
   485  			for _, device := range tr.Devices {
   486  				// Look up the device instance from the device allocator
   487  				deviceIdTuple := *device.ID()
   488  				devInst := devAlloc.Devices[deviceIdTuple]
   489  
   490  				// devInst can be nil if the device is no longer healthy
   491  				if devInst == nil {
   492  					continue
   493  				}
   494  
   495  				// Ignore if the device doesn't match the ask
   496  				if !nodeDeviceMatches(p.ctx, devInst.Device, ask) {
   497  					continue
   498  				}
   499  
   500  				// Store both the alloc and the number of instances used
   501  				// in our tracking map
   502  				allocDeviceGrp := deviceToAllocs[deviceIdTuple]
   503  				if allocDeviceGrp == nil {
   504  					allocDeviceGrp = newAllocDeviceGroup()
   505  					deviceToAllocs[deviceIdTuple] = allocDeviceGrp
   506  				}
   507  				allocDeviceGrp.allocs = append(allocDeviceGrp.allocs, alloc)
   508  				allocDeviceGrp.deviceInstances[alloc.ID] += len(device.DeviceIDs)
   509  			}
   510  		}
   511  	}
   512  
   513  	neededCount := ask.Count
   514  
   515  	var preemptionOptions []*deviceGroupAllocs
   516  	// Examine matching allocs by device
   517  OUTER:
   518  	for deviceIDTuple, allocsGrp := range deviceToAllocs {
   519  		// First group and sort allocations using this device by priority
   520  		allocsByPriority := filterAndGroupPreemptibleAllocs(p.jobPriority, allocsGrp.allocs)
   521  
   522  		// Reset preempted count for this device
   523  		preemptedCount := 0
   524  
   525  		// Initialize slice of preempted allocations
   526  		var preemptedAllocs []*structs.Allocation
   527  
   528  		for _, grpAllocs := range allocsByPriority {
   529  			for _, alloc := range grpAllocs.allocs {
   530  				// Look up the device instance from the device allocator
   531  				devInst := devAlloc.Devices[deviceIDTuple]
   532  
   533  				// Add to preemption list because this device matches
   534  				preemptedCount += allocsGrp.deviceInstances[alloc.ID]
   535  				preemptedAllocs = append(preemptedAllocs, alloc)
   536  
   537  				// Check if we met needed count
   538  				if preemptedCount+devInst.FreeCount() >= int(neededCount) {
   539  					preemptionOptions = append(preemptionOptions, &deviceGroupAllocs{
   540  						allocs:          preemptedAllocs,
   541  						deviceInstances: allocsGrp.deviceInstances,
   542  					})
   543  					continue OUTER
   544  				}
   545  			}
   546  		}
   547  	}
   548  
   549  	// Find the combination of allocs with lowest net priority
   550  	if len(preemptionOptions) > 0 {
   551  		return selectBestAllocs(preemptionOptions, int(neededCount))
   552  	}
   553  
   554  	return nil
   555  }
   556  
   557  // selectBestAllocs finds the best allocations based on minimal net priority amongst
   558  // all options. The net priority is the sum of unique priorities in each option
   559  func selectBestAllocs(preemptionOptions []*deviceGroupAllocs, neededCount int) []*structs.Allocation {
   560  	bestPriority := math.MaxInt32
   561  	var bestAllocs []*structs.Allocation
   562  
   563  	// We iterate over allocations in priority order, so it's possible
   564  	// that we have more allocations than needed to meet the needed count.
   565  	// e.g. we need 4 instances, and we get 3 from a priority 10 alloc, and 4 from
   566  	// a priority 20 alloc. We should filter out the priority 10 alloc in that case.
   567  	// This loop does a filter and chooses the set with the smallest net priority
   568  	for _, allocGrp := range preemptionOptions {
   569  		// Find unique priorities and add them to calculate net priority
   570  		priorities := map[int]struct{}{}
   571  		netPriority := 0
   572  
   573  		devInst := allocGrp.deviceInstances
   574  		var filteredAllocs []*structs.Allocation
   575  
   576  		// Sort by number of device instances used, descending
   577  		sort.Slice(allocGrp.allocs, func(i, j int) bool {
   578  			instanceCount1 := devInst[allocGrp.allocs[i].ID]
   579  			instanceCount2 := devInst[allocGrp.allocs[j].ID]
   580  			return instanceCount1 > instanceCount2
   581  		})
   582  
   583  		// Filter and calculate net priority
   584  		preemptedInstanceCount := 0
   585  		for _, alloc := range allocGrp.allocs {
   586  			if preemptedInstanceCount >= neededCount {
   587  				break
   588  			}
   589  			instanceCount := devInst[alloc.ID]
   590  			preemptedInstanceCount += instanceCount
   591  			filteredAllocs = append(filteredAllocs, alloc)
   592  			_, ok := priorities[alloc.Job.Priority]
   593  			if !ok {
   594  				priorities[alloc.Job.Priority] = struct{}{}
   595  				netPriority += alloc.Job.Priority
   596  			}
   597  		}
   598  		if netPriority < bestPriority {
   599  			bestPriority = netPriority
   600  			bestAllocs = filteredAllocs
   601  		}
   602  	}
   603  	return bestAllocs
   604  }
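
        // For example (illustrative numbers): an option preempting allocs from
        // jobs at priorities {10, 10, 20} has net priority 10+20 = 30, since
        // each distinct priority counts once; an option with net priority 20
        // would be chosen over it.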
   605  
   606  // basicResourceDistance computes a Euclidean distance over normalized CPU, memory, and disk coordinates.
   607  // Values emitted are in the range [0, MaxFloat]
   608  func basicResourceDistance(resourceAsk *structs.ComparableResources, resourceUsed *structs.ComparableResources) float64 {
   609  	memoryCoord, cpuCoord, diskMBCoord := 0.0, 0.0, 0.0
   610  	if resourceAsk.Flattened.Memory.MemoryMB > 0 {
   611  		memoryCoord = (float64(resourceAsk.Flattened.Memory.MemoryMB) - float64(resourceUsed.Flattened.Memory.MemoryMB)) / float64(resourceAsk.Flattened.Memory.MemoryMB)
   612  	}
   613  	if resourceAsk.Flattened.Cpu.CpuShares > 0 {
   614  		cpuCoord = (float64(resourceAsk.Flattened.Cpu.CpuShares) - float64(resourceUsed.Flattened.Cpu.CpuShares)) / float64(resourceAsk.Flattened.Cpu.CpuShares)
   615  	}
   616  	if resourceAsk.Shared.DiskMB > 0 {
   617  		diskMBCoord = (float64(resourceAsk.Shared.DiskMB) - float64(resourceUsed.Shared.DiskMB)) / float64(resourceAsk.Shared.DiskMB)
   618  	}
   619  	originDist := math.Sqrt(
   620  		math.Pow(memoryCoord, 2) +
   621  			math.Pow(cpuCoord, 2) +
   622  			math.Pow(diskMBCoord, 2))
   623  	return originDist
   624  }
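
        // For example (illustrative numbers): an ask of 2048 MB memory and
        // 1024 MHz CPU scored against an alloc using 1024 MB and 512 MHz gives
        // memory and CPU coordinates of (2048-1024)/2048 = 0.5 and
        // (1024-512)/1024 = 0.5, so the distance is sqrt(0.5^2 + 0.5^2) ≈ 0.71.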
   625  
   626  // networkResourceDistance returns a distance based only on network megabits
   627  func networkResourceDistance(resourceUsed *structs.NetworkResource, resourceNeeded *structs.NetworkResource) float64 {
   628  	networkCoord := math.MaxFloat64
   629  	if resourceUsed != nil && resourceNeeded != nil {
   630  		networkCoord = float64(resourceNeeded.MBits-resourceUsed.MBits) / float64(resourceNeeded.MBits)
   631  	}
   632  
   633  	originDist := math.Abs(networkCoord)
   634  	return originDist
   635  }
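
        // For example (illustrative numbers): needing 100 MBits against an
        // alloc using 80 MBits gives |(100-80)/100| = 0.2, while an alloc using
        // exactly 100 MBits gives 0, the best possible score.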
   636  
   637  // scoreForTaskGroup is used to calculate a score (lower is better) based on the distance between
   638  // the needed resource and requirements. A penalty is added when the choice already has some existing
   639  // allocations in the plan that are being preempted.
   640  func scoreForTaskGroup(resourceAsk *structs.ComparableResources, resourceUsed *structs.ComparableResources, maxParallel int, numPreemptedAllocs int) float64 {
   641  	maxParallelScorePenalty := 0.0
   642  	if maxParallel > 0 && numPreemptedAllocs >= maxParallel {
   643  		maxParallelScorePenalty = float64((numPreemptedAllocs+1)-maxParallel) * maxParallelPenalty
   644  	}
   645  	return basicResourceDistance(resourceAsk, resourceUsed) + maxParallelScorePenalty
   646  }
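
        // For example (illustrative numbers): with max_parallel = 2 and three
        // allocs of the same job/task group already being preempted, the
        // penalty is (3+1-2) * 50 = 100, added on top of the resource distance.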
   647  
   648  // scoreForNetwork is similar to scoreForTaskGroup
   649  // but only uses network Mbits to calculate a preemption score
   650  func scoreForNetwork(resourceUsed *structs.NetworkResource, resourceNeeded *structs.NetworkResource, maxParallel int, numPreemptedAllocs int) float64 {
   651  	if resourceUsed == nil || resourceNeeded == nil {
   652  		return math.MaxFloat64
   653  	}
   654  	maxParallelScorePenalty := 0.0
   655  	if maxParallel > 0 && numPreemptedAllocs >= maxParallel {
   656  		maxParallelScorePenalty = float64((numPreemptedAllocs+1)-maxParallel) * maxParallelPenalty
   657  	}
   658  	return networkResourceDistance(resourceUsed, resourceNeeded) + maxParallelScorePenalty
   659  }
   660  
   661  // filterAndGroupPreemptibleAllocs groups allocations by priority after filtering allocs
   662  // that are not preemptible based on the jobPriority arg
   663  func filterAndGroupPreemptibleAllocs(jobPriority int, current []*structs.Allocation) []*groupedAllocs {
   664  	allocsByPriority := make(map[int][]*structs.Allocation)
   665  	for _, alloc := range current {
   666  		if alloc.Job == nil {
   667  			continue
   668  		}
   669  
   670  		// Skip allocs whose priority is within a delta of 10
   671  		// This also skips any allocs of the current job
   672  		// for which we are attempting preemption
   673  		if jobPriority-alloc.Job.Priority < 10 {
   674  			continue
   675  		}
   676  		grpAllocs, ok := allocsByPriority[alloc.Job.Priority]
   677  		if !ok {
   678  			grpAllocs = make([]*structs.Allocation, 0)
   679  		}
   680  		grpAllocs = append(grpAllocs, alloc)
   681  		allocsByPriority[alloc.Job.Priority] = grpAllocs
   682  	}
   683  
   684  	var groupedSortedAllocs []*groupedAllocs
   685  	for priority, allocs := range allocsByPriority {
   686  		groupedSortedAllocs = append(groupedSortedAllocs, &groupedAllocs{
   687  			priority: priority,
   688  			allocs:   allocs})
   689  	}
   690  
   691  	// Sort by priority
   692  	sort.Slice(groupedSortedAllocs, func(i, j int) bool {
   693  		return groupedSortedAllocs[i].priority < groupedSortedAllocs[j].priority
   694  	})
   695  
   696  	return groupedSortedAllocs
   697  }
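
        // For example (illustrative numbers): with jobPriority 70 and candidates
        // from jobs at priorities 40, 50, and 65, the priority 65 allocs are
        // filtered out (70-65 < 10) and the rest come back in two groups,
        // priority 40 first.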
   698  
   699  // filterSuperset is used as a final step to trim the set of allocations
   700  // to preempt: allocs are re-added in descending distance order, and any
   701  // allocs left over once requirements are met are dropped from the set
   702  func (p *Preemptor) filterSuperset(bestAllocs []*structs.Allocation,
   703  	nodeRemainingResources *structs.ComparableResources,
   704  	resourceAsk *structs.ComparableResources,
   705  	preemptionResourceFactory PreemptionResourceFactory) []*structs.Allocation {
   706  
   707  	// Sort bestAllocs by distance descending (without penalty)
   708  	sort.Slice(bestAllocs, func(i, j int) bool {
   709  		a1Resources := p.allocDetails[bestAllocs[i].ID].resources
   710  		a2Resources := p.allocDetails[bestAllocs[j].ID].resources
   711  		distance1 := preemptionResourceFactory(a1Resources, resourceAsk).Distance()
   712  		distance2 := preemptionResourceFactory(a2Resources, resourceAsk).Distance()
   713  		return distance1 > distance2
   714  	})
   715  
   716  	availableResources := nodeRemainingResources.Copy()
   717  	var filteredBestAllocs []*structs.Allocation
   718  
   719  	// Do another pass to eliminate allocations that are a superset of other allocations
   720  	// in the preemption set
   721  	for _, alloc := range bestAllocs {
   722  		filteredBestAllocs = append(filteredBestAllocs, alloc)
   723  		allocResources := p.allocDetails[alloc.ID].resources
   724  		availableResources.Add(allocResources)
   725  
   726  		preemptionResource := preemptionResourceFactory(availableResources, resourceAsk)
   727  		requirementsMet := preemptionResource.MeetsRequirements()
   728  		if requirementsMet {
   729  			break
   730  		}
   731  	}
   732  	return filteredBestAllocs
   733  }
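
        // For example (illustrative numbers, assuming the node has nothing else
        // free): with an ask of 3 GB memory and a preemption set freeing 5 GB,
        // 2 GB, and 2 GB, the 5 GB alloc sorts first (largest distance) and
        // alone covers the ask, so both 2 GB allocs are dropped.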
   734  
   735  // distanceComparatorForNetwork is used as the sorting function when finding allocations to preempt. It uses
   736  // both a coordinate distance function based on MBits needed, and a penalty if the allocation under consideration
   737  // belongs to a job that already has other allocations being preempted
   738  func (p *Preemptor) distanceComparatorForNetwork(allocs []*structs.Allocation, networkResourceAsk *structs.NetworkResource, i int, j int) bool {
   739  	firstAlloc := allocs[i]
   740  	currentPreemptionCount1 := p.getNumPreemptions(firstAlloc)
   741  
   742  	// Look up the configured maxParallel value for each allocation's task group
   743  	var maxParallel1, maxParallel2 int
   744  	tg1 := allocs[i].Job.LookupTaskGroup(firstAlloc.TaskGroup)
   745  	if tg1 != nil && tg1.Migrate != nil {
   746  		maxParallel1 = tg1.Migrate.MaxParallel
   747  	}
   748  
   749  	// Dereference network usage on the first alloc if it's there
   750  	firstAllocResources := p.allocDetails[firstAlloc.ID].resources
   751  	firstAllocNetworks := firstAllocResources.Flattened.Networks
   752  	var firstAllocNetResourceUsed *structs.NetworkResource
   753  	if len(firstAllocNetworks) > 0 {
   754  		firstAllocNetResourceUsed = firstAllocNetworks[0]
   755  	}
   756  
   757  	distance1 := scoreForNetwork(firstAllocNetResourceUsed, networkResourceAsk, maxParallel1, currentPreemptionCount1)
   758  
   759  	secondAlloc := allocs[j]
   760  	currentPreemptionCount2 := p.getNumPreemptions(secondAlloc)
   761  	tg2 := secondAlloc.Job.LookupTaskGroup(secondAlloc.TaskGroup)
   762  	if tg2 != nil && tg2.Migrate != nil {
   763  		maxParallel2 = tg2.Migrate.MaxParallel
   764  	}
   765  
   766  	// Dereference network usage on the second alloc if it's there
   767  	secondAllocResources := p.allocDetails[secondAlloc.ID].resources
   768  	secondAllocNetworks := secondAllocResources.Flattened.Networks
   769  	var secondAllocNetResourceUsed *structs.NetworkResource
   770  	if len(secondAllocNetworks) > 0 {
   771  		secondAllocNetResourceUsed = secondAllocNetworks[0]
   772  	}
   773  
   774  	distance2 := scoreForNetwork(secondAllocNetResourceUsed, networkResourceAsk, maxParallel2, currentPreemptionCount2)
   775  	return distance1 < distance2
   776  }