k8s.io/kubernetes@v1.29.3/pkg/kubelet/cm/cpumanager/cpu_assignment.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cpumanager

import (
	"fmt"
	"math"
	"sort"

	"k8s.io/klog/v2"

	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
	"k8s.io/utils/cpuset"
)

// LoopControl controls the behavior of the cpu accumulator loop logic
type LoopControl int

// Possible loop control outcomes
const (
	Continue LoopControl = iota
	Break
)

type mapIntInt map[int]int

func (m mapIntInt) Clone() mapIntInt {
	cp := make(mapIntInt, len(m))
	for k, v := range m {
		cp[k] = v
	}
	return cp
}

func (m mapIntInt) Keys() []int {
	var keys []int
	for k := range m {
		keys = append(keys, k)
	}
	return keys
}

func (m mapIntInt) Values(keys ...int) []int {
	if keys == nil {
		keys = m.Keys()
	}
	var values []int
	for _, k := range keys {
		values = append(values, m[k])
	}
	return values
}

func sum(xs []int) int {
	var s int
	for _, x := range xs {
		s += x
	}
	return s
}

func mean(xs []int) float64 {
	var sum float64
	for _, x := range xs {
		sum += float64(x)
	}
	m := sum / float64(len(xs))
	return math.Round(m*1000) / 1000
}

func standardDeviation(xs []int) float64 {
	m := mean(xs)
	var sum float64
	for _, x := range xs {
		sum += (float64(x) - m) * (float64(x) - m)
	}
	s := math.Sqrt(sum / float64(len(xs)))
	return math.Round(s*1000) / 1000
}
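
// For example, mean([2, 4, 4, 4, 5, 5, 7, 9]) returns 5 and
// standardDeviation([2, 4, 4, 4, 5, 5, 7, 9]) returns 2 (the population
// standard deviation); both helpers round their result to three decimal
// places.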

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}

type numaOrSocketsFirstFuncs interface {
	takeFullFirstLevel()
	takeFullSecondLevel()
	sortAvailableNUMANodes() []int
	sortAvailableSockets() []int
	sortAvailableCores() []int
}

type numaFirst struct{ acc *cpuAccumulator }
type socketsFirst struct{ acc *cpuAccumulator }

var _ numaOrSocketsFirstFuncs = (*numaFirst)(nil)
var _ numaOrSocketsFirstFuncs = (*socketsFirst)(nil)

// If NUMA nodes are higher in the memory hierarchy than sockets, then we take
// from the set of NUMA Nodes as the first level.
func (n *numaFirst) takeFullFirstLevel() {
	n.acc.takeFullNUMANodes()
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then we take
// from the set of sockets as the second level.
func (n *numaFirst) takeFullSecondLevel() {
	n.acc.takeFullSockets()
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then just
// sort the NUMA nodes directly, and return them.
func (n *numaFirst) sortAvailableNUMANodes() []int {
	numas := n.acc.details.NUMANodes().UnsortedList()
	n.acc.sort(numas, n.acc.details.CPUsInNUMANodes)
	return numas
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then we need
// to pull the set of sockets out of each sorted NUMA node, and accumulate the
// partial order across them.
func (n *numaFirst) sortAvailableSockets() []int {
	var result []int
	for _, numa := range n.sortAvailableNUMANodes() {
		sockets := n.acc.details.SocketsInNUMANodes(numa).UnsortedList()
		n.acc.sort(sockets, n.acc.details.CPUsInSockets)
		result = append(result, sockets...)
	}
	return result
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then
// cores sit directly below sockets in the memory hierarchy.
func (n *numaFirst) sortAvailableCores() []int {
	var result []int
	for _, socket := range n.acc.sortAvailableSockets() {
		cores := n.acc.details.CoresInSockets(socket).UnsortedList()
		n.acc.sort(cores, n.acc.details.CPUsInCores)
		result = append(result, cores...)
	}
	return result
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then we take
// from the set of sockets as the first level.
func (s *socketsFirst) takeFullFirstLevel() {
	s.acc.takeFullSockets()
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then we take
// from the set of NUMA Nodes as the second level.
func (s *socketsFirst) takeFullSecondLevel() {
	s.acc.takeFullNUMANodes()
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then we need
// to pull the set of NUMA nodes out of each sorted Socket, and accumulate the
// partial order across them.
func (s *socketsFirst) sortAvailableNUMANodes() []int {
	var result []int
	for _, socket := range s.sortAvailableSockets() {
		numas := s.acc.details.NUMANodesInSockets(socket).UnsortedList()
		s.acc.sort(numas, s.acc.details.CPUsInNUMANodes)
		result = append(result, numas...)
	}
	return result
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then just
// sort the sockets directly, and return them.
func (s *socketsFirst) sortAvailableSockets() []int {
	sockets := s.acc.details.Sockets().UnsortedList()
	s.acc.sort(sockets, s.acc.details.CPUsInSockets)
	return sockets
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then cores
// sit directly below NUMA Nodes in the memory hierarchy.
func (s *socketsFirst) sortAvailableCores() []int {
	var result []int
	for _, numa := range s.acc.sortAvailableNUMANodes() {
		cores := s.acc.details.CoresInNUMANodes(numa).UnsortedList()
		s.acc.sort(cores, s.acc.details.CPUsInCores)
		result = append(result, cores...)
	}
	return result
}

type cpuAccumulator struct {
	topo               *topology.CPUTopology
	details            topology.CPUDetails
	numCPUsNeeded      int
	result             cpuset.CPUSet
	numaOrSocketsFirst numaOrSocketsFirstFuncs
}

func newCPUAccumulator(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) *cpuAccumulator {
	acc := &cpuAccumulator{
		topo:          topo,
		details:       topo.CPUDetails.KeepOnly(availableCPUs),
		numCPUsNeeded: numCPUs,
		result:        cpuset.New(),
	}

	if topo.NumSockets >= topo.NumNUMANodes {
		acc.numaOrSocketsFirst = &numaFirst{acc}
	} else {
		acc.numaOrSocketsFirst = &socketsFirst{acc}
	}

	return acc
}
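
// For example, on a dual-socket machine exposing one NUMA node per socket
// (NumSockets >= NumNUMANodes), whole NUMA nodes are taken before whole
// sockets (numaFirst). On a machine where each socket exposes several NUMA
// nodes, e.g. with sub-NUMA clustering enabled (NumSockets < NumNUMANodes),
// whole sockets are taken first (socketsFirst).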

// Returns true if the supplied NUMA node is fully available in `a.details`.
func (a *cpuAccumulator) isNUMANodeFree(numaID int) bool {
	return a.details.CPUsInNUMANodes(numaID).Size() == a.topo.CPUDetails.CPUsInNUMANodes(numaID).Size()
}

// Returns true if the supplied socket is fully available in `a.details`.
func (a *cpuAccumulator) isSocketFree(socketID int) bool {
	return a.details.CPUsInSockets(socketID).Size() == a.topo.CPUsPerSocket()
}

// Returns true if the supplied core is fully available in `a.details`.
func (a *cpuAccumulator) isCoreFree(coreID int) bool {
	return a.details.CPUsInCores(coreID).Size() == a.topo.CPUsPerCore()
}

// Returns free NUMA Node IDs as a slice sorted by sortAvailableNUMANodes().
func (a *cpuAccumulator) freeNUMANodes() []int {
	free := []int{}
	for _, numa := range a.sortAvailableNUMANodes() {
		if a.isNUMANodeFree(numa) {
			free = append(free, numa)
		}
	}
	return free
}

// Returns free socket IDs as a slice sorted by sortAvailableSockets().
func (a *cpuAccumulator) freeSockets() []int {
	free := []int{}
	for _, socket := range a.sortAvailableSockets() {
		if a.isSocketFree(socket) {
			free = append(free, socket)
		}
	}
	return free
}

// Returns free core IDs as a slice sorted by sortAvailableCores().
func (a *cpuAccumulator) freeCores() []int {
	free := []int{}
	for _, core := range a.sortAvailableCores() {
		if a.isCoreFree(core) {
			free = append(free, core)
		}
	}
	return free
}

// Returns free CPU IDs as a slice sorted by sortAvailableCPUs().
func (a *cpuAccumulator) freeCPUs() []int {
	return a.sortAvailableCPUs()
}

// Sorts the provided list of NUMA nodes/sockets/cores/cpus referenced in 'ids'
// by the number of available CPUs contained within them (smallest to largest).
// The 'getCPUs()' parameter defines the function that should be called to
// retrieve the list of available CPUs for the type being referenced. If two
// NUMA nodes/sockets/cores/cpus have the same number of available CPUs, they
// are sorted in ascending order by their id.
func (a *cpuAccumulator) sort(ids []int, getCPUs func(ids ...int) cpuset.CPUSet) {
	sort.Slice(ids,
		func(i, j int) bool {
			iCPUs := getCPUs(ids[i])
			jCPUs := getCPUs(ids[j])
			if iCPUs.Size() < jCPUs.Size() {
				return true
			}
			if iCPUs.Size() > jCPUs.Size() {
				return false
			}
			return ids[i] < ids[j]
		})
}
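
// For example (illustrative counts): given sockets [0, 1, 2] with 4, 2, and 2
// available CPUs respectively, sort(ids, a.details.CPUsInSockets) reorders the
// slice to [1, 2, 0]: sockets 1 and 2 tie on available CPUs and keep ascending
// ID order, while socket 0 comes last because it has the most CPUs available.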

// Sort all NUMA nodes with free CPUs.
func (a *cpuAccumulator) sortAvailableNUMANodes() []int {
	return a.numaOrSocketsFirst.sortAvailableNUMANodes()
}

// Sort all sockets with free CPUs.
func (a *cpuAccumulator) sortAvailableSockets() []int {
	return a.numaOrSocketsFirst.sortAvailableSockets()
}

// Sort all cores with free CPUs.
func (a *cpuAccumulator) sortAvailableCores() []int {
	return a.numaOrSocketsFirst.sortAvailableCores()
}

// Sort all available CPUs:
// - First by core using sortAvailableCores().
// - Then, within each core, in ascending order by CPU ID.
func (a *cpuAccumulator) sortAvailableCPUs() []int {
	var result []int
	for _, core := range a.sortAvailableCores() {
		cpus := a.details.CPUsInCores(core).UnsortedList()
		sort.Ints(cpus)
		result = append(result, cpus...)
	}
	return result
}

func (a *cpuAccumulator) take(cpus cpuset.CPUSet) {
	a.result = a.result.Union(cpus)
	a.details = a.details.KeepOnly(a.details.CPUs().Difference(a.result))
	a.numCPUsNeeded -= cpus.Size()
}

func (a *cpuAccumulator) takeFullNUMANodes() {
	for _, numa := range a.freeNUMANodes() {
		cpusInNUMANode := a.topo.CPUDetails.CPUsInNUMANodes(numa)
		if !a.needs(cpusInNUMANode.Size()) {
			continue
		}
		klog.V(4).InfoS("takeFullNUMANodes: claiming NUMA node", "numa", numa)
		a.take(cpusInNUMANode)
	}
}

func (a *cpuAccumulator) takeFullSockets() {
	for _, socket := range a.freeSockets() {
		cpusInSocket := a.topo.CPUDetails.CPUsInSockets(socket)
		if !a.needs(cpusInSocket.Size()) {
			continue
		}
		klog.V(4).InfoS("takeFullSockets: claiming socket", "socket", socket)
		a.take(cpusInSocket)
	}
}

func (a *cpuAccumulator) takeFullCores() {
	for _, core := range a.freeCores() {
		cpusInCore := a.topo.CPUDetails.CPUsInCores(core)
		if !a.needs(cpusInCore.Size()) {
			continue
		}
		klog.V(4).InfoS("takeFullCores: claiming core", "core", core)
		a.take(cpusInCore)
	}
}

func (a *cpuAccumulator) takeRemainingCPUs() {
	for _, cpu := range a.sortAvailableCPUs() {
		klog.V(4).InfoS("takeRemainingCPUs: claiming CPU", "cpu", cpu)
		a.take(cpuset.New(cpu))
		if a.isSatisfied() {
			return
		}
	}
}

func (a *cpuAccumulator) rangeNUMANodesNeededToSatisfy(cpuGroupSize int) (int, int) {
	// Get the total number of NUMA nodes in the system.
	numNUMANodes := a.topo.CPUDetails.NUMANodes().Size()

	// Get the total number of NUMA nodes that have CPUs available on them.
	numNUMANodesAvailable := a.details.NUMANodes().Size()

	// Get the total number of CPUs in the system.
	numCPUs := a.topo.CPUDetails.CPUs().Size()

	// Get the total number of 'cpuGroups' in the system.
	numCPUGroups := (numCPUs-1)/cpuGroupSize + 1

	// Calculate the number of 'cpuGroups' per NUMA Node in the system (rounding up).
	numCPUGroupsPerNUMANode := (numCPUGroups-1)/numNUMANodes + 1

	// Calculate the number of 'cpuGroups' that need to be allocated (rounding up).
	numCPUGroupsNeeded := (a.numCPUsNeeded-1)/cpuGroupSize + 1

	// Calculate the minimum number of NUMA nodes required to satisfy the
	// allocation (rounding up).
	minNUMAs := (numCPUGroupsNeeded-1)/numCPUGroupsPerNUMANode + 1

	// Calculate the maximum number of NUMA nodes required to satisfy the allocation.
	maxNUMAs := min(numCPUGroupsNeeded, numNUMANodesAvailable)

	return minNUMAs, maxNUMAs
}
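
// For example (illustrative topology): with 4 NUMA nodes, 64 CPUs in total,
// cpuGroupSize=2, and 24 CPUs still needed, there are 32 CPU groups in the
// system and 8 groups per NUMA node, and 12 groups are needed; minNUMAs is
// therefore (12-1)/8+1 = 2, and if all 4 NUMA nodes still have CPUs
// available, maxNUMAs is min(12, 4) = 4.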

func (a *cpuAccumulator) needs(n int) bool {
	return a.numCPUsNeeded >= n
}

func (a *cpuAccumulator) isSatisfied() bool {
	return a.numCPUsNeeded < 1
}

func (a *cpuAccumulator) isFailed() bool {
	return a.numCPUsNeeded > a.details.CPUs().Size()
}

// iterateCombinations walks through all subsets of size k of the slice n
// (i.e. all "n choose k" combinations) and calls function 'f()' on each
// subset. For example, if n={0,1,2} and k=2, then f() will be called on the
// subsets {0,1}, {0,2}, and {1,2}. If f() ever returns 'Break', we break
// early and exit the loop.
func (a *cpuAccumulator) iterateCombinations(n []int, k int, f func([]int) LoopControl) {
	if k < 1 {
		return
	}

	var helper func(n []int, k int, start int, accum []int, f func([]int) LoopControl) LoopControl
	helper = func(n []int, k int, start int, accum []int, f func([]int) LoopControl) LoopControl {
		if k == 0 {
			return f(accum)
		}
		for i := start; i <= len(n)-k; i++ {
			control := helper(n, k-1, i+1, append(accum, n[i]), f)
			if control == Break {
				return Break
			}
		}
		return Continue
	}

	helper(n, k, 0, []int{}, f)
}
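
// For illustration, a caller can collect the visited subsets like this (a
// minimal sketch, as it might appear in a test; 'collected' is hypothetical):
//
//	var collected [][]int
//	acc.iterateCombinations([]int{0, 1, 2}, 2, func(combo []int) LoopControl {
//		collected = append(collected, append([]int{}, combo...))
//		return Continue
//	})
//	// collected is now [][]int{{0, 1}, {0, 2}, {1, 2}}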

func takeByTopologyNUMAPacked(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
	acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
	if acc.isSatisfied() {
		return acc.result, nil
	}
	if acc.isFailed() {
		return cpuset.New(), fmt.Errorf("not enough cpus available to satisfy request: requested=%d, available=%d", numCPUs, availableCPUs.Size())
	}

	// Algorithm: topology-aware best-fit
	// 1. Acquire whole NUMA nodes and sockets, if available and the container
	//    requires at least a NUMA node or socket's worth of CPUs. If NUMA
	//    nodes map to one or more sockets, pull from NUMA nodes first.
	//    Otherwise pull from sockets first.
	acc.numaOrSocketsFirst.takeFullFirstLevel()
	if acc.isSatisfied() {
		return acc.result, nil
	}
	acc.numaOrSocketsFirst.takeFullSecondLevel()
	if acc.isSatisfied() {
		return acc.result, nil
	}

	// 2. Acquire whole cores, if available and the container requires at least
	//    a core's worth of CPUs.
	acc.takeFullCores()
	if acc.isSatisfied() {
		return acc.result, nil
	}

	// 3. Acquire single threads, preferring to fill partially-allocated cores
	//    on the same sockets as the whole cores we have already taken in this
	//    allocation.
	acc.takeRemainingCPUs()
	if acc.isSatisfied() {
		return acc.result, nil
	}

	return cpuset.New(), fmt.Errorf("failed to allocate cpus")
}
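
// A minimal usage sketch (assuming 'topo' describes the machine topology and
// 'available' holds the CPUs not yet exclusively assigned; both names are
// hypothetical here):
//
//	// Pack 6 exclusive CPUs onto as few NUMA nodes/sockets/cores as possible.
//	cpus, err := takeByTopologyNUMAPacked(topo, available, 6)
//	if err != nil {
//		// Not enough free CPUs to satisfy the request.
//	}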

// takeByTopologyNUMADistributed returns a CPUSet of size 'numCPUs'.
//
// It generates this CPUSet by allocating CPUs from 'availableCPUs' according
// to the algorithm outlined in KEP-2902:
//
// https://github.com/kubernetes/enhancements/tree/e7f51ffbe2ee398ffd1fba4a6d854f276bfad9fb/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option
//
// This algorithm evenly distributes CPUs across NUMA nodes in cases where more
// than one NUMA node is required to satisfy the allocation. This is in
// contrast to the takeByTopologyNUMAPacked algorithm, which attempts to 'pack'
// CPUs onto NUMA nodes and fill them up before moving on to the next one.
//
// At a high level this algorithm can be summarized as:
//
// For each single NUMA node:
//   - If all requested CPUs can be allocated from this NUMA node;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//     available CPUs in that NUMA node and return
//
// Otherwise, for each pair of NUMA nodes:
//   - If the set of requested CPUs (modulo 2) can be evenly split across
//     the 2 NUMA nodes; AND
//   - Any remaining CPUs (after the modulo operation) can be striped across
//     some subset of the NUMA nodes;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//     available CPUs in both NUMA nodes and return
//
// Otherwise, for each 3-tuple of NUMA nodes:
//   - If the set of requested CPUs (modulo 3) can be evenly distributed
//     across the 3 NUMA nodes; AND
//   - Any remaining CPUs (after the modulo operation) can be striped across
//     some subset of the NUMA nodes;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//     available CPUs in all three NUMA nodes and return
//
// ...
//
// Otherwise, for the set of all NUMA nodes:
//   - If the set of requested CPUs (modulo NUM_NUMA_NODES) can be evenly
//     distributed across all NUMA nodes; AND
//   - Any remaining CPUs (after the modulo operation) can be striped across
//     some subset of the NUMA nodes;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//     available CPUs in all NUMA nodes and return
//
// If none of the above conditions can be met, then resort back to a
// best-effort fit of packing CPUs into NUMA nodes by calling
// takeByTopologyNUMAPacked() over all available CPUs.
//
// NOTE: A "balance score" will be calculated to help find the best subset of
// NUMA nodes to allocate any 'remainder' CPUs from (in cases where the total
// number of CPUs to allocate cannot be evenly distributed across the chosen
// set of NUMA nodes). This "balance score" is calculated as the standard
// deviation of how many CPUs will be available on each NUMA node after all
// evenly distributed and remainder CPUs are allocated. The subset with the
// lowest "balance score" will receive the CPUs in order to keep the overall
// allocation of CPUs as "balanced" as possible.
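//
// For example (illustrative numbers): if, after the even distribution, two
// candidate remainder subsets would leave the per-NUMA-node free CPU counts
// at [2, 2, 4, 4] versus [0, 4, 4, 4], the first subset wins, since its
// standard deviation (1.0) is lower than the second's (~1.732).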
//
// NOTE: This algorithm has been generalized to take an additional
// 'cpuGroupSize' parameter to ensure that CPUs are always allocated in groups
// of size 'cpuGroupSize' according to the algorithm described above. This is
// important, for example, to ensure that all CPUs (i.e. all hyperthreads) from
// a single core are allocated together.
func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int, cpuGroupSize int) (cpuset.CPUSet, error) {
	// If the number of CPUs requested cannot be handed out in chunks of
	// 'cpuGroupSize', then we just fall back to the packing algorithm, since
	// we can't distribute CPUs in this chunk size.
	if (numCPUs % cpuGroupSize) != 0 {
		return takeByTopologyNUMAPacked(topo, availableCPUs, numCPUs)
	}

	// Otherwise build an accumulator to start allocating CPUs from.
	acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
	if acc.isSatisfied() {
		return acc.result, nil
	}
	if acc.isFailed() {
		return cpuset.New(), fmt.Errorf("not enough cpus available to satisfy request: requested=%d, available=%d", numCPUs, availableCPUs.Size())
	}

	// Get the list of NUMA nodes represented by the set of CPUs in 'availableCPUs'.
	numas := acc.sortAvailableNUMANodes()

	// Calculate the minimum and maximum possible number of NUMA nodes that
	// could satisfy this request. This is used to optimize how many iterations
	// of the loop we need to go through below.
	minNUMAs, maxNUMAs := acc.rangeNUMANodesNeededToSatisfy(cpuGroupSize)

	// Try combinations of 1,2,3,... NUMA nodes until we find a combination
	// where we can evenly distribute CPUs across them. To optimize things, we
	// don't always start at 1 and end at len(numas). Instead, we use the
	// values of 'minNUMAs' and 'maxNUMAs' calculated above.
	for k := minNUMAs; k <= maxNUMAs; k++ {
		// Iterate through the various n-choose-k NUMA node combinations,
		// looking for the combination of NUMA nodes that can best have CPUs
		// distributed across them.
		var bestBalance float64 = math.MaxFloat64
		var bestRemainder []int = nil
		var bestCombo []int = nil
		acc.iterateCombinations(numas, k, func(combo []int) LoopControl {
			// If we've already found a combo with a balance of 0 in a
			// different iteration, then don't bother checking any others.
			if bestBalance == 0 {
				return Break
			}

			// Check that this combination of NUMA nodes has enough CPUs to
			// satisfy the allocation overall.
			cpus := acc.details.CPUsInNUMANodes(combo...)
			if cpus.Size() < numCPUs {
				return Continue
			}

			// Check that CPUs can be handed out in groups of size
			// 'cpuGroupSize' across the NUMA nodes in this combo.
			numCPUGroups := 0
			for _, numa := range combo {
				numCPUGroups += (acc.details.CPUsInNUMANodes(numa).Size() / cpuGroupSize)
			}
			if (numCPUGroups * cpuGroupSize) < numCPUs {
				return Continue
			}

			// Check that each NUMA node in this combination can allocate an
			// even distribution of CPUs in groups of size 'cpuGroupSize',
			// modulo some remainder.
			distribution := (numCPUs / len(combo) / cpuGroupSize) * cpuGroupSize
			for _, numa := range combo {
				cpus := acc.details.CPUsInNUMANodes(numa)
				if cpus.Size() < distribution {
					return Continue
				}
			}

			// Calculate how many CPUs will be available on each NUMA node in
			// the system after allocating an even distribution of CPU groups
			// of size 'cpuGroupSize' from each NUMA node in 'combo'. This will
			// be used in the "balance score" calculation to help decide if
			// this combo should ultimately be chosen.
			availableAfterAllocation := make(mapIntInt, len(numas))
			for _, numa := range numas {
				availableAfterAllocation[numa] = acc.details.CPUsInNUMANodes(numa).Size()
			}
			for _, numa := range combo {
				availableAfterAllocation[numa] -= distribution
			}

			// Check if there are any remaining CPUs to distribute across the
			// NUMA nodes once CPUs have been evenly distributed in groups of
			// size 'cpuGroupSize'.
			remainder := numCPUs - (distribution * len(combo))

			// Get a list of NUMA nodes to consider pulling the remainder CPUs
			// from. This list excludes NUMA nodes that don't have at least
			// 'cpuGroupSize' CPUs available after being allocated
			// 'distribution' number of CPUs.
			var remainderCombo []int
			for _, numa := range combo {
				if availableAfterAllocation[numa] >= cpuGroupSize {
					remainderCombo = append(remainderCombo, numa)
				}
			}

			// Declare a set of local variables to help track the "balance
			// scores" calculated when using different subsets of
			// 'remainderCombo' to allocate remainder CPUs from.
			var bestLocalBalance float64 = math.MaxFloat64
			var bestLocalRemainder []int = nil

			// If there aren't any remainder CPUs to allocate, then calculate
			// the "balance score" of this combo as the standard deviation of
			// the values contained in 'availableAfterAllocation'.
			if remainder == 0 {
				bestLocalBalance = standardDeviation(availableAfterAllocation.Values())
				bestLocalRemainder = nil
			}

			// Otherwise, find the best "balance score" when allocating the
			// remainder CPUs across different subsets of NUMA nodes in 'remainderCombo'.
			// These remainder CPUs are handed out in groups of size 'cpuGroupSize'.
			// We start from k=len(remainderCombo) and walk down to k=1 so that
			// we continue to distribute CPUs as much as possible across
			// multiple NUMA nodes.
			for k := len(remainderCombo); remainder > 0 && k >= 1; k-- {
				acc.iterateCombinations(remainderCombo, k, func(subset []int) LoopControl {
					// Make a local copy of 'remainder'.
					remainder := remainder

					// Make a local copy of 'availableAfterAllocation'.
					availableAfterAllocation := availableAfterAllocation.Clone()

					// If this subset is not capable of allocating all
					// remainder CPUs, continue to the next one.
					if sum(availableAfterAllocation.Values(subset...)) < remainder {
						return Continue
					}

					// For all NUMA nodes in 'subset', walk through them,
					// removing 'cpuGroupSize' number of CPUs from each
					// until all remainder CPUs have been accounted for.
					for remainder > 0 {
						for _, numa := range subset {
							if remainder == 0 {
								break
							}
							if availableAfterAllocation[numa] < cpuGroupSize {
								continue
							}
							availableAfterAllocation[numa] -= cpuGroupSize
							remainder -= cpuGroupSize
						}
					}

					// Calculate the "balance score" as the standard deviation
					// of the number of CPUs available on all NUMA nodes in the
					// system after the remainder CPUs have been allocated
					// across 'subset' in groups of size 'cpuGroupSize'.
					balance := standardDeviation(availableAfterAllocation.Values())
					if balance < bestLocalBalance {
						bestLocalBalance = balance
						bestLocalRemainder = subset
					}

					return Continue
				})
			}

			// If the best "balance score" for this combo is less than the
			// lowest "balance score" of all previous combos, then update this
			// combo (and remainder set) to be the best one found so far.
			if bestLocalBalance < bestBalance {
				bestBalance = bestLocalBalance
				bestRemainder = bestLocalRemainder
				bestCombo = combo
			}

			return Continue
		})

		// If we made it through all of the iterations above without finding a
		// combination of NUMA nodes that can properly balance CPU allocations,
		// then move on to the next larger set of NUMA node combinations.
		if bestCombo == nil {
			continue
		}

		// Otherwise, start allocating CPUs from the NUMA node combination
		// chosen. First allocate an even distribution of CPUs in groups of
		// size 'cpuGroupSize' from 'bestCombo'.
		distribution := (numCPUs / len(bestCombo) / cpuGroupSize) * cpuGroupSize
		for _, numa := range bestCombo {
			cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), distribution)
			acc.take(cpus)
		}

		// Then allocate any remaining CPUs in groups of size 'cpuGroupSize'
		// from each NUMA node in the remainder set.
		remainder := numCPUs - (distribution * len(bestCombo))
		for remainder > 0 {
			for _, numa := range bestRemainder {
				if remainder == 0 {
					break
				}
				if acc.details.CPUsInNUMANodes(numa).Size() < cpuGroupSize {
					continue
				}
				cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), cpuGroupSize)
				acc.take(cpus)
				remainder -= cpuGroupSize
			}
		}

		// If we haven't allocated all of our CPUs at this point, then something
		// went wrong in our accounting and we should error out.
		if acc.numCPUsNeeded > 0 {
			return cpuset.New(), fmt.Errorf("accounting error, not enough CPUs allocated, remaining: %v", acc.numCPUsNeeded)
		}

		// Likewise, if we have allocated too many CPUs at this point, then something
		// went wrong in our accounting and we should error out.
		if acc.numCPUsNeeded < 0 {
			return cpuset.New(), fmt.Errorf("accounting error, too many CPUs allocated, remaining: %v", acc.numCPUsNeeded)
		}

		// Otherwise, return the result
		return acc.result, nil
	}

	// If we never found a combination of NUMA nodes that we could properly
	// distribute CPUs across, fall back to the packing algorithm.
	return takeByTopologyNUMAPacked(topo, availableCPUs, numCPUs)
}
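
// A minimal usage sketch (again with hypothetical 'topo' and 'available'):
//
//	// Spread 8 exclusive CPUs evenly across NUMA nodes, handing them out in
//	// groups of 2 so that both hyperthreads of a physical core stay together.
//	cpus, err := takeByTopologyNUMADistributed(topo, available, 8, 2)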