k8s.io/kubernetes@v1.29.3/pkg/kubelet/cm/cpumanager/cpu_assignment.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cpumanager

import (
    "fmt"
    "math"
    "sort"

    "k8s.io/klog/v2"

    "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
    "k8s.io/utils/cpuset"
)

// LoopControl controls the behavior of the cpu accumulator loop logic
type LoopControl int

// Possible loop control outcomes
const (
    Continue LoopControl = iota
    Break
)

type mapIntInt map[int]int

func (m mapIntInt) Clone() mapIntInt {
    cp := make(mapIntInt, len(m))
    for k, v := range m {
        cp[k] = v
    }
    return cp
}

func (m mapIntInt) Keys() []int {
    var keys []int
    for k := range m {
        keys = append(keys, k)
    }
    return keys
}

func (m mapIntInt) Values(keys ...int) []int {
    if keys == nil {
        keys = m.Keys()
    }
    var values []int
    for _, k := range keys {
        values = append(values, m[k])
    }
    return values
}

func sum(xs []int) int {
    var s int
    for _, x := range xs {
        s += x
    }
    return s
}

func mean(xs []int) float64 {
    var sum float64
    for _, x := range xs {
        sum += float64(x)
    }
    m := sum / float64(len(xs))
    return math.Round(m*1000) / 1000
}
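
// standardDeviation returns the population standard deviation of xs, rounded
// to three decimal places. For example, standardDeviation([]int{1, 2, 3})
// yields 0.816.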
func standardDeviation(xs []int) float64 {
    m := mean(xs)
    var sum float64
    for _, x := range xs {
        sum += (float64(x) - m) * (float64(x) - m)
    }
    s := math.Sqrt(sum / float64(len(xs)))
    return math.Round(s*1000) / 1000
}

func min(x, y int) int {
    if x < y {
        return x
    }
    return y
}

type numaOrSocketsFirstFuncs interface {
    takeFullFirstLevel()
    takeFullSecondLevel()
    sortAvailableNUMANodes() []int
    sortAvailableSockets() []int
    sortAvailableCores() []int
}

type numaFirst struct{ acc *cpuAccumulator }
type socketsFirst struct{ acc *cpuAccumulator }

var _ numaOrSocketsFirstFuncs = (*numaFirst)(nil)
var _ numaOrSocketsFirstFuncs = (*socketsFirst)(nil)

// If NUMA nodes are higher in the memory hierarchy than sockets, then we take
// from the set of NUMA nodes as the first level.
func (n *numaFirst) takeFullFirstLevel() {
    n.acc.takeFullNUMANodes()
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then we take
// from the set of sockets as the second level.
func (n *numaFirst) takeFullSecondLevel() {
    n.acc.takeFullSockets()
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then just
// sort the NUMA nodes directly, and return them.
func (n *numaFirst) sortAvailableNUMANodes() []int {
    numas := n.acc.details.NUMANodes().UnsortedList()
    n.acc.sort(numas, n.acc.details.CPUsInNUMANodes)
    return numas
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then we need
// to pull the set of sockets out of each sorted NUMA node, and accumulate the
// partial order across them.
func (n *numaFirst) sortAvailableSockets() []int {
    var result []int
    for _, numa := range n.sortAvailableNUMANodes() {
        sockets := n.acc.details.SocketsInNUMANodes(numa).UnsortedList()
        n.acc.sort(sockets, n.acc.details.CPUsInSockets)
        result = append(result, sockets...)
    }
    return result
}

// If NUMA nodes are higher in the memory hierarchy than sockets, then
// cores sit directly below sockets in the memory hierarchy.
func (n *numaFirst) sortAvailableCores() []int {
    var result []int
    for _, socket := range n.acc.sortAvailableSockets() {
        cores := n.acc.details.CoresInSockets(socket).UnsortedList()
        n.acc.sort(cores, n.acc.details.CPUsInCores)
        result = append(result, cores...)
    }
    return result
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then we take
// from the set of sockets as the first level.
func (s *socketsFirst) takeFullFirstLevel() {
    s.acc.takeFullSockets()
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then we take
// from the set of NUMA nodes as the second level.
func (s *socketsFirst) takeFullSecondLevel() {
    s.acc.takeFullNUMANodes()
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then we need
// to pull the set of NUMA nodes out of each sorted socket, and accumulate the
// partial order across them.
func (s *socketsFirst) sortAvailableNUMANodes() []int {
    var result []int
    for _, socket := range s.sortAvailableSockets() {
        numas := s.acc.details.NUMANodesInSockets(socket).UnsortedList()
        s.acc.sort(numas, s.acc.details.CPUsInNUMANodes)
        result = append(result, numas...)
    }
    return result
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then just
// sort the sockets directly, and return them.
func (s *socketsFirst) sortAvailableSockets() []int {
    sockets := s.acc.details.Sockets().UnsortedList()
    s.acc.sort(sockets, s.acc.details.CPUsInSockets)
    return sockets
}

// If sockets are higher in the memory hierarchy than NUMA nodes, then cores
// sit directly below NUMA nodes in the memory hierarchy.
func (s *socketsFirst) sortAvailableCores() []int {
    var result []int
    for _, numa := range s.acc.sortAvailableNUMANodes() {
        cores := s.acc.details.CoresInNUMANodes(numa).UnsortedList()
        s.acc.sort(cores, s.acc.details.CPUsInCores)
        result = append(result, cores...)
    }
    return result
}
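
// Which of the two orderings applies is decided in newCPUAccumulator below:
// when topo.NumSockets >= topo.NumNUMANodes (i.e. each NUMA node spans one or
// more whole sockets), the numaFirst strategy is used; otherwise socketsFirst.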

type cpuAccumulator struct {
    topo               *topology.CPUTopology
    details            topology.CPUDetails
    numCPUsNeeded      int
    result             cpuset.CPUSet
    numaOrSocketsFirst numaOrSocketsFirstFuncs
}

func newCPUAccumulator(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) *cpuAccumulator {
    acc := &cpuAccumulator{
        topo:          topo,
        details:       topo.CPUDetails.KeepOnly(availableCPUs),
        numCPUsNeeded: numCPUs,
        result:        cpuset.New(),
    }

    if topo.NumSockets >= topo.NumNUMANodes {
        acc.numaOrSocketsFirst = &numaFirst{acc}
    } else {
        acc.numaOrSocketsFirst = &socketsFirst{acc}
    }

    return acc
}

// Returns true if the supplied NUMA node is fully available in `a.details`.
func (a *cpuAccumulator) isNUMANodeFree(numaID int) bool {
    return a.details.CPUsInNUMANodes(numaID).Size() == a.topo.CPUDetails.CPUsInNUMANodes(numaID).Size()
}

// Returns true if the supplied socket is fully available in `a.details`.
func (a *cpuAccumulator) isSocketFree(socketID int) bool {
    return a.details.CPUsInSockets(socketID).Size() == a.topo.CPUsPerSocket()
}

// Returns true if the supplied core is fully available in `a.details`.
func (a *cpuAccumulator) isCoreFree(coreID int) bool {
    return a.details.CPUsInCores(coreID).Size() == a.topo.CPUsPerCore()
}

// Returns free NUMA node IDs as a slice sorted by sortAvailableNUMANodes().
func (a *cpuAccumulator) freeNUMANodes() []int {
    free := []int{}
    for _, numa := range a.sortAvailableNUMANodes() {
        if a.isNUMANodeFree(numa) {
            free = append(free, numa)
        }
    }
    return free
}

// Returns free socket IDs as a slice sorted by sortAvailableSockets().
func (a *cpuAccumulator) freeSockets() []int {
    free := []int{}
    for _, socket := range a.sortAvailableSockets() {
        if a.isSocketFree(socket) {
            free = append(free, socket)
        }
    }
    return free
}

// Returns free core IDs as a slice sorted by sortAvailableCores().
func (a *cpuAccumulator) freeCores() []int {
    free := []int{}
    for _, core := range a.sortAvailableCores() {
        if a.isCoreFree(core) {
            free = append(free, core)
        }
    }
    return free
}

// Returns free CPU IDs as a slice sorted by sortAvailableCPUs().
func (a *cpuAccumulator) freeCPUs() []int {
    return a.sortAvailableCPUs()
}

// Sorts the provided list of NUMA nodes/sockets/cores/cpus referenced in 'ids'
// by the number of available CPUs contained within them (smallest to largest).
// The 'getCPUs()' parameter defines the function that should be called to
// retrieve the list of available CPUs for the type being referenced. If two
// NUMA nodes/sockets/cores/cpus have the same number of available CPUs, they
// are sorted in ascending order by their id.
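// For example, given sockets with available CPU counts {0: 4, 1: 2, 2: 4},
// sorting with getCPUs = CPUsInSockets yields [1, 0, 2].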
func (a *cpuAccumulator) sort(ids []int, getCPUs func(ids ...int) cpuset.CPUSet) {
    sort.Slice(ids,
        func(i, j int) bool {
            iCPUs := getCPUs(ids[i])
            jCPUs := getCPUs(ids[j])
            if iCPUs.Size() < jCPUs.Size() {
                return true
            }
            if iCPUs.Size() > jCPUs.Size() {
                return false
            }
            return ids[i] < ids[j]
        })
}

// Sort all NUMA nodes with free CPUs.
func (a *cpuAccumulator) sortAvailableNUMANodes() []int {
    return a.numaOrSocketsFirst.sortAvailableNUMANodes()
}

// Sort all sockets with free CPUs.
func (a *cpuAccumulator) sortAvailableSockets() []int {
    return a.numaOrSocketsFirst.sortAvailableSockets()
}

// Sort all cores with free CPUs.
func (a *cpuAccumulator) sortAvailableCores() []int {
    return a.numaOrSocketsFirst.sortAvailableCores()
}

// Sort all available CPUs:
// - First by core using sortAvailableCores().
// - Then within each core, in ascending order by CPU ID.
func (a *cpuAccumulator) sortAvailableCPUs() []int {
    var result []int
    for _, core := range a.sortAvailableCores() {
        cpus := a.details.CPUsInCores(core).UnsortedList()
        sort.Ints(cpus)
        result = append(result, cpus...)
    }
    return result
}

func (a *cpuAccumulator) take(cpus cpuset.CPUSet) {
    a.result = a.result.Union(cpus)
    a.details = a.details.KeepOnly(a.details.CPUs().Difference(a.result))
    a.numCPUsNeeded -= cpus.Size()
}

func (a *cpuAccumulator) takeFullNUMANodes() {
    for _, numa := range a.freeNUMANodes() {
        cpusInNUMANode := a.topo.CPUDetails.CPUsInNUMANodes(numa)
        if !a.needs(cpusInNUMANode.Size()) {
            continue
        }
        klog.V(4).InfoS("takeFullNUMANodes: claiming NUMA node", "numa", numa)
        a.take(cpusInNUMANode)
    }
}

func (a *cpuAccumulator) takeFullSockets() {
    for _, socket := range a.freeSockets() {
        cpusInSocket := a.topo.CPUDetails.CPUsInSockets(socket)
        if !a.needs(cpusInSocket.Size()) {
            continue
        }
        klog.V(4).InfoS("takeFullSockets: claiming socket", "socket", socket)
        a.take(cpusInSocket)
    }
}

func (a *cpuAccumulator) takeFullCores() {
    for _, core := range a.freeCores() {
        cpusInCore := a.topo.CPUDetails.CPUsInCores(core)
        if !a.needs(cpusInCore.Size()) {
            continue
        }
        klog.V(4).InfoS("takeFullCores: claiming core", "core", core)
        a.take(cpusInCore)
    }
}

func (a *cpuAccumulator) takeRemainingCPUs() {
    for _, cpu := range a.sortAvailableCPUs() {
        klog.V(4).InfoS("takeRemainingCPUs: claiming CPU", "cpu", cpu)
        a.take(cpuset.New(cpu))
        if a.isSatisfied() {
            return
        }
    }
}

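// rangeNUMANodesNeededToSatisfy returns the minimum and maximum number of
// NUMA nodes that could satisfy the current allocation when CPUs are handed
// out in groups of size 'cpuGroupSize'. For example, on a system with 4 NUMA
// nodes and 64 CPUs, with cpuGroupSize=2 and 24 CPUs still needed, there are
// 32 CPU groups in total (8 per NUMA node) and 12 groups are needed, so
// minNUMAs is 2 and maxNUMAs is min(12, numNUMANodesAvailable).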
func (a *cpuAccumulator) rangeNUMANodesNeededToSatisfy(cpuGroupSize int) (int, int) {
    // Get the total number of NUMA nodes in the system.
    numNUMANodes := a.topo.CPUDetails.NUMANodes().Size()

    // Get the total number of NUMA nodes that have CPUs available on them.
    numNUMANodesAvailable := a.details.NUMANodes().Size()

    // Get the total number of CPUs in the system.
    numCPUs := a.topo.CPUDetails.CPUs().Size()

    // Get the total number of 'cpuGroups' in the system.
    numCPUGroups := (numCPUs-1)/cpuGroupSize + 1

    // Calculate the number of 'cpuGroups' per NUMA node in the system (rounding up).
    numCPUGroupsPerNUMANode := (numCPUGroups-1)/numNUMANodes + 1

    // Calculate the number of 'cpuGroups' that need to be allocated (rounding up).
    numCPUGroupsNeeded := (a.numCPUsNeeded-1)/cpuGroupSize + 1

    // Calculate the minimum number of NUMA nodes required to satisfy the
    // allocation (rounding up).
    minNUMAs := (numCPUGroupsNeeded-1)/numCPUGroupsPerNUMANode + 1

    // Calculate the maximum number of NUMA nodes required to satisfy the allocation.
    maxNUMAs := min(numCPUGroupsNeeded, numNUMANodesAvailable)

    return minNUMAs, maxNUMAs
}

func (a *cpuAccumulator) needs(n int) bool {
    return a.numCPUsNeeded >= n
}

func (a *cpuAccumulator) isSatisfied() bool {
    return a.numCPUsNeeded < 1
}

func (a *cpuAccumulator) isFailed() bool {
    return a.numCPUsNeeded > a.details.CPUs().Size()
}

// iterateCombinations walks through all n-choose-k subsets of size k in n and
// calls function 'f()' on each subset. For example, if n={0,1,2} and k=2,
// then f() will be called on the subsets {0,1}, {0,2}, and {1,2}. If f() ever
// returns 'Break', we break early and exit the loop.
func (a *cpuAccumulator) iterateCombinations(n []int, k int, f func([]int) LoopControl) {
    if k < 1 {
        return
    }

    var helper func(n []int, k int, start int, accum []int, f func([]int) LoopControl) LoopControl
    helper = func(n []int, k int, start int, accum []int, f func([]int) LoopControl) LoopControl {
        if k == 0 {
            return f(accum)
        }
        for i := start; i <= len(n)-k; i++ {
            control := helper(n, k-1, i+1, append(accum, n[i]), f)
            if control == Break {
                return Break
            }
        }
        return Continue
    }

    helper(n, k, 0, []int{}, f)
}

func takeByTopologyNUMAPacked(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
    acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
    if acc.isSatisfied() {
        return acc.result, nil
    }
    if acc.isFailed() {
        return cpuset.New(), fmt.Errorf("not enough cpus available to satisfy request: requested=%d, available=%d", numCPUs, availableCPUs.Size())
    }

    // Algorithm: topology-aware best-fit
    // 1. Acquire whole NUMA nodes and sockets, if available and the container
    //    requires at least a NUMA node's or socket's worth of CPUs. If NUMA
    //    nodes map to one or more sockets, pull from NUMA nodes first.
    //    Otherwise pull from sockets first.
    acc.numaOrSocketsFirst.takeFullFirstLevel()
    if acc.isSatisfied() {
        return acc.result, nil
    }
    acc.numaOrSocketsFirst.takeFullSecondLevel()
    if acc.isSatisfied() {
        return acc.result, nil
    }

    // 2. Acquire whole cores, if available and the container requires at least
    //    a core's worth of CPUs.
    acc.takeFullCores()
    if acc.isSatisfied() {
        return acc.result, nil
    }

    // 3. Acquire single threads, preferring to fill partially-allocated cores
    //    on the same sockets as the whole cores we have already taken in this
    //    allocation.
    acc.takeRemainingCPUs()
    if acc.isSatisfied() {
        return acc.result, nil
    }

    return cpuset.New(), fmt.Errorf("failed to allocate cpus")
}
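
// For example, on a single-NUMA-node machine with 2 sockets of 2 cores each
// (2 threads per core, 8 CPUs total), takeByTopologyNUMAPacked satisfies a
// request for 6 CPUs by claiming one full socket (4 CPUs) and then one full
// core (2 CPUs) from the other socket.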

// takeByTopologyNUMADistributed returns a CPUSet of size 'numCPUs'.
//
// It generates this CPUSet by allocating CPUs from 'availableCPUs' according
// to the algorithm outlined in KEP-2902:
//
// https://github.com/kubernetes/enhancements/tree/e7f51ffbe2ee398ffd1fba4a6d854f276bfad9fb/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option
//
// This algorithm evenly distributes CPUs across NUMA nodes in cases where more
// than one NUMA node is required to satisfy the allocation. This is in
// contrast to the takeByTopologyNUMAPacked algorithm, which attempts to 'pack'
// CPUs onto NUMA nodes and fill them up before moving on to the next one.
//
// At a high level this algorithm can be summarized as:
//
// For each single NUMA node:
//   - If all requested CPUs can be allocated from this NUMA node;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//         available CPUs in that NUMA node and return
//
// Otherwise, for each pair of NUMA nodes:
//   - If the set of requested CPUs (modulo 2) can be evenly split across
//     the 2 NUMA nodes; AND
//   - Any remaining CPUs (after the modulo operation) can be striped across
//     some subset of the NUMA nodes;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//         available CPUs in both NUMA nodes and return
//
// Otherwise, for each 3-tuple of NUMA nodes:
//   - If the set of requested CPUs (modulo 3) can be evenly distributed
//     across the 3 NUMA nodes; AND
//   - Any remaining CPUs (after the modulo operation) can be striped across
//     some subset of the NUMA nodes;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//         available CPUs in all three NUMA nodes and return
//
// ...
//
// Otherwise, for the set of all NUMA nodes:
//   - If the set of requested CPUs (modulo NUM_NUMA_NODES) can be evenly
//     distributed across all NUMA nodes; AND
//   - Any remaining CPUs (after the modulo operation) can be striped across
//     some subset of the NUMA nodes;
//     --> Do the allocation by running takeByTopologyNUMAPacked() over the
//         available CPUs in all NUMA nodes and return
//
// If none of the above conditions can be met, then resort back to a
// best-effort fit of packing CPUs into NUMA nodes by calling
// takeByTopologyNUMAPacked() over all available CPUs.
//
// NOTE: A "balance score" will be calculated to help find the best subset of
// NUMA nodes to allocate any 'remainder' CPUs from (in cases where the total
// number of CPUs to allocate cannot be evenly distributed across the chosen
// set of NUMA nodes). This "balance score" is calculated as the standard
// deviation of how many CPUs will be available on each NUMA node after all
// evenly distributed and remainder CPUs are allocated. The subset with the
// lowest "balance score" will receive the CPUs in order to keep the overall
// allocation of CPUs as "balanced" as possible.
//
// NOTE: This algorithm has been generalized to take an additional
// 'cpuGroupSize' parameter to ensure that CPUs are always allocated in groups
// of size 'cpuGroupSize' according to the algorithm described above. This is
// important, for example, to ensure that all CPUs (i.e. all hyperthreads) from
// a single core are allocated together.
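//
// For example, requesting 8 CPUs with cpuGroupSize=2 on a machine with 4 NUMA
// nodes of 4 CPUs each: no single NUMA node can hold all 8 CPUs, but a pair
// of NUMA nodes can split them evenly as 4+4 with no remainder, so the
// best-balanced pair is chosen.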
func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int, cpuGroupSize int) (cpuset.CPUSet, error) {
    // If the number of CPUs requested cannot be handed out in chunks of
    // 'cpuGroupSize', then we simply fall back to the packing algorithm,
    // since we cannot distribute CPUs in chunks of this size.
    if (numCPUs % cpuGroupSize) != 0 {
        return takeByTopologyNUMAPacked(topo, availableCPUs, numCPUs)
    }

    // Otherwise build an accumulator to start allocating CPUs from.
    acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
    if acc.isSatisfied() {
        return acc.result, nil
    }
    if acc.isFailed() {
        return cpuset.New(), fmt.Errorf("not enough cpus available to satisfy request: requested=%d, available=%d", numCPUs, availableCPUs.Size())
    }

    // Get the list of NUMA nodes represented by the set of CPUs in 'availableCPUs'.
    numas := acc.sortAvailableNUMANodes()

    // Calculate the minimum and maximum possible number of NUMA nodes that
    // could satisfy this request. This is used to optimize how many iterations
    // of the loop we need to go through below.
    minNUMAs, maxNUMAs := acc.rangeNUMANodesNeededToSatisfy(cpuGroupSize)

    // Try combinations of 1,2,3,... NUMA nodes until we find a combination
    // where we can evenly distribute CPUs across them. To optimize things, we
    // don't always start at 1 and end at len(numas). Instead, we use the
    // values of 'minNUMAs' and 'maxNUMAs' calculated above.
    for k := minNUMAs; k <= maxNUMAs; k++ {
        // Iterate through the various n-choose-k NUMA node combinations,
        // looking for the combination of NUMA nodes across which CPUs can be
        // distributed most evenly.
        var bestBalance float64 = math.MaxFloat64
        var bestRemainder []int = nil
        var bestCombo []int = nil
        acc.iterateCombinations(numas, k, func(combo []int) LoopControl {
            // If we've already found a combo with a balance of 0 in a
            // different iteration, then don't bother checking any others.
            if bestBalance == 0 {
                return Break
            }

            // Check that this combination of NUMA nodes has enough CPUs to
            // satisfy the allocation overall.
            cpus := acc.details.CPUsInNUMANodes(combo...)
            if cpus.Size() < numCPUs {
                return Continue
            }

            // Check that CPUs can be handed out in groups of size
            // 'cpuGroupSize' across the NUMA nodes in this combo.
            numCPUGroups := 0
            for _, numa := range combo {
                numCPUGroups += (acc.details.CPUsInNUMANodes(numa).Size() / cpuGroupSize)
            }
            if (numCPUGroups * cpuGroupSize) < numCPUs {
                return Continue
            }

            // Check that each NUMA node in this combination can allocate an
            // even distribution of CPUs in groups of size 'cpuGroupSize',
            // modulo some remainder.
            distribution := (numCPUs / len(combo) / cpuGroupSize) * cpuGroupSize
            for _, numa := range combo {
                cpus := acc.details.CPUsInNUMANodes(numa)
                if cpus.Size() < distribution {
                    return Continue
                }
            }

            // Calculate how many CPUs will be available on each NUMA node in
            // the system after allocating an even distribution of CPU groups
            // of size 'cpuGroupSize' from each NUMA node in 'combo'. This will
            // be used in the "balance score" calculation to help decide if
            // this combo should ultimately be chosen.
            availableAfterAllocation := make(mapIntInt, len(numas))
            for _, numa := range numas {
                availableAfterAllocation[numa] = acc.details.CPUsInNUMANodes(numa).Size()
            }
            for _, numa := range combo {
                availableAfterAllocation[numa] -= distribution
            }

            // Check if there are any remaining CPUs to distribute across the
            // NUMA nodes once CPUs have been evenly distributed in groups of
            // size 'cpuGroupSize'.
            remainder := numCPUs - (distribution * len(combo))
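
            // For example, with numCPUs=12, cpuGroupSize=2, and a 5-node
            // combo, distribution is (12/5/2)*2 = 2 CPUs per node, leaving a
            // remainder of 12 - 2*5 = 2 CPUs to stripe across a subset of the
            // combo.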

            // Get a list of NUMA nodes to consider pulling the remainder CPUs
            // from. This list excludes NUMA nodes that don't have at least
            // 'cpuGroupSize' CPUs available after being allocated
            // 'distribution' number of CPUs.
            var remainderCombo []int
            for _, numa := range combo {
                if availableAfterAllocation[numa] >= cpuGroupSize {
                    remainderCombo = append(remainderCombo, numa)
                }
            }

            // Declare a set of local variables to help track the "balance
            // scores" calculated when using different subsets of
            // 'remainderCombo' to allocate remainder CPUs from.
            var bestLocalBalance float64 = math.MaxFloat64
            var bestLocalRemainder []int = nil

            // If there aren't any remainder CPUs to allocate, then calculate
            // the "balance score" of this combo as the standard deviation of
            // the values contained in 'availableAfterAllocation'.
            if remainder == 0 {
                bestLocalBalance = standardDeviation(availableAfterAllocation.Values())
                bestLocalRemainder = nil
            }

            // Otherwise, find the best "balance score" when allocating the
            // remainder CPUs across different subsets of NUMA nodes in 'remainderCombo'.
            // These remainder CPUs are handed out in groups of size 'cpuGroupSize'.
            // We start from k=len(remainderCombo) and walk down to k=1 so that
            // we continue to distribute CPUs as much as possible across
            // multiple NUMA nodes.
            for k := len(remainderCombo); remainder > 0 && k >= 1; k-- {
                acc.iterateCombinations(remainderCombo, k, func(subset []int) LoopControl {
                    // Make a local copy of 'remainder'.
                    remainder := remainder

                    // Make a local copy of 'availableAfterAllocation'.
                    availableAfterAllocation := availableAfterAllocation.Clone()

                    // If this subset is not capable of allocating all
                    // remainder CPUs, continue to the next one.
                    if sum(availableAfterAllocation.Values(subset...)) < remainder {
                        return Continue
                    }

                    // For all NUMA nodes in 'subset', walk through them,
                    // removing 'cpuGroupSize' number of CPUs from each
                    // until all remainder CPUs have been accounted for.
                    for remainder > 0 {
                        for _, numa := range subset {
                            if remainder == 0 {
                                break
                            }
                            if availableAfterAllocation[numa] < cpuGroupSize {
                                continue
                            }
                            availableAfterAllocation[numa] -= cpuGroupSize
                            remainder -= cpuGroupSize
                        }
                    }

                    // Calculate the "balance score" as the standard deviation
                    // of the number of CPUs available on all NUMA nodes in the
                    // system after the remainder CPUs have been allocated
                    // across 'subset' in groups of size 'cpuGroupSize'.
                    balance := standardDeviation(availableAfterAllocation.Values())
                    if balance < bestLocalBalance {
                        bestLocalBalance = balance
                        bestLocalRemainder = subset
                    }

                    return Continue
                })
            }

            // If the best "balance score" for this combo is less than the
            // lowest "balance score" of all previous combos, then update this
            // combo (and remainder set) to be the best one found so far.
            if bestLocalBalance < bestBalance {
                bestBalance = bestLocalBalance
                bestRemainder = bestLocalRemainder
                bestCombo = combo
            }

            return Continue
        })

        // If we made it through all of the iterations above without finding a
        // combination of NUMA nodes that can properly balance CPU allocations,
        // then move on to the next larger set of NUMA node combinations.
        if bestCombo == nil {
            continue
        }

        // Otherwise, start allocating CPUs from the chosen NUMA node
        // combination.
        // First allocate an even distribution of CPUs, in groups of size
        // 'cpuGroupSize', from 'bestCombo'.
        distribution := (numCPUs / len(bestCombo) / cpuGroupSize) * cpuGroupSize
        for _, numa := range bestCombo {
            cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), distribution)
            acc.take(cpus)
        }

        // Then allocate any remaining CPUs in groups of size 'cpuGroupSize'
        // from each NUMA node in the remainder set.
        remainder := numCPUs - (distribution * len(bestCombo))
        for remainder > 0 {
            for _, numa := range bestRemainder {
                if remainder == 0 {
                    break
                }
                if acc.details.CPUsInNUMANodes(numa).Size() < cpuGroupSize {
                    continue
                }
                cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), cpuGroupSize)
                acc.take(cpus)
                remainder -= cpuGroupSize
            }
        }

        // If we haven't allocated all of our CPUs at this point, then something
        // went wrong in our accounting and we should error out.
        if acc.numCPUsNeeded > 0 {
            return cpuset.New(), fmt.Errorf("accounting error, not enough CPUs allocated, remaining: %v", acc.numCPUsNeeded)
        }

        // Likewise, if we have allocated too many CPUs at this point, then something
        // went wrong in our accounting and we should error out.
        if acc.numCPUsNeeded < 0 {
            return cpuset.New(), fmt.Errorf("accounting error, too many CPUs allocated, remaining: %v", acc.numCPUsNeeded)
        }

        // Otherwise, return the result.
        return acc.result, nil
    }

    // If we never found a combination of NUMA nodes that we could properly
    // distribute CPUs across, fall back to the packing algorithm.
    return takeByTopologyNUMAPacked(topo, availableCPUs, numCPUs)
}