github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/qrm-plugins/cpu/nativepolicy/policy_allocation_handlers.go

/*
Copyright 2022 The Katalyst Authors.
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nativepolicy

import (
	"context"
	"errors"
	"fmt"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/klog/v2"
	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"

	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/state"
	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/nativepolicy/calculator"
	nativepolicyutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/nativepolicy/util"
	cpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/util"
	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
	"github.com/kubewharf/katalyst-core/pkg/util/general"
	"github.com/kubewharf/katalyst-core/pkg/util/machine"
)

const (
	// ErrorSMTAlignment represents the type of an SMTAlignmentError.
	ErrorSMTAlignment = "SMTAlignmentError"
)

// SMTAlignmentError represents an error due to a CPU request that is not SMT-aligned.
type SMTAlignmentError struct {
	RequestedCPUs int
	CpusPerCore   int
}

func (e SMTAlignmentError) Error() string {
	return fmt.Sprintf("SMT Alignment Error: requested %d cpus, not a multiple of cpus per core = %d", e.RequestedCPUs, e.CpusPerCore)
}

func (e SMTAlignmentError) Type() string {
	return ErrorSMTAlignment
}
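
// Callers that need to react specifically to SMT-alignment failures can match the error by
// type. A minimal sketch, assuming the error is returned unwrapped (as it is by
// dedicatedCoresAllocationHandler below) or wrapped with %w:
//
//	var smtErr SMTAlignmentError
//	if errors.As(err, &smtErr) {
//		// fail pod admission with reason smtErr.Type()
//	}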
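// dedicatedCoresAllocationHandler allocates exclusive CPUs for a dedicated-cores container:
// it optionally rejects requests that are not full-physical-core (SMT) aligned, allocates
// CPUs according to the NUMA affinity carried in the hint, persists the allocation info and
// the regenerated machine state, and packs the resulting cpuset into the allocation response.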
func (p *NativePolicy) dedicatedCoresAllocationHandler(_ context.Context,
	req *pluginapi.ResourceRequest,
) (*pluginapi.ResourceAllocationResponse, error) {
	if req == nil {
		return nil, fmt.Errorf("dedicatedCoresAllocationHandler got nil req")
	}

	reqInt, reqFloat64, err := util.GetQuantityFromResourceReq(req)
	if err != nil {
		return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err)
	}

	if p.enableFullPhysicalCPUsOnly && ((reqInt % p.machineInfo.CPUsPerCore()) != 0) {
		// Since the CPU plugin has been enabled with strict SMT alignment required, a guaranteed pod can only be
		// admitted if its CPU request is a multiple of the number of virtual cpus per physical core (for example,
		// with 2 hyperthreads per core, a request of 3 cpus is rejected while 4 is admitted).
		// If the request is not such a multiple, the pod will be put in the Failed state with SMTAlignmentError as
		// the reason. Since allocation happens in terms of whole physical cores and the scheduler is responsible
		// for ensuring that the workload goes to a node that has enough CPUs, the pod will be placed on a node
		// where enough physical cores are available to be allocated.
		// Just like the static policy, takeByTopology first tries to allocate CPUs from the same socket, and only
		// when the request cannot be satisfied on a single socket does it allocate CPUs so that the workload
		// occupies whole physical cores. Allocation of individual threads never has to occur.
		return nil, SMTAlignmentError{
			RequestedCPUs: reqInt,
			CpusPerCore:   p.machineInfo.CPUsPerCore(),
		}
	}

	machineState := p.state.GetMachineState()

	// Allocate CPUs according to the NUMA affinity contained in the hint.
	result, err := p.allocateCPUs(machineState, reqInt, req.Hint, p.cpusToReuse[req.PodUid])
	if err != nil {
		general.ErrorS(err, "unable to allocate CPUs",
			"podNamespace", req.PodNamespace,
			"podName", req.PodName,
			"containerName", req.ContainerName,
			"numCPUs", reqInt)
		return nil, err
	}

	general.InfoS("allocate CPUs successfully",
		"podNamespace", req.PodNamespace,
		"podName", req.PodName,
		"containerName", req.ContainerName,
		"numCPUs", reqInt,
		"result", result.String())

	topologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, result)
	if err != nil {
		general.ErrorS(err, "unable to calculate topologyAwareAssignments",
			"podNamespace", req.PodNamespace,
			"podName", req.PodName,
			"containerName", req.ContainerName,
			"numCPUs", reqInt,
			"cpuset", result.String())
		return nil, err
	}

	allocationInfo := &state.AllocationInfo{
		PodUid:                           req.PodUid,
		PodNamespace:                     req.PodNamespace,
		PodName:                          req.PodName,
		ContainerName:                    req.ContainerName,
		ContainerType:                    req.ContainerType.String(),
		ContainerIndex:                   req.ContainerIndex,
		PodType:                          req.PodType,
		OwnerPoolName:                    state.PoolNameDedicated,
		AllocationResult:                 result.Clone(),
		OriginalAllocationResult:         result.Clone(),
		TopologyAwareAssignments:         topologyAwareAssignments,
		OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(topologyAwareAssignments),
		InitTimestamp:                    time.Now().Format(util.QRMTimeFormat),
		Labels:                           general.DeepCopyMap(req.Labels),
		Annotations:                      general.DeepCopyMap(req.Annotations),
		RequestQuantity:                  reqFloat64,
	}

	// update pod entries directly.
	// if one of the subsequent steps fails, the current allocationInfo will be deleted from podEntries
	// in the defer function of the allocation function.
	p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
	podEntries := p.state.GetPodEntries()

	updatedMachineState, err := nativepolicyutil.GenerateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
	}
	p.state.SetMachineState(updatedMachineState)

	resp, err := cpuutil.PackAllocationResponse(allocationInfo, string(v1.ResourceCPU), util.OCIPropertyNameCPUSetCPUs, false, true, req)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s PackResourceAllocationResponseByAllocationInfo failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("PackResourceAllocationResponseByAllocationInfo failed with error: %v", err)
	}
	return resp, nil
}
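
// sharedPoolAllocationHandler handles containers that belong to the shared pool: instead of
// carving out exclusive CPUs, it binds the container to the machine's default (shared) cpuset,
// records the allocation in the plugin state, and regenerates the machine state from the
// updated pod entries.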
func (p *NativePolicy) sharedPoolAllocationHandler(ctx context.Context,
	req *pluginapi.ResourceRequest,
) (*pluginapi.ResourceAllocationResponse, error) {
	if req == nil {
		return nil, fmt.Errorf("sharedPoolAllocationHandler got nil req")
	}

	reqInt, reqFloat64, err := util.GetQuantityFromResourceReq(req)
	if err != nil {
		return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err)
	}

	defaultCPUSet := p.state.GetMachineState().GetDefaultCPUSet()
	if defaultCPUSet.IsEmpty() {
		return nil, errors.New("default cpuset is empty")
	}

	general.InfoS("allocate default cpuset successfully",
		"podNamespace", req.PodNamespace,
		"podName", req.PodName,
		"containerName", req.ContainerName,
		"numCPUs", reqInt,
		"result", defaultCPUSet.String())

	topologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, defaultCPUSet)
	if err != nil {
		general.ErrorS(err, "unable to calculate topologyAwareAssignments",
			"podNamespace", req.PodNamespace,
			"podName", req.PodName,
			"containerName", req.ContainerName,
			"numCPUs", reqInt,
			"cpuset", defaultCPUSet.String())
		return nil, err
	}

	allocationInfo := &state.AllocationInfo{
		PodUid:                           req.PodUid,
		PodNamespace:                     req.PodNamespace,
		PodName:                          req.PodName,
		ContainerName:                    req.ContainerName,
		ContainerType:                    req.ContainerType.String(),
		ContainerIndex:                   req.ContainerIndex,
		PodType:                          req.PodType,
		OwnerPoolName:                    state.PoolNameShare,
		AllocationResult:                 defaultCPUSet.Clone(),
		OriginalAllocationResult:         defaultCPUSet.Clone(),
		TopologyAwareAssignments:         topologyAwareAssignments,
		OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(topologyAwareAssignments),
		InitTimestamp:                    time.Now().Format(util.QRMTimeFormat),
		Labels:                           general.DeepCopyMap(req.Labels),
		Annotations:                      general.DeepCopyMap(req.Annotations),
		RequestQuantity:                  reqFloat64,
	}

	// update pod entries directly.
	// if one of the subsequent steps fails, the current allocationInfo will be deleted from podEntries
	// in the defer function of the allocation function.
	p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
	podEntries := p.state.GetPodEntries()

	updatedMachineState, err := nativepolicyutil.GenerateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
	}
	p.state.SetMachineState(updatedMachineState)

	resp, err := cpuutil.PackAllocationResponse(allocationInfo, string(v1.ResourceCPU), util.OCIPropertyNameCPUSetCPUs, false, true, req)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s PackResourceAllocationResponseByAllocationInfo failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("PackResourceAllocationResponseByAllocationInfo failed with error: %v", err)
	}
	return resp, nil
}
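
// allocateCPUs picks numCPUs CPUs for a dedicated-cores container in two phases: it first
// takes CPUs from the NUMA nodes listed in the topology hint (aligned CPUs), and then tops
// up from the remaining allocatable CPUs. CPUs in reusableCPUs (e.g. those released by the
// pod's init containers) are treated as allocatable as well.
//
// Illustrative example (numbers are hypothetical): with hint.Nodes = [0], numCPUs = 6 and
// only 4 free CPUs on NUMA node 0, the first takeByTopology call returns those 4 aligned
// CPUs and the second call supplies the remaining 2 from other NUMA nodes.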
func (p *NativePolicy) allocateCPUs(machineState state.NUMANodeMap, numCPUs int, hint *pluginapi.TopologyHint, reusableCPUs machine.CPUSet) (machine.CPUSet, error) {
	klog.InfoS("AllocateCPUs", "numCPUs", numCPUs, "hint", hint)

	allocatableCPUs := machineState.GetAvailableCPUSet(p.reservedCPUs).Union(reusableCPUs)

	// If there are aligned CPUs available on the NUMA nodes in the hint, attempt to take those first.
	result := machine.NewCPUSet()
	if hint != nil {
		alignedCPUs := machine.NewCPUSet()
		for _, numaNode := range hint.Nodes {
			alignedCPUs = alignedCPUs.Union(machineState[int(numaNode)].GetAvailableCPUSet(p.reservedCPUs))
		}

		numAlignedToAlloc := alignedCPUs.Size()
		if numCPUs < numAlignedToAlloc {
			numAlignedToAlloc = numCPUs
		}

		alignedCPUs, err := p.takeByTopology(alignedCPUs, numAlignedToAlloc)
		if err != nil {
			return machine.NewCPUSet(), err
		}

		result = result.Union(alignedCPUs)
	}

	// Get any remaining CPUs from what's leftover after attempting to grab aligned ones.
	remainingCPUs, err := p.takeByTopology(allocatableCPUs.Difference(result), numCPUs-result.Size())
	if err != nil {
		return machine.NewCPUSet(), err
	}
	result = result.Union(remainingCPUs)

	klog.InfoS("AllocateCPUs", "result", result)
	return result, nil
}
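
// takeByTopology delegates CPU selection to the calculator package: with the "distributed"
// allocation option it spreads CPUs across NUMA nodes (rounded to whole physical cores when
// enableFullPhysicalCPUsOnly is set); otherwise it packs CPUs onto as few sockets and cores
// as possible.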
func (p *NativePolicy) takeByTopology(availableCPUs machine.CPUSet, numCPUs int) (machine.CPUSet, error) {
	if p.cpuAllocationOption == nativepolicyutil.CPUResourcePluginNativePolicyAllocationOptionDistributed {
		cpuGroupSize := 1
		if p.enableFullPhysicalCPUsOnly {
			cpuGroupSize = p.machineInfo.CPUsPerCore()
		}
		return calculator.TakeByTopologyNUMADistributed(p.machineInfo.CPUTopology, availableCPUs, numCPUs, cpuGroupSize)
	}
	return calculator.TakeByTopologyNUMAPacked(p.machineInfo.CPUTopology, availableCPUs, numCPUs)
}
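
// updateCPUsToReuse maintains the per-pod set of CPUs that can be reused by later containers
// of the same pod: cpusets assigned to init containers are added to the reusable set, cpusets
// assigned to app containers are removed from it, and entries for other pods are dropped so
// the map only tracks the pod currently being handled.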
func (p *NativePolicy) updateCPUsToReuse(req *pluginapi.ResourceRequest, cset machine.CPUSet) {
	// If entries for pods other than the current one exist in p.cpusToReuse, delete them.
	for podUID := range p.cpusToReuse {
		if podUID != req.PodUid {
			delete(p.cpusToReuse, podUID)
		}
	}
	// If no cpuset exists for cpusToReuse by this pod yet, create one.
	if _, ok := p.cpusToReuse[req.PodUid]; !ok {
		p.cpusToReuse[req.PodUid] = machine.NewCPUSet()
	}
	// Check if the container is an init container.
	// If so, add its cpuset to the cpuset of reusable CPUs for any new allocations.
	if req.ContainerType == pluginapi.ContainerType_INIT {
		p.cpusToReuse[req.PodUid] = p.cpusToReuse[req.PodUid].Union(cset)
		return
	}
	// Otherwise it is an app container.
	// Remove its cpuset from the cpuset of reusable CPUs for any new allocations.
	p.cpusToReuse[req.PodUid] = p.cpusToReuse[req.PodUid].Difference(cset)
}