github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package topology
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"fmt"
    23  	"strconv"
    24  	"sync"
    25  	"time"
    26  
    27  	"github.com/fsnotify/fsnotify"
    28  	info "github.com/google/cadvisor/info/v1"
    29  	"github.com/pkg/errors"
    30  	"google.golang.org/grpc"
    31  	v1 "k8s.io/api/core/v1"
    32  	"k8s.io/apimachinery/pkg/api/resource"
    33  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    34  	utilerrors "k8s.io/apimachinery/pkg/util/errors"
    35  	"k8s.io/apimachinery/pkg/util/sets"
    36  	"k8s.io/klog/v2"
    37  	podresv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
    38  	resourceutil "k8s.io/kubernetes/pkg/api/v1/resource"
    39  
    40  	nodev1alpha1 "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1"
    41  	apiconsts "github.com/kubewharf/katalyst-api/pkg/consts"
    42  	"github.com/kubewharf/katalyst-api/pkg/utils"
    43  	"github.com/kubewharf/katalyst-core/pkg/config/generic"
    44  	"github.com/kubewharf/katalyst-core/pkg/consts"
    45  	"github.com/kubewharf/katalyst-core/pkg/metaserver"
    46  	metaserverpod "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod"
    47  	"github.com/kubewharf/katalyst-core/pkg/metaserver/spd"
    48  	"github.com/kubewharf/katalyst-core/pkg/util"
    49  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    50  	"github.com/kubewharf/katalyst-core/pkg/util/kubelet/podresources"
    51  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    52  )
    53  
    54  const (
    55  	podResourcesClientTimeout    = 10 * time.Second
    56  	getTopologyZonesTimeout      = 10 * time.Second
    57  	podResourcesClientMaxMsgSize = 1024 * 1024 * 16
    58  )
    59  
// NumaInfoGetter returns the numa nodes of the machine as cadvisor nodes. It is called once by
// NewPodResourcesServerTopologyAdapter to build the numa zone => socket zone mapping.
type NumaInfoGetter func() ([]info.Node, error)
    62  
// PodResourcesFilter filters the pod resources of a pod before its allocations are reported.
// It returns the pod resources that should still be reported; returning nil pod resources
// without an error means the whole pod is filtered out.
type PodResourcesFilter func(*v1.Pod, *podresv1.PodResources) (*podresv1.PodResources, error)
    65  
// oneQuantity is the quantity "1" (decimal SI), used as the per-entry amount when devices
// are counted into zone resources.
var oneQuantity = *resource.NewQuantity(1, resource.DecimalSI)
    67  
// topologyAdapterImpl is the adapter returned by NewPodResourcesServerTopologyAdapter; it builds
// topology zones and the topology policy from the pod resources server and the meta server.
type topologyAdapterImpl struct {
	// mutex is held by Run, GetTopologyZones and GetTopologyPolicy for their whole duration;
	// client is the pod resources lister client, it is created in Run through getClientFunc;
	// endpoints are the candidate pod resources server endpoints, Run picks one existing path
	mutex     sync.Mutex
	client    podresv1.PodResourcesListerClient
	endpoints []string

	// qosConf is used to get the qos level of a pod
	qosConf *generic.QoSConfiguration

	// metaServer is used to fetch the pod list to calculate numa allocation, and the kubelet config
	metaServer *metaserver.MetaServer

	// numaSocketZoneNodeMap maps numa zone node => socket zone node
	numaSocketZoneNodeMap map[util.ZoneNode]util.ZoneNode

	// skipDeviceNames are the names of devices which will be skipped when getting numa allocatable and allocation
	skipDeviceNames sets.String

	// getClientFunc is the func used to get the pod resources lister client
	getClientFunc podresources.GetClientFunc

	// podResourcesFilter is used to filter out pods or resources which do not need to be reported to cnr
	podResourcesFilter PodResourcesFilter

	// kubeletResourcePluginPaths are the kubelet resource plugin paths which are watched in Run
	// for the creation of the qrm checkpoint file
	kubeletResourcePluginPaths []string

	// resourceNameToZoneTypeMap is a map that stores the mapping relationship between resource names and zone types for device zones
	resourceNameToZoneTypeMap map[string]string

	// needValidationResources are the resource names which must be contained in the allocatable resources response
	needValidationResources []string
}
   100  
   101  // NewPodResourcesServerTopologyAdapter creates a topology adapter which uses pod resources server
   102  func NewPodResourcesServerTopologyAdapter(metaServer *metaserver.MetaServer, qosConf *generic.QoSConfiguration,
   103  	endpoints []string, kubeletResourcePluginPaths []string, resourceNameToZoneTypeMap map[string]string,
   104  	skipDeviceNames sets.String, numaInfoGetter NumaInfoGetter, podResourcesFilter PodResourcesFilter,
   105  	getClientFunc podresources.GetClientFunc, needValidationResources []string,
   106  ) (Adapter, error) {
   107  	numaInfo, err := numaInfoGetter()
   108  	if err != nil {
   109  		return nil, fmt.Errorf("failed to get numa info: %s", err)
   110  	}
   111  
   112  	// make sure all candidate kubelet resource plugin paths exist
   113  	for _, path := range kubeletResourcePluginPaths {
   114  		// ensure resource plugin path exists
   115  		err = general.EnsureDirectory(path)
   116  		if err != nil {
   117  			return nil, errors.Wrapf(err, "ensure resource plugin path %s exists failed", path)
   118  		}
   119  	}
   120  
   121  	numaSocketZoneNodeMap := util.GenerateNumaSocketZone(numaInfo)
   122  	return &topologyAdapterImpl{
   123  		endpoints:                  endpoints,
   124  		kubeletResourcePluginPaths: kubeletResourcePluginPaths,
   125  		qosConf:                    qosConf,
   126  		metaServer:                 metaServer,
   127  		numaSocketZoneNodeMap:      numaSocketZoneNodeMap,
   128  		skipDeviceNames:            skipDeviceNames,
   129  		getClientFunc:              getClientFunc,
   130  		podResourcesFilter:         podResourcesFilter,
   131  		resourceNameToZoneTypeMap:  resourceNameToZoneTypeMap,
   132  		needValidationResources:    needValidationResources,
   133  	}, nil
   134  }
   135  
   136  func (p *topologyAdapterImpl) GetTopologyZones(parentCtx context.Context) ([]*nodev1alpha1.TopologyZone, error) {
   137  	p.mutex.Lock()
   138  	defer p.mutex.Unlock()
   139  
   140  	// always force getting pod list instead of cache
   141  	ctx := context.WithValue(parentCtx, metaserverpod.BypassCacheKey, metaserverpod.BypassCacheTrue)
   142  
   143  	ctx, cancel := context.WithTimeout(ctx, getTopologyZonesTimeout)
   144  	defer cancel()
   145  	podList, err := p.metaServer.GetPodList(ctx, nil)
   146  	if err != nil {
   147  		return nil, errors.Wrap(err, "get pod list from metaServer failed")
   148  	}
   149  
   150  	listPodResourcesResponse, err := p.client.List(ctx, &podresv1.ListPodResourcesRequest{})
   151  	if err != nil {
   152  		return nil, errors.Wrap(err, "list pod from pod resource server failed")
   153  	}
   154  
   155  	allocatableResources, err := p.client.GetAllocatableResources(ctx, &podresv1.AllocatableResourcesRequest{})
   156  	if err != nil {
   157  		return nil, errors.Wrap(err, "get allocatable Resources from pod resource server failed")
   158  	}
   159  
   160  	if klog.V(5).Enabled() {
   161  		listPodResourcesResponseStr, _ := json.Marshal(listPodResourcesResponse)
   162  		allocatableResourcesResponseStr, _ := json.Marshal(allocatableResources)
   163  		klog.Infof("list pod Resources: %s\n allocatable Resources: %s", string(listPodResourcesResponseStr),
   164  			string(allocatableResourcesResponseStr))
   165  	}
   166  
   167  	// validate pod Resources server response to make sure report topology status is correct
   168  	if err = p.validatePodResourcesServerResponse(allocatableResources, listPodResourcesResponse); err != nil {
   169  		return nil, errors.Wrap(err, "validate pod Resources server response failed")
   170  	}
   171  
   172  	podResources := listPodResourcesResponse.GetPodResources()
   173  	if len(podResources) == 0 {
   174  		return nil, errors.Errorf("list pod resources response is empty")
   175  	}
   176  
   177  	// filter already allocated pods
   178  	podResourcesList := filterAllocatedPodResourcesList(podResources)
   179  
   180  	// get numa Allocations by pod Resources
   181  	zoneAllocations, err := p.getZoneAllocations(podList, podResourcesList)
   182  	if err != nil {
   183  		return nil, errors.Wrap(err, "get zone allocations failed")
   184  	}
   185  
   186  	// get zone resources by allocatable resources
   187  	zoneResources, err := p.getZoneResources(allocatableResources)
   188  	if err != nil {
   189  		return nil, errors.Wrap(err, "get zone resources failed")
   190  	}
   191  
   192  	// get zone attributes by allocatable resources
   193  	zoneAttributes, err := p.getZoneAttributes(allocatableResources)
   194  	if err != nil {
   195  		return nil, errors.Wrap(err, "get zone attributes failed")
   196  	}
   197  
   198  	// get zone siblings by SiblingNumaMap
   199  	zoneSiblings, err := p.getZoneSiblings()
   200  	if err != nil {
   201  		return nil, errors.Wrap(err, "get zone siblings failed")
   202  	}
   203  
   204  	// initialize a topology zone generator by numa socket zone node map
   205  	topologyZoneGenerator, err := util.NewNumaSocketTopologyZoneGenerator(p.numaSocketZoneNodeMap)
   206  	if err != nil {
   207  		return nil, err
   208  	}
   209  
   210  	// add other children zone node of numa or socket into topology zone generator by allocatable resources
   211  	err = p.addNumaSocketChildrenZoneNodes(topologyZoneGenerator, allocatableResources)
   212  	if err != nil {
   213  		return nil, errors.Wrap(err, "get socket and numa zone topology failed")
   214  	}
   215  
   216  	err = p.addDeviceZoneNodes(topologyZoneGenerator, allocatableResources)
   217  	if err != nil {
   218  		return nil, errors.Wrap(err, "get device zone topology failed")
   219  	}
   220  
   221  	return topologyZoneGenerator.GenerateTopologyZoneStatus(zoneAllocations, zoneResources, zoneAttributes, zoneSiblings), nil
   222  }
   223  
   224  // GetTopologyPolicy return newest topology policy status
   225  func (p *topologyAdapterImpl) GetTopologyPolicy(ctx context.Context) (nodev1alpha1.TopologyPolicy, error) {
   226  	p.mutex.Lock()
   227  	defer p.mutex.Unlock()
   228  
   229  	klConfig, err := p.metaServer.GetKubeletConfig(ctx)
   230  	if err != nil {
   231  		return "", errors.Wrap(err, "get kubelet config failed")
   232  	}
   233  
   234  	return utils.GenerateTopologyPolicy(klConfig.TopologyManagerPolicy, klConfig.TopologyManagerScope), nil
   235  }
   236  
   237  func (p *topologyAdapterImpl) Run(ctx context.Context, handler func()) error {
   238  	var (
   239  		err  error
   240  		conn *grpc.ClientConn
   241  	)
   242  	p.mutex.Lock()
   243  	defer p.mutex.Unlock()
   244  
   245  	p.client, conn, err = p.getClientFunc(
   246  		general.GetOneExistPath(p.endpoints), podResourcesClientTimeout, podResourcesClientMaxMsgSize)
   247  	if err != nil {
   248  		return fmt.Errorf("get podResources client failed, connect err: %s", err)
   249  	}
   250  
   251  	// register file watcher to watch qrm checkpoint file change
   252  	watcher, err := general.RegisterFileEventWatcher(
   253  		ctx.Done(),
   254  		general.FileWatcherInfo{
   255  			Path:     p.kubeletResourcePluginPaths,
   256  			Filename: consts.KubeletQoSResourceManagerCheckpoint,
   257  			Op:       fsnotify.Create,
   258  		},
   259  	)
   260  	if err != nil {
   261  		return fmt.Errorf("register file watcher failed, err: %s", err)
   262  	}
   263  
   264  	// start a goroutine to watch qrm checkpoint file change and notify to update topology status,
   265  	// and when qrm checkpoint file changed, it means that the topology status may be changed
   266  	go func() {
   267  		defer func() {
   268  			err = conn.Close()
   269  			if err != nil {
   270  				klog.Errorf("pod resource connection close failed: %v", err)
   271  			}
   272  		}()
   273  		for {
   274  			select {
   275  			case <-ctx.Done():
   276  				klog.Infof("stopping pod resources server topology adapter")
   277  				return
   278  			case _, ok := <-watcher:
   279  				if !ok {
   280  					klog.Warningf("watcher channel closed")
   281  					return
   282  				}
   283  				klog.Infof("qrm state file changed, notify to update topology status")
   284  				if handler != nil {
   285  					handler()
   286  				}
   287  			}
   288  		}
   289  	}()
   290  
   291  	return nil
   292  }
   293  
   294  // validatePodResourcesServerResponse validate pod resources server response, if the resource is empty,
   295  // maybe the kubelet or qrm plugin is restarting
   296  func (p *topologyAdapterImpl) validatePodResourcesServerResponse(allocatableResourcesResponse *podresv1.
   297  	AllocatableResourcesResponse, listPodResourcesResponse *podresv1.ListPodResourcesResponse,
   298  ) error {
   299  	if len(p.needValidationResources) > 0 {
   300  		if allocatableResourcesResponse == nil {
   301  			return fmt.Errorf("allocatable resources response is nil")
   302  		}
   303  
   304  		allocResSet := sets.NewString()
   305  		for _, res := range allocatableResourcesResponse.Resources {
   306  			allocResSet.Insert(res.ResourceName)
   307  		}
   308  
   309  		if !allocResSet.HasAll(p.needValidationResources...) {
   310  			return fmt.Errorf("allocatable resources response doen't contain all the resources that need to be validated")
   311  		}
   312  	}
   313  
   314  	if listPodResourcesResponse == nil {
   315  		return fmt.Errorf("list pod Resources response is nil")
   316  	}
   317  
   318  	return nil
   319  }
   320  
   321  // addNumaSocketChildrenZoneNodes add the child nodes of socket or numa zone nodes to the generator, the child nodes are
   322  // generated by generateZoneNode according to TopologyLevel, Type and Name in TopologyAwareAllocatableQuantityList
   323  func (p *topologyAdapterImpl) addNumaSocketChildrenZoneNodes(generator *util.TopologyZoneGenerator,
   324  	allocatableResources *podresv1.AllocatableResourcesResponse,
   325  ) error {
   326  	if allocatableResources == nil {
   327  		return fmt.Errorf("allocatable Resources is nil")
   328  	}
   329  
   330  	var errList []error
   331  	for _, resources := range allocatableResources.Resources {
   332  		for _, quantity := range resources.TopologyAwareAllocatableQuantityList {
   333  			if quantity == nil || len(quantity.Type) == 0 {
   334  				continue
   335  			}
   336  
   337  			zoneNode, parentZoneNode, err := p.generateZoneNode(*quantity)
   338  			if err != nil {
   339  				errList = append(errList, fmt.Errorf("get zone key from quantity %v failed: %v", quantity, err))
   340  				continue
   341  			}
   342  
   343  			err = generator.AddNode(parentZoneNode, zoneNode)
   344  			if err != nil {
   345  				errList = append(errList, err)
   346  				continue
   347  			}
   348  		}
   349  	}
   350  
   351  	if len(errList) > 0 {
   352  		return utilerrors.NewAggregate(errList)
   353  	}
   354  
   355  	return nil
   356  }
   357  
   358  // addDeviceZoneNodes add the device nodes which are children of numa zone nodes to the generator, the device nodes are
   359  // generated by generateZoneNode according to TopologyLevel, Type and Name in TopologyAwareAllocatableQuantityList
   360  func (p *topologyAdapterImpl) addDeviceZoneNodes(generator *util.TopologyZoneGenerator,
   361  	allocatableResources *podresv1.AllocatableResourcesResponse,
   362  ) error {
   363  	if allocatableResources == nil {
   364  		return fmt.Errorf("allocatable Resources is nil")
   365  	}
   366  	var errList []error
   367  	for _, device := range allocatableResources.Devices {
   368  		if targetZoneType, ok := p.resourceNameToZoneTypeMap[device.ResourceName]; ok {
   369  			for _, deviceId := range device.DeviceIds {
   370  				deviceNode := util.GenerateDeviceZoneNode(deviceId, targetZoneType)
   371  				for _, numaNode := range device.Topology.Nodes {
   372  					numaZoneNode := util.GenerateNumaZoneNode(int(numaNode.ID))
   373  					err := generator.AddNode(&numaZoneNode, deviceNode)
   374  					if err != nil {
   375  						errList = append(errList, err)
   376  					}
   377  				}
   378  			}
   379  		}
   380  	}
   381  
   382  	if len(errList) > 0 {
   383  		return utilerrors.NewAggregate(errList)
   384  	}
   385  
   386  	return nil
   387  }
   388  
   389  // getZoneResources gets a map of zone node to zone Resources. The zone node Resources is combined by allocatable
   390  // device and allocatable resources from pod resources server
   391  func (p *topologyAdapterImpl) getZoneResources(allocatableResources *podresv1.AllocatableResourcesResponse) (map[util.ZoneNode]nodev1alpha1.Resources, error) {
   392  	var (
   393  		errList []error
   394  		err     error
   395  	)
   396  
   397  	if allocatableResources == nil {
   398  		return nil, fmt.Errorf("allocatable Resources is nil")
   399  	}
   400  
   401  	zoneAllocatable := make(map[util.ZoneNode]*v1.ResourceList)
   402  	zoneCapacity := make(map[util.ZoneNode]*v1.ResourceList)
   403  
   404  	zoneAllocatable, err = p.addContainerDevices(zoneAllocatable, allocatableResources.Devices)
   405  	if err != nil {
   406  		return nil, err
   407  	}
   408  
   409  	// todo: the capacity and allocatable are equally now because the response includes all
   410  	// 		devices which don't consider them whether is healthy
   411  	zoneCapacity, err = p.addContainerDevices(zoneCapacity, allocatableResources.Devices)
   412  	if err != nil {
   413  		return nil, err
   414  	}
   415  
   416  	// calculate Resources capacity and allocatable
   417  	for _, resources := range allocatableResources.Resources {
   418  		if resources == nil {
   419  			continue
   420  		}
   421  
   422  		resourceName := v1.ResourceName(resources.ResourceName)
   423  		zoneCapacity, err = p.addTopologyAwareQuantity(zoneCapacity, resourceName, resources.TopologyAwareCapacityQuantityList)
   424  		if err != nil {
   425  			errList = append(errList, err)
   426  			continue
   427  		}
   428  
   429  		zoneAllocatable, err = p.addTopologyAwareQuantity(zoneAllocatable, resourceName, resources.TopologyAwareAllocatableQuantityList)
   430  		if err != nil {
   431  			errList = append(errList, err)
   432  			continue
   433  		}
   434  	}
   435  
   436  	zoneCapacity, err = p.addNumaMemoryBandwidthResources(zoneCapacity, p.metaServer.SiblingNumaAvgMBWCapacityMap)
   437  	if err != nil {
   438  		errList = append(errList, err)
   439  	}
   440  
   441  	zoneAllocatable, err = p.addNumaMemoryBandwidthResources(zoneAllocatable, p.metaServer.SiblingNumaAvgMBWAllocatableMap)
   442  	if err != nil {
   443  		errList = append(errList, err)
   444  	}
   445  
   446  	if len(errList) > 0 {
   447  		return nil, utilerrors.NewAggregate(errList)
   448  	}
   449  
   450  	resources := make(map[util.ZoneNode]nodev1alpha1.Resources)
   451  	for zone, capacity := range zoneCapacity {
   452  		allocatable, ok := zoneAllocatable[zone]
   453  		if !ok {
   454  			return nil, fmt.Errorf("zone %v capacity found but allocatable is not found", zone)
   455  		}
   456  
   457  		resources[zone] = nodev1alpha1.Resources{
   458  			Capacity:    capacity,
   459  			Allocatable: allocatable,
   460  		}
   461  	}
   462  
   463  	return resources, nil
   464  }
   465  
   466  // getZoneAllocations gets a map of zone nodes to zone allocations computed from a list of pod resources that aggregates per-container allocations using
   467  // aggregateContainerAllocated. The podResourcesFilter is used to filter out some pods that do not need to be reported to cnr
   468  func (p *topologyAdapterImpl) getZoneAllocations(podList []*v1.Pod, podResourcesList []*podresv1.PodResources) (map[util.ZoneNode]util.ZoneAllocations, error) {
   469  	var (
   470  		err     error
   471  		errList []error
   472  	)
   473  
   474  	podMap := native.GetPodNamespaceNameKeyMap(podList)
   475  	zoneAllocationsMap := make(map[util.ZoneNode]util.ZoneAllocations)
   476  	for _, podResources := range podResourcesList {
   477  		if podResources == nil {
   478  			continue
   479  		}
   480  
   481  		podKey := native.GenerateNamespaceNameKey(podResources.Namespace, podResources.Name)
   482  		pod, ok := podMap[podKey]
   483  		if !ok {
   484  			errList = append(errList, fmt.Errorf("pod %s not found in metaserver", podKey))
   485  			continue
   486  		}
   487  
   488  		if native.PodIsTerminated(pod) {
   489  			continue
   490  		}
   491  
   492  		// the pod resource filter will filter out unwanted pods
   493  		if p.podResourcesFilter != nil {
   494  			podResources, err = p.podResourcesFilter(pod, podResources)
   495  			if err != nil {
   496  				errList = append(errList, err)
   497  				continue
   498  			}
   499  
   500  			// if podResources is nil, it means that the pod is filtered out
   501  			if podResources == nil {
   502  				continue
   503  			}
   504  		}
   505  
   506  		// aggregates resources in each zone used by all containers of the pod
   507  		podAllocated, err := p.aggregateContainerAllocated(pod.ObjectMeta, podResources.Containers)
   508  		if err != nil {
   509  			errList = append(errList, fmt.Errorf("pod %s aggregate container allocated failed, %s", podKey, err))
   510  			continue
   511  		}
   512  
   513  		// revise pod allocated according qos level
   514  		err = p.revisePodAllocated(pod, podAllocated)
   515  		if err != nil {
   516  			errList = append(errList, fmt.Errorf("pod %s revise pod allocated failed, %s", podKey, err))
   517  			continue
   518  		}
   519  
   520  		for zoneNode, resourceList := range podAllocated {
   521  			_, ok := zoneAllocationsMap[zoneNode]
   522  			if !ok {
   523  				zoneAllocationsMap[zoneNode] = util.ZoneAllocations{}
   524  			}
   525  
   526  			zoneAllocationsMap[zoneNode] = append(zoneAllocationsMap[zoneNode], &nodev1alpha1.Allocation{
   527  				Consumer: native.GenerateUniqObjectUIDKey(pod),
   528  				Requests: resourceList,
   529  			})
   530  		}
   531  	}
   532  
   533  	if len(errList) > 0 {
   534  		return nil, utilerrors.NewAggregate(errList)
   535  	}
   536  
   537  	return zoneAllocationsMap, nil
   538  }
   539  
   540  // revisePodAllocated is to revise pod allocated according to its qos level
   541  func (p *topologyAdapterImpl) revisePodAllocated(pod *v1.Pod, podAllocated map[util.ZoneNode]*v1.ResourceList) error {
   542  	qosLevel, err := p.qosConf.GetQoSLevel(pod, map[string]string{})
   543  	if err != nil {
   544  		return err
   545  	}
   546  
   547  	switch qosLevel {
   548  	case apiconsts.PodAnnotationQoSLevelSharedCores:
   549  		// revise shared_cores pod allocated according to its numa binding
   550  		return p.reviseSharedCoresPodAllocated(pod, podAllocated)
   551  	default:
   552  		return nil
   553  	}
   554  }
   555  
   556  // reviseSharedCoresPodAllocated is to revise shared_cores pod allocated according to its numa binding
   557  func (p *topologyAdapterImpl) reviseSharedCoresPodAllocated(pod *v1.Pod, podAllocated map[util.ZoneNode]*v1.ResourceList) error {
   558  	ok, err := util.ValidateSharedCoresWithNumaBindingPod(p.qosConf, pod, podAllocated)
   559  	if !ok || err != nil {
   560  		return err
   561  	}
   562  
   563  	for zoneNode, resourceList := range podAllocated {
   564  		if zoneNode.Meta.Type != nodev1alpha1.TopologyTypeNuma {
   565  			continue
   566  		}
   567  
   568  		if resourceList != nil &&
   569  			(!resourceList.Cpu().IsZero() || !resourceList.Memory().IsZero()) {
   570  
   571  			// revise the allocated resources to the binding numa node
   572  			requests, _ := resourceutil.PodRequestsAndLimits(pod)
   573  			if requests != nil {
   574  				(*resourceList)[v1.ResourceCPU] = requests.Cpu().DeepCopy()
   575  				(*resourceList)[v1.ResourceMemory] = requests.Memory().DeepCopy()
   576  			}
   577  
   578  			// shared_cores with numa binding pod cpu and memory are only bound to one numa,
   579  			break
   580  		}
   581  	}
   582  
   583  	return nil
   584  }
   585  
   586  // getZoneAttributes gets a map of zone node to zone attributes, which is generated from the annotation of
   587  // topology aware quantity and socket and numa zone are not support attribute here
   588  func (p *topologyAdapterImpl) getZoneAttributes(allocatableResources *podresv1.AllocatableResourcesResponse) (map[util.ZoneNode]util.ZoneAttributes, error) {
   589  	if allocatableResources == nil {
   590  		return nil, fmt.Errorf("allocatable Resources is nil")
   591  	}
   592  
   593  	var errList []error
   594  	zoneAttributes := make(map[util.ZoneNode]util.ZoneAttributes)
   595  	for _, resources := range allocatableResources.Resources {
   596  		if resources == nil {
   597  			continue
   598  		}
   599  
   600  		for _, quantity := range resources.TopologyAwareAllocatableQuantityList {
   601  			// only quantity with type need report attributes, and others such as Socket and Numa
   602  			// no need report that
   603  			if quantity == nil || len(quantity.Type) == 0 {
   604  				continue
   605  			}
   606  
   607  			zoneNode, _, err := p.generateZoneNode(*quantity)
   608  			if err != nil {
   609  				errList = append(errList, fmt.Errorf("get zone node from quantity %v failed: %v", quantity, err))
   610  				continue
   611  			}
   612  
   613  			if _, ok := zoneAttributes[zoneNode]; !ok {
   614  				zoneAttributes[zoneNode] = util.ZoneAttributes{}
   615  			}
   616  
   617  			var attrs []nodev1alpha1.Attribute
   618  			for annoKey, value := range quantity.Annotations {
   619  				attrs = append(attrs, nodev1alpha1.Attribute{
   620  					Name:  annoKey,
   621  					Value: value,
   622  				})
   623  			}
   624  
   625  			zoneAttributes[zoneNode] = util.MergeAttributes(zoneAttributes[zoneNode], attrs)
   626  		}
   627  	}
   628  
   629  	if len(errList) > 0 {
   630  		return nil, utilerrors.NewAggregate(errList)
   631  	}
   632  
   633  	return zoneAttributes, nil
   634  }
   635  
   636  // aggregateContainerAllocated aggregates resources in each zone used by all containers of a pod and returns a map of zone node to
   637  // container allocated resources.
   638  func (p *topologyAdapterImpl) aggregateContainerAllocated(podMeta metav1.ObjectMeta, containers []*podresv1.ContainerResources) (map[util.ZoneNode]*v1.ResourceList, error) {
   639  	var errList []error
   640  
   641  	podAllocated := make(map[util.ZoneNode]*v1.ResourceList)
   642  	for _, containerResources := range containers {
   643  		if containerResources == nil {
   644  			continue
   645  		}
   646  
   647  		var err error
   648  		containerAllocated := make(map[util.ZoneNode]*v1.ResourceList)
   649  		containerAllocated, err = p.addContainerDevices(containerAllocated, containerResources.Devices)
   650  		if err != nil {
   651  			errList = append(errList, fmt.Errorf("get container %s devices allocated failed: %s",
   652  				containerResources.Name, err))
   653  			continue
   654  		}
   655  
   656  		containerAllocated, err = p.addContainerResources(containerAllocated, containerResources.Resources)
   657  		if err != nil {
   658  			errList = append(errList, fmt.Errorf("get container %s resources allocated failed: %s",
   659  				containerResources.Name, err))
   660  			continue
   661  		}
   662  
   663  		// add container memory bandwidth according to its allocated numa resources
   664  		containerAllocated, err = p.addContainerMemoryBandwidth(containerAllocated, podMeta, containerResources.Name)
   665  		if err != nil {
   666  			errList = append(errList, fmt.Errorf("get container %s memory bandwidth failed: %s",
   667  				containerResources.Name, err))
   668  			continue
   669  		}
   670  
   671  		for zoneNode, resourceList := range containerAllocated {
   672  			if resourceList == nil {
   673  				continue
   674  			}
   675  
   676  			for resourceName, quantity := range *resourceList {
   677  				podAllocated = addZoneQuantity(podAllocated, zoneNode, resourceName, quantity)
   678  			}
   679  		}
   680  	}
   681  
   682  	if len(errList) > 0 {
   683  		return nil, utilerrors.NewAggregate(errList)
   684  	}
   685  
   686  	return podAllocated, nil
   687  }
   688  
   689  // addContainerDevices add all numa zone device into the zone resources map, and the skipDeviceNames is used
   690  // to filter out some devices that do not need to be reported to cnr. The device name is the resource name and
   691  // the quantity is the number of devices.
   692  func (p *topologyAdapterImpl) addContainerDevices(zoneResources map[util.ZoneNode]*v1.ResourceList,
   693  	containerDevices []*podresv1.ContainerDevices,
   694  ) (map[util.ZoneNode]*v1.ResourceList, error) {
   695  	var errList []error
   696  
   697  	if zoneResources == nil {
   698  		zoneResources = make(map[util.ZoneNode]*v1.ResourceList)
   699  	}
   700  
   701  	for _, device := range containerDevices {
   702  		if device == nil || device.Topology == nil {
   703  			continue
   704  		}
   705  
   706  		if p.skipDeviceNames != nil && p.skipDeviceNames.Has(device.ResourceName) {
   707  			continue
   708  		}
   709  
   710  		resourceName := v1.ResourceName(device.ResourceName)
   711  		for _, node := range device.Topology.Nodes {
   712  			if node == nil {
   713  				continue
   714  			}
   715  
   716  			zoneNode := util.GenerateNumaZoneNode(int(node.ID))
   717  			zoneResources = addZoneQuantity(zoneResources, zoneNode, resourceName, oneQuantity)
   718  
   719  			if zoneType, ok := p.resourceNameToZoneTypeMap[device.ResourceName]; ok {
   720  				for _, deviceId := range device.DeviceIds {
   721  					deviceNode := util.GenerateDeviceZoneNode(deviceId, zoneType)
   722  					zoneResources = addZoneQuantity(zoneResources, deviceNode, resourceName, oneQuantity)
   723  				}
   724  			}
   725  		}
   726  	}
   727  
   728  	if len(errList) > 0 {
   729  		return nil, utilerrors.NewAggregate(errList)
   730  	}
   731  
   732  	return zoneResources, nil
   733  }
   734  
   735  // addContainerResources add all container resources into the zone resources map, get each resource of each zone node
   736  // and add them together to get the total resource of each zone node.
   737  func (p *topologyAdapterImpl) addContainerResources(zoneResources map[util.ZoneNode]*v1.ResourceList,
   738  	topoAwareResources []*podresv1.TopologyAwareResource,
   739  ) (map[util.ZoneNode]*v1.ResourceList, error) {
   740  	var (
   741  		errList []error
   742  		err     error
   743  	)
   744  
   745  	if zoneResources == nil {
   746  		zoneResources = make(map[util.ZoneNode]*v1.ResourceList)
   747  	}
   748  
   749  	for _, resources := range topoAwareResources {
   750  		if resources == nil {
   751  			continue
   752  		}
   753  
   754  		resourceName := v1.ResourceName(resources.ResourceName)
   755  		zoneResources, err = p.addTopologyAwareQuantity(zoneResources, resourceName, resources.OriginalTopologyAwareQuantityList)
   756  		if err != nil {
   757  			errList = append(errList, err)
   758  			continue
   759  		}
   760  	}
   761  
   762  	if len(errList) > 0 {
   763  		return nil, utilerrors.NewAggregate(errList)
   764  	}
   765  
   766  	return zoneResources, nil
   767  }
   768  
   769  // addTopologyAwareQuantity add zone node resource into the map according to TopologyAwareQuantity list. Each TopologyAwareQuantity has a
   770  // list of topology nodes, and each topology node has name, type, topology level, and annotations, and the resource value. The zone node
   771  // is determined by the topology node name, type, topology level,
   772  func (p *topologyAdapterImpl) addTopologyAwareQuantity(zoneResourceList map[util.ZoneNode]*v1.ResourceList, resourceName v1.ResourceName,
   773  	topoAwareQuantityList []*podresv1.TopologyAwareQuantity,
   774  ) (map[util.ZoneNode]*v1.ResourceList, error) {
   775  	var errList []error
   776  
   777  	if zoneResourceList == nil {
   778  		zoneResourceList = make(map[util.ZoneNode]*v1.ResourceList)
   779  	}
   780  
   781  	for _, quantity := range topoAwareQuantityList {
   782  
   783  		if quantity == nil {
   784  			continue
   785  		}
   786  
   787  		zoneNode, _, err := p.generateZoneNode(*quantity)
   788  		if err != nil {
   789  			errList = append(errList, fmt.Errorf("get zone node from quantity %v failed: %v", quantity, err))
   790  			continue
   791  		}
   792  
   793  		resourceValue, err := resource.ParseQuantity(fmt.Sprintf("%.2f", quantity.ResourceValue))
   794  		if err != nil {
   795  			errList = append(errList, fmt.Errorf("parse resource: %s for zone %s failed: %s", resourceName, zoneNode, err))
   796  			continue
   797  		}
   798  
   799  		zoneResourceList = addZoneQuantity(zoneResourceList, zoneNode, resourceName, resourceValue)
   800  	}
   801  
   802  	if len(errList) > 0 {
   803  		return nil, utilerrors.NewAggregate(errList)
   804  	}
   805  
   806  	return zoneResourceList, nil
   807  }
   808  
   809  // addZoneQuantity add a zone and resource quantity into the zone resource map, if the zone node is not in the map,
   810  // then create a new resource list for the zone node, and add the resource quantity into the resource list. If the
   811  // zone node is in the map, then get the resource list from the map, and add the resource quantity into the resource
   812  // list.
   813  func addZoneQuantity(zoneResourceList map[util.ZoneNode]*v1.ResourceList, zoneNode util.ZoneNode,
   814  	resourceName v1.ResourceName, value resource.Quantity,
   815  ) map[util.ZoneNode]*v1.ResourceList {
   816  	if zoneResourceList == nil {
   817  		zoneResourceList = make(map[util.ZoneNode]*v1.ResourceList)
   818  	}
   819  
   820  	resourceListPtr, ok := zoneResourceList[zoneNode]
   821  	if !ok || resourceListPtr == nil {
   822  		resourceListPtr = &v1.ResourceList{}
   823  		zoneResourceList[zoneNode] = resourceListPtr
   824  	}
   825  	resourceList := *resourceListPtr
   826  
   827  	quantity, resourceOk := resourceList[resourceName]
   828  	if !resourceOk {
   829  		quantity = resource.Quantity{}
   830  		resourceList[resourceName] = quantity
   831  	}
   832  
   833  	quantity.Add(value)
   834  	resourceList[resourceName] = quantity
   835  
   836  	return zoneResourceList
   837  }
   838  
   839  // generateZoneNode get zone node and its parent zone node from quantity according to quantity type and topology level
   840  //   - if Type is empty, it means that the zone is socket or numa according to TopologyLevel
   841  //   - if Type is not empty, it means that the zone is a child of socket or a child of numa determined by TopologyLevel,
   842  //     and the zone name is determined by the quantity name or its resource identifier if existed.
   843  func (p *topologyAdapterImpl) generateZoneNode(quantity podresv1.TopologyAwareQuantity) (util.ZoneNode, *util.ZoneNode, error) {
   844  	nodeID := int(quantity.Node)
   845  	if len(quantity.Type) == 0 {
   846  		switch quantity.TopologyLevel {
   847  		case podresv1.TopologyLevel_NUMA:
   848  			zoneNode := util.GenerateNumaZoneNode(nodeID)
   849  			parentZoneNode, ok := p.numaSocketZoneNodeMap[zoneNode]
   850  			if !ok {
   851  				return util.ZoneNode{}, nil, fmt.Errorf("numa zone node %v parent not found", zoneNode)
   852  			}
   853  			return zoneNode, &parentZoneNode, nil
   854  		case podresv1.TopologyLevel_SOCKET:
   855  			zoneNode := util.GenerateSocketZoneNode(nodeID)
   856  			return zoneNode, nil, nil
   857  		default:
   858  			return util.ZoneNode{}, nil, fmt.Errorf("quantity %v unsupport topology level: %s", quantity, quantity.TopologyLevel)
   859  		}
   860  	} else {
   861  		// if quantity has type, the zone's type is quantity type and name is quantity name by default,
   862  		// and if it has resource identifier annotation use it instead
   863  		zoneName := quantity.Name
   864  		if identifier, ok := quantity.Annotations[apiconsts.ResourceAnnotationKeyResourceIdentifier]; ok && len(identifier) != 0 {
   865  			zoneName = identifier
   866  		}
   867  
   868  		zoneNode := util.ZoneNode{
   869  			Meta: util.ZoneMeta{
   870  				Type: nodev1alpha1.TopologyType(quantity.Type),
   871  				Name: zoneName,
   872  			},
   873  		}
   874  
   875  		switch quantity.TopologyLevel {
   876  		case podresv1.TopologyLevel_NUMA:
   877  			parentZoneNode := util.GenerateNumaZoneNode(nodeID)
   878  			return zoneNode, &parentZoneNode, nil
   879  		case podresv1.TopologyLevel_SOCKET:
   880  			parentZoneNode := util.GenerateSocketZoneNode(nodeID)
   881  			return zoneNode, &parentZoneNode, nil
   882  		default:
   883  			return zoneNode, nil, fmt.Errorf("quantity %v unsupport topology level: %s", quantity, quantity.TopologyLevel)
   884  		}
   885  	}
   886  }
   887  
   888  func (p *topologyAdapterImpl) getZoneSiblings() (map[util.ZoneNode]util.ZoneSiblings, error) {
   889  	zoneSiblings := make(map[util.ZoneNode]util.ZoneSiblings)
   890  	for id, siblings := range p.metaServer.SiblingNumaMap {
   891  		zoneNode := util.GenerateNumaZoneNode(id)
   892  		zoneSiblings[zoneNode] = make(util.ZoneSiblings, 0)
   893  		for sibling := range siblings {
   894  			zoneSiblings[zoneNode] = append(zoneSiblings[zoneNode], nodev1alpha1.Sibling{
   895  				Type: nodev1alpha1.TopologyTypeNuma,
   896  				Name: strconv.Itoa(sibling),
   897  			})
   898  		}
   899  	}
   900  
   901  	return zoneSiblings, nil
   902  }
   903  
   904  // addContainerMemoryBandwidth add container memory bandwidth according to numa cpu allocated and cpu request
   905  func (p *topologyAdapterImpl) addContainerMemoryBandwidth(zoneAllocated map[util.ZoneNode]*v1.ResourceList, podMeta metav1.ObjectMeta, name string) (map[util.ZoneNode]*v1.ResourceList, error) {
   906  	spec, err := p.metaServer.GetContainerSpec(string(podMeta.UID), name)
   907  	if err != nil {
   908  		return nil, err
   909  	}
   910  
   911  	cpuRequest := native.CPUQuantityGetter()(spec.Resources.Requests)
   912  	if cpuRequest.IsZero() {
   913  		return zoneAllocated, nil
   914  	}
   915  
   916  	numaAllocated := make(map[util.ZoneNode]*v1.ResourceList)
   917  	for zoneNode, allocated := range zoneAllocated {
   918  		// only consider numa which is allocated cpu and memory bandwidth capacity greater than zero
   919  		if zoneNode.Meta.Type == nodev1alpha1.TopologyTypeNuma && allocated != nil &&
   920  			(*allocated).Cpu().CmpInt64(0) > 0 {
   921  			numaID, err := util.GetZoneID(zoneNode)
   922  			if err != nil {
   923  				return nil, err
   924  			}
   925  
   926  			// if the numa avg mbw capacity is zero, we will not consider its mbw allocation
   927  			if p.metaServer.SiblingNumaAvgMBWCapacityMap[numaID] > 0 {
   928  				numaAllocated[zoneNode] = allocated
   929  			}
   930  		}
   931  	}
   932  
   933  	// only numa allocated container need consider memory bandwidth
   934  	if len(numaAllocated) > 0 {
   935  		memoryBandwidthRequest, err := spd.GetContainerMemoryBandwidthRequest(p.metaServer, podMeta, int(cpuRequest.Value()))
   936  		if err != nil {
   937  			return nil, err
   938  		}
   939  
   940  		if memoryBandwidthRequest > 0 {
   941  			memoryBandwidthRequestPerNuma := memoryBandwidthRequest / len(numaAllocated)
   942  			for _, allocated := range numaAllocated {
   943  				(*allocated)[apiconsts.ResourceMemoryBandwidth] = *resource.NewQuantity(int64(memoryBandwidthRequestPerNuma), resource.BinarySI)
   944  			}
   945  		}
   946  	}
   947  
   948  	return zoneAllocated, nil
   949  }
   950  
   951  // addNumaMemoryBandwidthResources add numa memory bandwidth by numa to memory bandwidth map
   952  func (p *topologyAdapterImpl) addNumaMemoryBandwidthResources(zoneResources map[util.ZoneNode]*v1.ResourceList, memoryBandwidthMap map[int]int64) (map[util.ZoneNode]*v1.ResourceList, error) {
   953  	for id, memoryBandwidth := range memoryBandwidthMap {
   954  		if memoryBandwidth <= 0 {
   955  			continue
   956  		}
   957  
   958  		numaZoneNode := util.GenerateNumaZoneNode(id)
   959  		res, ok := zoneResources[numaZoneNode]
   960  		if !ok || res == nil {
   961  			zoneResources[numaZoneNode] = &v1.ResourceList{}
   962  		}
   963  		(*zoneResources[numaZoneNode])[apiconsts.ResourceMemoryBandwidth] = *resource.NewQuantity(memoryBandwidth, resource.BinarySI)
   964  	}
   965  	return zoneResources, nil
   966  }
   967  
   968  // filterAllocatedPodResourcesList is to filter pods that have allocated devices or Resources
   969  func filterAllocatedPodResourcesList(podResourcesList []*podresv1.PodResources) []*podresv1.PodResources {
   970  	allocatedPodResourcesList := make([]*podresv1.PodResources, 0, len(podResourcesList))
   971  	isAllocatedPod := func(pod *podresv1.PodResources) bool {
   972  		if pod == nil {
   973  			return false
   974  		}
   975  
   976  		// filter allocated pod by whether it has at least one container with
   977  		// devices or Resources
   978  		for _, container := range pod.Containers {
   979  			if container != nil && (len(container.Devices) != 0 ||
   980  				len(container.Resources) != 0) {
   981  				return true
   982  			}
   983  		}
   984  
   985  		return false
   986  	}
   987  
   988  	for _, pod := range podResourcesList {
   989  		if isAllocatedPod(pod) {
   990  			allocatedPodResourcesList = append(allocatedPodResourcesList, pod)
   991  		}
   992  	}
   993  
   994  	return allocatedPodResourcesList
   995  }