github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package topology 18 19 import ( 20 "context" 21 "encoding/json" 22 "fmt" 23 "strconv" 24 "sync" 25 "time" 26 27 "github.com/fsnotify/fsnotify" 28 info "github.com/google/cadvisor/info/v1" 29 "github.com/pkg/errors" 30 "google.golang.org/grpc" 31 v1 "k8s.io/api/core/v1" 32 "k8s.io/apimachinery/pkg/api/resource" 33 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 34 utilerrors "k8s.io/apimachinery/pkg/util/errors" 35 "k8s.io/apimachinery/pkg/util/sets" 36 "k8s.io/klog/v2" 37 podresv1 "k8s.io/kubelet/pkg/apis/podresources/v1" 38 resourceutil "k8s.io/kubernetes/pkg/api/v1/resource" 39 40 nodev1alpha1 "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" 41 apiconsts "github.com/kubewharf/katalyst-api/pkg/consts" 42 "github.com/kubewharf/katalyst-api/pkg/utils" 43 "github.com/kubewharf/katalyst-core/pkg/config/generic" 44 "github.com/kubewharf/katalyst-core/pkg/consts" 45 "github.com/kubewharf/katalyst-core/pkg/metaserver" 46 metaserverpod "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod" 47 "github.com/kubewharf/katalyst-core/pkg/metaserver/spd" 48 "github.com/kubewharf/katalyst-core/pkg/util" 49 "github.com/kubewharf/katalyst-core/pkg/util/general" 50 "github.com/kubewharf/katalyst-core/pkg/util/kubelet/podresources" 51 "github.com/kubewharf/katalyst-core/pkg/util/native" 52 ) 53 54 const ( 55 podResourcesClientTimeout = 10 * time.Second 56 getTopologyZonesTimeout = 10 * time.Second 57 podResourcesClientMaxMsgSize = 1024 * 1024 * 16 58 ) 59 60 // NumaInfoGetter is to get numa info 61 type NumaInfoGetter func() ([]info.Node, error) 62 63 // PodResourcesFilter is to filter pod resources which does need to be reported 64 type PodResourcesFilter func(*v1.Pod, *podresv1.PodResources) (*podresv1.PodResources, error) 65 66 var oneQuantity = *resource.NewQuantity(1, resource.DecimalSI) 67 68 type topologyAdapterImpl struct { 69 mutex sync.Mutex 70 client podresv1.PodResourcesListerClient 71 endpoints []string 72 73 // qosConf is used to get pod qos configuration 74 qosConf *generic.QoSConfiguration 75 76 // metaServer is used to fetch pod list to calculate numa allocation 77 metaServer *metaserver.MetaServer 78 79 // numaSocketZoneNodeMap map numa zone node => socket zone node 80 numaSocketZoneNodeMap map[util.ZoneNode]util.ZoneNode 81 82 // skipDeviceNames name of devices which will be skipped in getting numa allocatable and allocation 83 skipDeviceNames sets.String 84 85 // getClientFunc is func to get pod resources lister client 86 getClientFunc podresources.GetClientFunc 87 88 // podResourcesFilter is support to filter out pods or resources which no need report to cnr 89 podResourcesFilter PodResourcesFilter 90 91 // kubeletResourcePluginPaths is the path of kubelet resource plugin 92 kubeletResourcePluginPaths []string 93 94 // resourceNameToZoneTypeMap is a map that stores the mapping relationship between resource names to zone types for device zones 95 resourceNameToZoneTypeMap map[string]string 96 97 // needValidationResources is the resources needed to be validated 98 needValidationResources []string 99 } 100 101 // NewPodResourcesServerTopologyAdapter creates a topology adapter which uses pod resources server 102 func NewPodResourcesServerTopologyAdapter(metaServer *metaserver.MetaServer, qosConf *generic.QoSConfiguration, 103 endpoints []string, kubeletResourcePluginPaths []string, resourceNameToZoneTypeMap map[string]string, 104 skipDeviceNames sets.String, numaInfoGetter NumaInfoGetter, podResourcesFilter PodResourcesFilter, 105 getClientFunc podresources.GetClientFunc, needValidationResources []string, 106 ) (Adapter, error) { 107 numaInfo, err := numaInfoGetter() 108 if err != nil { 109 return nil, fmt.Errorf("failed to get numa info: %s", err) 110 } 111 112 // make sure all candidate kubelet resource plugin paths exist 113 for _, path := range kubeletResourcePluginPaths { 114 // ensure resource plugin path exists 115 err = general.EnsureDirectory(path) 116 if err != nil { 117 return nil, errors.Wrapf(err, "ensure resource plugin path %s exists failed", path) 118 } 119 } 120 121 numaSocketZoneNodeMap := util.GenerateNumaSocketZone(numaInfo) 122 return &topologyAdapterImpl{ 123 endpoints: endpoints, 124 kubeletResourcePluginPaths: kubeletResourcePluginPaths, 125 qosConf: qosConf, 126 metaServer: metaServer, 127 numaSocketZoneNodeMap: numaSocketZoneNodeMap, 128 skipDeviceNames: skipDeviceNames, 129 getClientFunc: getClientFunc, 130 podResourcesFilter: podResourcesFilter, 131 resourceNameToZoneTypeMap: resourceNameToZoneTypeMap, 132 needValidationResources: needValidationResources, 133 }, nil 134 } 135 136 func (p *topologyAdapterImpl) GetTopologyZones(parentCtx context.Context) ([]*nodev1alpha1.TopologyZone, error) { 137 p.mutex.Lock() 138 defer p.mutex.Unlock() 139 140 // always force getting pod list instead of cache 141 ctx := context.WithValue(parentCtx, metaserverpod.BypassCacheKey, metaserverpod.BypassCacheTrue) 142 143 ctx, cancel := context.WithTimeout(ctx, getTopologyZonesTimeout) 144 defer cancel() 145 podList, err := p.metaServer.GetPodList(ctx, nil) 146 if err != nil { 147 return nil, errors.Wrap(err, "get pod list from metaServer failed") 148 } 149 150 listPodResourcesResponse, err := p.client.List(ctx, &podresv1.ListPodResourcesRequest{}) 151 if err != nil { 152 return nil, errors.Wrap(err, "list pod from pod resource server failed") 153 } 154 155 allocatableResources, err := p.client.GetAllocatableResources(ctx, &podresv1.AllocatableResourcesRequest{}) 156 if err != nil { 157 return nil, errors.Wrap(err, "get allocatable Resources from pod resource server failed") 158 } 159 160 if klog.V(5).Enabled() { 161 listPodResourcesResponseStr, _ := json.Marshal(listPodResourcesResponse) 162 allocatableResourcesResponseStr, _ := json.Marshal(allocatableResources) 163 klog.Infof("list pod Resources: %s\n allocatable Resources: %s", string(listPodResourcesResponseStr), 164 string(allocatableResourcesResponseStr)) 165 } 166 167 // validate pod Resources server response to make sure report topology status is correct 168 if err = p.validatePodResourcesServerResponse(allocatableResources, listPodResourcesResponse); err != nil { 169 return nil, errors.Wrap(err, "validate pod Resources server response failed") 170 } 171 172 podResources := listPodResourcesResponse.GetPodResources() 173 if len(podResources) == 0 { 174 return nil, errors.Errorf("list pod resources response is empty") 175 } 176 177 // filter already allocated pods 178 podResourcesList := filterAllocatedPodResourcesList(podResources) 179 180 // get numa Allocations by pod Resources 181 zoneAllocations, err := p.getZoneAllocations(podList, podResourcesList) 182 if err != nil { 183 return nil, errors.Wrap(err, "get zone allocations failed") 184 } 185 186 // get zone resources by allocatable resources 187 zoneResources, err := p.getZoneResources(allocatableResources) 188 if err != nil { 189 return nil, errors.Wrap(err, "get zone resources failed") 190 } 191 192 // get zone attributes by allocatable resources 193 zoneAttributes, err := p.getZoneAttributes(allocatableResources) 194 if err != nil { 195 return nil, errors.Wrap(err, "get zone attributes failed") 196 } 197 198 // get zone siblings by SiblingNumaMap 199 zoneSiblings, err := p.getZoneSiblings() 200 if err != nil { 201 return nil, errors.Wrap(err, "get zone siblings failed") 202 } 203 204 // initialize a topology zone generator by numa socket zone node map 205 topologyZoneGenerator, err := util.NewNumaSocketTopologyZoneGenerator(p.numaSocketZoneNodeMap) 206 if err != nil { 207 return nil, err 208 } 209 210 // add other children zone node of numa or socket into topology zone generator by allocatable resources 211 err = p.addNumaSocketChildrenZoneNodes(topologyZoneGenerator, allocatableResources) 212 if err != nil { 213 return nil, errors.Wrap(err, "get socket and numa zone topology failed") 214 } 215 216 err = p.addDeviceZoneNodes(topologyZoneGenerator, allocatableResources) 217 if err != nil { 218 return nil, errors.Wrap(err, "get device zone topology failed") 219 } 220 221 return topologyZoneGenerator.GenerateTopologyZoneStatus(zoneAllocations, zoneResources, zoneAttributes, zoneSiblings), nil 222 } 223 224 // GetTopologyPolicy return newest topology policy status 225 func (p *topologyAdapterImpl) GetTopologyPolicy(ctx context.Context) (nodev1alpha1.TopologyPolicy, error) { 226 p.mutex.Lock() 227 defer p.mutex.Unlock() 228 229 klConfig, err := p.metaServer.GetKubeletConfig(ctx) 230 if err != nil { 231 return "", errors.Wrap(err, "get kubelet config failed") 232 } 233 234 return utils.GenerateTopologyPolicy(klConfig.TopologyManagerPolicy, klConfig.TopologyManagerScope), nil 235 } 236 237 func (p *topologyAdapterImpl) Run(ctx context.Context, handler func()) error { 238 var ( 239 err error 240 conn *grpc.ClientConn 241 ) 242 p.mutex.Lock() 243 defer p.mutex.Unlock() 244 245 p.client, conn, err = p.getClientFunc( 246 general.GetOneExistPath(p.endpoints), podResourcesClientTimeout, podResourcesClientMaxMsgSize) 247 if err != nil { 248 return fmt.Errorf("get podResources client failed, connect err: %s", err) 249 } 250 251 // register file watcher to watch qrm checkpoint file change 252 watcher, err := general.RegisterFileEventWatcher( 253 ctx.Done(), 254 general.FileWatcherInfo{ 255 Path: p.kubeletResourcePluginPaths, 256 Filename: consts.KubeletQoSResourceManagerCheckpoint, 257 Op: fsnotify.Create, 258 }, 259 ) 260 if err != nil { 261 return fmt.Errorf("register file watcher failed, err: %s", err) 262 } 263 264 // start a goroutine to watch qrm checkpoint file change and notify to update topology status, 265 // and when qrm checkpoint file changed, it means that the topology status may be changed 266 go func() { 267 defer func() { 268 err = conn.Close() 269 if err != nil { 270 klog.Errorf("pod resource connection close failed: %v", err) 271 } 272 }() 273 for { 274 select { 275 case <-ctx.Done(): 276 klog.Infof("stopping pod resources server topology adapter") 277 return 278 case _, ok := <-watcher: 279 if !ok { 280 klog.Warningf("watcher channel closed") 281 return 282 } 283 klog.Infof("qrm state file changed, notify to update topology status") 284 if handler != nil { 285 handler() 286 } 287 } 288 } 289 }() 290 291 return nil 292 } 293 294 // validatePodResourcesServerResponse validate pod resources server response, if the resource is empty, 295 // maybe the kubelet or qrm plugin is restarting 296 func (p *topologyAdapterImpl) validatePodResourcesServerResponse(allocatableResourcesResponse *podresv1. 297 AllocatableResourcesResponse, listPodResourcesResponse *podresv1.ListPodResourcesResponse, 298 ) error { 299 if len(p.needValidationResources) > 0 { 300 if allocatableResourcesResponse == nil { 301 return fmt.Errorf("allocatable resources response is nil") 302 } 303 304 allocResSet := sets.NewString() 305 for _, res := range allocatableResourcesResponse.Resources { 306 allocResSet.Insert(res.ResourceName) 307 } 308 309 if !allocResSet.HasAll(p.needValidationResources...) { 310 return fmt.Errorf("allocatable resources response doen't contain all the resources that need to be validated") 311 } 312 } 313 314 if listPodResourcesResponse == nil { 315 return fmt.Errorf("list pod Resources response is nil") 316 } 317 318 return nil 319 } 320 321 // addNumaSocketChildrenZoneNodes add the child nodes of socket or numa zone nodes to the generator, the child nodes are 322 // generated by generateZoneNode according to TopologyLevel, Type and Name in TopologyAwareAllocatableQuantityList 323 func (p *topologyAdapterImpl) addNumaSocketChildrenZoneNodes(generator *util.TopologyZoneGenerator, 324 allocatableResources *podresv1.AllocatableResourcesResponse, 325 ) error { 326 if allocatableResources == nil { 327 return fmt.Errorf("allocatable Resources is nil") 328 } 329 330 var errList []error 331 for _, resources := range allocatableResources.Resources { 332 for _, quantity := range resources.TopologyAwareAllocatableQuantityList { 333 if quantity == nil || len(quantity.Type) == 0 { 334 continue 335 } 336 337 zoneNode, parentZoneNode, err := p.generateZoneNode(*quantity) 338 if err != nil { 339 errList = append(errList, fmt.Errorf("get zone key from quantity %v failed: %v", quantity, err)) 340 continue 341 } 342 343 err = generator.AddNode(parentZoneNode, zoneNode) 344 if err != nil { 345 errList = append(errList, err) 346 continue 347 } 348 } 349 } 350 351 if len(errList) > 0 { 352 return utilerrors.NewAggregate(errList) 353 } 354 355 return nil 356 } 357 358 // addDeviceZoneNodes add the device nodes which are children of numa zone nodes to the generator, the device nodes are 359 // generated by generateZoneNode according to TopologyLevel, Type and Name in TopologyAwareAllocatableQuantityList 360 func (p *topologyAdapterImpl) addDeviceZoneNodes(generator *util.TopologyZoneGenerator, 361 allocatableResources *podresv1.AllocatableResourcesResponse, 362 ) error { 363 if allocatableResources == nil { 364 return fmt.Errorf("allocatable Resources is nil") 365 } 366 var errList []error 367 for _, device := range allocatableResources.Devices { 368 if targetZoneType, ok := p.resourceNameToZoneTypeMap[device.ResourceName]; ok { 369 for _, deviceId := range device.DeviceIds { 370 deviceNode := util.GenerateDeviceZoneNode(deviceId, targetZoneType) 371 for _, numaNode := range device.Topology.Nodes { 372 numaZoneNode := util.GenerateNumaZoneNode(int(numaNode.ID)) 373 err := generator.AddNode(&numaZoneNode, deviceNode) 374 if err != nil { 375 errList = append(errList, err) 376 } 377 } 378 } 379 } 380 } 381 382 if len(errList) > 0 { 383 return utilerrors.NewAggregate(errList) 384 } 385 386 return nil 387 } 388 389 // getZoneResources gets a map of zone node to zone Resources. The zone node Resources is combined by allocatable 390 // device and allocatable resources from pod resources server 391 func (p *topologyAdapterImpl) getZoneResources(allocatableResources *podresv1.AllocatableResourcesResponse) (map[util.ZoneNode]nodev1alpha1.Resources, error) { 392 var ( 393 errList []error 394 err error 395 ) 396 397 if allocatableResources == nil { 398 return nil, fmt.Errorf("allocatable Resources is nil") 399 } 400 401 zoneAllocatable := make(map[util.ZoneNode]*v1.ResourceList) 402 zoneCapacity := make(map[util.ZoneNode]*v1.ResourceList) 403 404 zoneAllocatable, err = p.addContainerDevices(zoneAllocatable, allocatableResources.Devices) 405 if err != nil { 406 return nil, err 407 } 408 409 // todo: the capacity and allocatable are equally now because the response includes all 410 // devices which don't consider them whether is healthy 411 zoneCapacity, err = p.addContainerDevices(zoneCapacity, allocatableResources.Devices) 412 if err != nil { 413 return nil, err 414 } 415 416 // calculate Resources capacity and allocatable 417 for _, resources := range allocatableResources.Resources { 418 if resources == nil { 419 continue 420 } 421 422 resourceName := v1.ResourceName(resources.ResourceName) 423 zoneCapacity, err = p.addTopologyAwareQuantity(zoneCapacity, resourceName, resources.TopologyAwareCapacityQuantityList) 424 if err != nil { 425 errList = append(errList, err) 426 continue 427 } 428 429 zoneAllocatable, err = p.addTopologyAwareQuantity(zoneAllocatable, resourceName, resources.TopologyAwareAllocatableQuantityList) 430 if err != nil { 431 errList = append(errList, err) 432 continue 433 } 434 } 435 436 zoneCapacity, err = p.addNumaMemoryBandwidthResources(zoneCapacity, p.metaServer.SiblingNumaAvgMBWCapacityMap) 437 if err != nil { 438 errList = append(errList, err) 439 } 440 441 zoneAllocatable, err = p.addNumaMemoryBandwidthResources(zoneAllocatable, p.metaServer.SiblingNumaAvgMBWAllocatableMap) 442 if err != nil { 443 errList = append(errList, err) 444 } 445 446 if len(errList) > 0 { 447 return nil, utilerrors.NewAggregate(errList) 448 } 449 450 resources := make(map[util.ZoneNode]nodev1alpha1.Resources) 451 for zone, capacity := range zoneCapacity { 452 allocatable, ok := zoneAllocatable[zone] 453 if !ok { 454 return nil, fmt.Errorf("zone %v capacity found but allocatable is not found", zone) 455 } 456 457 resources[zone] = nodev1alpha1.Resources{ 458 Capacity: capacity, 459 Allocatable: allocatable, 460 } 461 } 462 463 return resources, nil 464 } 465 466 // getZoneAllocations gets a map of zone nodes to zone allocations computed from a list of pod resources that aggregates per-container allocations using 467 // aggregateContainerAllocated. The podResourcesFilter is used to filter out some pods that do not need to be reported to cnr 468 func (p *topologyAdapterImpl) getZoneAllocations(podList []*v1.Pod, podResourcesList []*podresv1.PodResources) (map[util.ZoneNode]util.ZoneAllocations, error) { 469 var ( 470 err error 471 errList []error 472 ) 473 474 podMap := native.GetPodNamespaceNameKeyMap(podList) 475 zoneAllocationsMap := make(map[util.ZoneNode]util.ZoneAllocations) 476 for _, podResources := range podResourcesList { 477 if podResources == nil { 478 continue 479 } 480 481 podKey := native.GenerateNamespaceNameKey(podResources.Namespace, podResources.Name) 482 pod, ok := podMap[podKey] 483 if !ok { 484 errList = append(errList, fmt.Errorf("pod %s not found in metaserver", podKey)) 485 continue 486 } 487 488 if native.PodIsTerminated(pod) { 489 continue 490 } 491 492 // the pod resource filter will filter out unwanted pods 493 if p.podResourcesFilter != nil { 494 podResources, err = p.podResourcesFilter(pod, podResources) 495 if err != nil { 496 errList = append(errList, err) 497 continue 498 } 499 500 // if podResources is nil, it means that the pod is filtered out 501 if podResources == nil { 502 continue 503 } 504 } 505 506 // aggregates resources in each zone used by all containers of the pod 507 podAllocated, err := p.aggregateContainerAllocated(pod.ObjectMeta, podResources.Containers) 508 if err != nil { 509 errList = append(errList, fmt.Errorf("pod %s aggregate container allocated failed, %s", podKey, err)) 510 continue 511 } 512 513 // revise pod allocated according qos level 514 err = p.revisePodAllocated(pod, podAllocated) 515 if err != nil { 516 errList = append(errList, fmt.Errorf("pod %s revise pod allocated failed, %s", podKey, err)) 517 continue 518 } 519 520 for zoneNode, resourceList := range podAllocated { 521 _, ok := zoneAllocationsMap[zoneNode] 522 if !ok { 523 zoneAllocationsMap[zoneNode] = util.ZoneAllocations{} 524 } 525 526 zoneAllocationsMap[zoneNode] = append(zoneAllocationsMap[zoneNode], &nodev1alpha1.Allocation{ 527 Consumer: native.GenerateUniqObjectUIDKey(pod), 528 Requests: resourceList, 529 }) 530 } 531 } 532 533 if len(errList) > 0 { 534 return nil, utilerrors.NewAggregate(errList) 535 } 536 537 return zoneAllocationsMap, nil 538 } 539 540 // revisePodAllocated is to revise pod allocated according to its qos level 541 func (p *topologyAdapterImpl) revisePodAllocated(pod *v1.Pod, podAllocated map[util.ZoneNode]*v1.ResourceList) error { 542 qosLevel, err := p.qosConf.GetQoSLevel(pod, map[string]string{}) 543 if err != nil { 544 return err 545 } 546 547 switch qosLevel { 548 case apiconsts.PodAnnotationQoSLevelSharedCores: 549 // revise shared_cores pod allocated according to its numa binding 550 return p.reviseSharedCoresPodAllocated(pod, podAllocated) 551 default: 552 return nil 553 } 554 } 555 556 // reviseSharedCoresPodAllocated is to revise shared_cores pod allocated according to its numa binding 557 func (p *topologyAdapterImpl) reviseSharedCoresPodAllocated(pod *v1.Pod, podAllocated map[util.ZoneNode]*v1.ResourceList) error { 558 ok, err := util.ValidateSharedCoresWithNumaBindingPod(p.qosConf, pod, podAllocated) 559 if !ok || err != nil { 560 return err 561 } 562 563 for zoneNode, resourceList := range podAllocated { 564 if zoneNode.Meta.Type != nodev1alpha1.TopologyTypeNuma { 565 continue 566 } 567 568 if resourceList != nil && 569 (!resourceList.Cpu().IsZero() || !resourceList.Memory().IsZero()) { 570 571 // revise the allocated resources to the binding numa node 572 requests, _ := resourceutil.PodRequestsAndLimits(pod) 573 if requests != nil { 574 (*resourceList)[v1.ResourceCPU] = requests.Cpu().DeepCopy() 575 (*resourceList)[v1.ResourceMemory] = requests.Memory().DeepCopy() 576 } 577 578 // shared_cores with numa binding pod cpu and memory are only bound to one numa, 579 break 580 } 581 } 582 583 return nil 584 } 585 586 // getZoneAttributes gets a map of zone node to zone attributes, which is generated from the annotation of 587 // topology aware quantity and socket and numa zone are not support attribute here 588 func (p *topologyAdapterImpl) getZoneAttributes(allocatableResources *podresv1.AllocatableResourcesResponse) (map[util.ZoneNode]util.ZoneAttributes, error) { 589 if allocatableResources == nil { 590 return nil, fmt.Errorf("allocatable Resources is nil") 591 } 592 593 var errList []error 594 zoneAttributes := make(map[util.ZoneNode]util.ZoneAttributes) 595 for _, resources := range allocatableResources.Resources { 596 if resources == nil { 597 continue 598 } 599 600 for _, quantity := range resources.TopologyAwareAllocatableQuantityList { 601 // only quantity with type need report attributes, and others such as Socket and Numa 602 // no need report that 603 if quantity == nil || len(quantity.Type) == 0 { 604 continue 605 } 606 607 zoneNode, _, err := p.generateZoneNode(*quantity) 608 if err != nil { 609 errList = append(errList, fmt.Errorf("get zone node from quantity %v failed: %v", quantity, err)) 610 continue 611 } 612 613 if _, ok := zoneAttributes[zoneNode]; !ok { 614 zoneAttributes[zoneNode] = util.ZoneAttributes{} 615 } 616 617 var attrs []nodev1alpha1.Attribute 618 for annoKey, value := range quantity.Annotations { 619 attrs = append(attrs, nodev1alpha1.Attribute{ 620 Name: annoKey, 621 Value: value, 622 }) 623 } 624 625 zoneAttributes[zoneNode] = util.MergeAttributes(zoneAttributes[zoneNode], attrs) 626 } 627 } 628 629 if len(errList) > 0 { 630 return nil, utilerrors.NewAggregate(errList) 631 } 632 633 return zoneAttributes, nil 634 } 635 636 // aggregateContainerAllocated aggregates resources in each zone used by all containers of a pod and returns a map of zone node to 637 // container allocated resources. 638 func (p *topologyAdapterImpl) aggregateContainerAllocated(podMeta metav1.ObjectMeta, containers []*podresv1.ContainerResources) (map[util.ZoneNode]*v1.ResourceList, error) { 639 var errList []error 640 641 podAllocated := make(map[util.ZoneNode]*v1.ResourceList) 642 for _, containerResources := range containers { 643 if containerResources == nil { 644 continue 645 } 646 647 var err error 648 containerAllocated := make(map[util.ZoneNode]*v1.ResourceList) 649 containerAllocated, err = p.addContainerDevices(containerAllocated, containerResources.Devices) 650 if err != nil { 651 errList = append(errList, fmt.Errorf("get container %s devices allocated failed: %s", 652 containerResources.Name, err)) 653 continue 654 } 655 656 containerAllocated, err = p.addContainerResources(containerAllocated, containerResources.Resources) 657 if err != nil { 658 errList = append(errList, fmt.Errorf("get container %s resources allocated failed: %s", 659 containerResources.Name, err)) 660 continue 661 } 662 663 // add container memory bandwidth according to its allocated numa resources 664 containerAllocated, err = p.addContainerMemoryBandwidth(containerAllocated, podMeta, containerResources.Name) 665 if err != nil { 666 errList = append(errList, fmt.Errorf("get container %s memory bandwidth failed: %s", 667 containerResources.Name, err)) 668 continue 669 } 670 671 for zoneNode, resourceList := range containerAllocated { 672 if resourceList == nil { 673 continue 674 } 675 676 for resourceName, quantity := range *resourceList { 677 podAllocated = addZoneQuantity(podAllocated, zoneNode, resourceName, quantity) 678 } 679 } 680 } 681 682 if len(errList) > 0 { 683 return nil, utilerrors.NewAggregate(errList) 684 } 685 686 return podAllocated, nil 687 } 688 689 // addContainerDevices add all numa zone device into the zone resources map, and the skipDeviceNames is used 690 // to filter out some devices that do not need to be reported to cnr. The device name is the resource name and 691 // the quantity is the number of devices. 692 func (p *topologyAdapterImpl) addContainerDevices(zoneResources map[util.ZoneNode]*v1.ResourceList, 693 containerDevices []*podresv1.ContainerDevices, 694 ) (map[util.ZoneNode]*v1.ResourceList, error) { 695 var errList []error 696 697 if zoneResources == nil { 698 zoneResources = make(map[util.ZoneNode]*v1.ResourceList) 699 } 700 701 for _, device := range containerDevices { 702 if device == nil || device.Topology == nil { 703 continue 704 } 705 706 if p.skipDeviceNames != nil && p.skipDeviceNames.Has(device.ResourceName) { 707 continue 708 } 709 710 resourceName := v1.ResourceName(device.ResourceName) 711 for _, node := range device.Topology.Nodes { 712 if node == nil { 713 continue 714 } 715 716 zoneNode := util.GenerateNumaZoneNode(int(node.ID)) 717 zoneResources = addZoneQuantity(zoneResources, zoneNode, resourceName, oneQuantity) 718 719 if zoneType, ok := p.resourceNameToZoneTypeMap[device.ResourceName]; ok { 720 for _, deviceId := range device.DeviceIds { 721 deviceNode := util.GenerateDeviceZoneNode(deviceId, zoneType) 722 zoneResources = addZoneQuantity(zoneResources, deviceNode, resourceName, oneQuantity) 723 } 724 } 725 } 726 } 727 728 if len(errList) > 0 { 729 return nil, utilerrors.NewAggregate(errList) 730 } 731 732 return zoneResources, nil 733 } 734 735 // addContainerResources add all container resources into the zone resources map, get each resource of each zone node 736 // and add them together to get the total resource of each zone node. 737 func (p *topologyAdapterImpl) addContainerResources(zoneResources map[util.ZoneNode]*v1.ResourceList, 738 topoAwareResources []*podresv1.TopologyAwareResource, 739 ) (map[util.ZoneNode]*v1.ResourceList, error) { 740 var ( 741 errList []error 742 err error 743 ) 744 745 if zoneResources == nil { 746 zoneResources = make(map[util.ZoneNode]*v1.ResourceList) 747 } 748 749 for _, resources := range topoAwareResources { 750 if resources == nil { 751 continue 752 } 753 754 resourceName := v1.ResourceName(resources.ResourceName) 755 zoneResources, err = p.addTopologyAwareQuantity(zoneResources, resourceName, resources.OriginalTopologyAwareQuantityList) 756 if err != nil { 757 errList = append(errList, err) 758 continue 759 } 760 } 761 762 if len(errList) > 0 { 763 return nil, utilerrors.NewAggregate(errList) 764 } 765 766 return zoneResources, nil 767 } 768 769 // addTopologyAwareQuantity add zone node resource into the map according to TopologyAwareQuantity list. Each TopologyAwareQuantity has a 770 // list of topology nodes, and each topology node has name, type, topology level, and annotations, and the resource value. The zone node 771 // is determined by the topology node name, type, topology level, 772 func (p *topologyAdapterImpl) addTopologyAwareQuantity(zoneResourceList map[util.ZoneNode]*v1.ResourceList, resourceName v1.ResourceName, 773 topoAwareQuantityList []*podresv1.TopologyAwareQuantity, 774 ) (map[util.ZoneNode]*v1.ResourceList, error) { 775 var errList []error 776 777 if zoneResourceList == nil { 778 zoneResourceList = make(map[util.ZoneNode]*v1.ResourceList) 779 } 780 781 for _, quantity := range topoAwareQuantityList { 782 783 if quantity == nil { 784 continue 785 } 786 787 zoneNode, _, err := p.generateZoneNode(*quantity) 788 if err != nil { 789 errList = append(errList, fmt.Errorf("get zone node from quantity %v failed: %v", quantity, err)) 790 continue 791 } 792 793 resourceValue, err := resource.ParseQuantity(fmt.Sprintf("%.2f", quantity.ResourceValue)) 794 if err != nil { 795 errList = append(errList, fmt.Errorf("parse resource: %s for zone %s failed: %s", resourceName, zoneNode, err)) 796 continue 797 } 798 799 zoneResourceList = addZoneQuantity(zoneResourceList, zoneNode, resourceName, resourceValue) 800 } 801 802 if len(errList) > 0 { 803 return nil, utilerrors.NewAggregate(errList) 804 } 805 806 return zoneResourceList, nil 807 } 808 809 // addZoneQuantity add a zone and resource quantity into the zone resource map, if the zone node is not in the map, 810 // then create a new resource list for the zone node, and add the resource quantity into the resource list. If the 811 // zone node is in the map, then get the resource list from the map, and add the resource quantity into the resource 812 // list. 813 func addZoneQuantity(zoneResourceList map[util.ZoneNode]*v1.ResourceList, zoneNode util.ZoneNode, 814 resourceName v1.ResourceName, value resource.Quantity, 815 ) map[util.ZoneNode]*v1.ResourceList { 816 if zoneResourceList == nil { 817 zoneResourceList = make(map[util.ZoneNode]*v1.ResourceList) 818 } 819 820 resourceListPtr, ok := zoneResourceList[zoneNode] 821 if !ok || resourceListPtr == nil { 822 resourceListPtr = &v1.ResourceList{} 823 zoneResourceList[zoneNode] = resourceListPtr 824 } 825 resourceList := *resourceListPtr 826 827 quantity, resourceOk := resourceList[resourceName] 828 if !resourceOk { 829 quantity = resource.Quantity{} 830 resourceList[resourceName] = quantity 831 } 832 833 quantity.Add(value) 834 resourceList[resourceName] = quantity 835 836 return zoneResourceList 837 } 838 839 // generateZoneNode get zone node and its parent zone node from quantity according to quantity type and topology level 840 // - if Type is empty, it means that the zone is socket or numa according to TopologyLevel 841 // - if Type is not empty, it means that the zone is a child of socket or a child of numa determined by TopologyLevel, 842 // and the zone name is determined by the quantity name or its resource identifier if existed. 843 func (p *topologyAdapterImpl) generateZoneNode(quantity podresv1.TopologyAwareQuantity) (util.ZoneNode, *util.ZoneNode, error) { 844 nodeID := int(quantity.Node) 845 if len(quantity.Type) == 0 { 846 switch quantity.TopologyLevel { 847 case podresv1.TopologyLevel_NUMA: 848 zoneNode := util.GenerateNumaZoneNode(nodeID) 849 parentZoneNode, ok := p.numaSocketZoneNodeMap[zoneNode] 850 if !ok { 851 return util.ZoneNode{}, nil, fmt.Errorf("numa zone node %v parent not found", zoneNode) 852 } 853 return zoneNode, &parentZoneNode, nil 854 case podresv1.TopologyLevel_SOCKET: 855 zoneNode := util.GenerateSocketZoneNode(nodeID) 856 return zoneNode, nil, nil 857 default: 858 return util.ZoneNode{}, nil, fmt.Errorf("quantity %v unsupport topology level: %s", quantity, quantity.TopologyLevel) 859 } 860 } else { 861 // if quantity has type, the zone's type is quantity type and name is quantity name by default, 862 // and if it has resource identifier annotation use it instead 863 zoneName := quantity.Name 864 if identifier, ok := quantity.Annotations[apiconsts.ResourceAnnotationKeyResourceIdentifier]; ok && len(identifier) != 0 { 865 zoneName = identifier 866 } 867 868 zoneNode := util.ZoneNode{ 869 Meta: util.ZoneMeta{ 870 Type: nodev1alpha1.TopologyType(quantity.Type), 871 Name: zoneName, 872 }, 873 } 874 875 switch quantity.TopologyLevel { 876 case podresv1.TopologyLevel_NUMA: 877 parentZoneNode := util.GenerateNumaZoneNode(nodeID) 878 return zoneNode, &parentZoneNode, nil 879 case podresv1.TopologyLevel_SOCKET: 880 parentZoneNode := util.GenerateSocketZoneNode(nodeID) 881 return zoneNode, &parentZoneNode, nil 882 default: 883 return zoneNode, nil, fmt.Errorf("quantity %v unsupport topology level: %s", quantity, quantity.TopologyLevel) 884 } 885 } 886 } 887 888 func (p *topologyAdapterImpl) getZoneSiblings() (map[util.ZoneNode]util.ZoneSiblings, error) { 889 zoneSiblings := make(map[util.ZoneNode]util.ZoneSiblings) 890 for id, siblings := range p.metaServer.SiblingNumaMap { 891 zoneNode := util.GenerateNumaZoneNode(id) 892 zoneSiblings[zoneNode] = make(util.ZoneSiblings, 0) 893 for sibling := range siblings { 894 zoneSiblings[zoneNode] = append(zoneSiblings[zoneNode], nodev1alpha1.Sibling{ 895 Type: nodev1alpha1.TopologyTypeNuma, 896 Name: strconv.Itoa(sibling), 897 }) 898 } 899 } 900 901 return zoneSiblings, nil 902 } 903 904 // addContainerMemoryBandwidth add container memory bandwidth according to numa cpu allocated and cpu request 905 func (p *topologyAdapterImpl) addContainerMemoryBandwidth(zoneAllocated map[util.ZoneNode]*v1.ResourceList, podMeta metav1.ObjectMeta, name string) (map[util.ZoneNode]*v1.ResourceList, error) { 906 spec, err := p.metaServer.GetContainerSpec(string(podMeta.UID), name) 907 if err != nil { 908 return nil, err 909 } 910 911 cpuRequest := native.CPUQuantityGetter()(spec.Resources.Requests) 912 if cpuRequest.IsZero() { 913 return zoneAllocated, nil 914 } 915 916 numaAllocated := make(map[util.ZoneNode]*v1.ResourceList) 917 for zoneNode, allocated := range zoneAllocated { 918 // only consider numa which is allocated cpu and memory bandwidth capacity greater than zero 919 if zoneNode.Meta.Type == nodev1alpha1.TopologyTypeNuma && allocated != nil && 920 (*allocated).Cpu().CmpInt64(0) > 0 { 921 numaID, err := util.GetZoneID(zoneNode) 922 if err != nil { 923 return nil, err 924 } 925 926 // if the numa avg mbw capacity is zero, we will not consider its mbw allocation 927 if p.metaServer.SiblingNumaAvgMBWCapacityMap[numaID] > 0 { 928 numaAllocated[zoneNode] = allocated 929 } 930 } 931 } 932 933 // only numa allocated container need consider memory bandwidth 934 if len(numaAllocated) > 0 { 935 memoryBandwidthRequest, err := spd.GetContainerMemoryBandwidthRequest(p.metaServer, podMeta, int(cpuRequest.Value())) 936 if err != nil { 937 return nil, err 938 } 939 940 if memoryBandwidthRequest > 0 { 941 memoryBandwidthRequestPerNuma := memoryBandwidthRequest / len(numaAllocated) 942 for _, allocated := range numaAllocated { 943 (*allocated)[apiconsts.ResourceMemoryBandwidth] = *resource.NewQuantity(int64(memoryBandwidthRequestPerNuma), resource.BinarySI) 944 } 945 } 946 } 947 948 return zoneAllocated, nil 949 } 950 951 // addNumaMemoryBandwidthResources add numa memory bandwidth by numa to memory bandwidth map 952 func (p *topologyAdapterImpl) addNumaMemoryBandwidthResources(zoneResources map[util.ZoneNode]*v1.ResourceList, memoryBandwidthMap map[int]int64) (map[util.ZoneNode]*v1.ResourceList, error) { 953 for id, memoryBandwidth := range memoryBandwidthMap { 954 if memoryBandwidth <= 0 { 955 continue 956 } 957 958 numaZoneNode := util.GenerateNumaZoneNode(id) 959 res, ok := zoneResources[numaZoneNode] 960 if !ok || res == nil { 961 zoneResources[numaZoneNode] = &v1.ResourceList{} 962 } 963 (*zoneResources[numaZoneNode])[apiconsts.ResourceMemoryBandwidth] = *resource.NewQuantity(memoryBandwidth, resource.BinarySI) 964 } 965 return zoneResources, nil 966 } 967 968 // filterAllocatedPodResourcesList is to filter pods that have allocated devices or Resources 969 func filterAllocatedPodResourcesList(podResourcesList []*podresv1.PodResources) []*podresv1.PodResources { 970 allocatedPodResourcesList := make([]*podresv1.PodResources, 0, len(podResourcesList)) 971 isAllocatedPod := func(pod *podresv1.PodResources) bool { 972 if pod == nil { 973 return false 974 } 975 976 // filter allocated pod by whether it has at least one container with 977 // devices or Resources 978 for _, container := range pod.Containers { 979 if container != nil && (len(container.Devices) != 0 || 980 len(container.Resources) != 0) { 981 return true 982 } 983 } 984 985 return false 986 } 987 988 for _, pod := range podResourcesList { 989 if isAllocatedPod(pod) { 990 allocatedPodResourcesList = append(allocatedPodResourcesList, pod) 991 } 992 } 993 994 return allocatedPodResourcesList 995 }