github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/qrm-plugins/network/staticpolicy/policy.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package staticpolicy 18 19 import ( 20 "context" 21 "fmt" 22 "sort" 23 "strconv" 24 "strings" 25 "sync" 26 "time" 27 28 "k8s.io/apimachinery/pkg/util/wait" 29 pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" 30 maputil "k8s.io/kubernetes/pkg/util/maps" 31 32 apinode "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" 33 apiconsts "github.com/kubewharf/katalyst-api/pkg/consts" 34 "github.com/kubewharf/katalyst-api/pkg/plugins/skeleton" 35 "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" 36 "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/network/state" 37 "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" 38 "github.com/kubewharf/katalyst-core/pkg/config" 39 agentconfig "github.com/kubewharf/katalyst-core/pkg/config/agent" 40 "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" 41 "github.com/kubewharf/katalyst-core/pkg/config/generic" 42 "github.com/kubewharf/katalyst-core/pkg/metaserver" 43 "github.com/kubewharf/katalyst-core/pkg/metrics" 44 "github.com/kubewharf/katalyst-core/pkg/util/cgroup/common" 45 cgroupcmutils "github.com/kubewharf/katalyst-core/pkg/util/cgroup/manager" 46 "github.com/kubewharf/katalyst-core/pkg/util/general" 47 "github.com/kubewharf/katalyst-core/pkg/util/machine" 48 "github.com/kubewharf/katalyst-core/pkg/util/native" 49 "github.com/kubewharf/katalyst-core/pkg/util/qos" 50 ) 51 52 const ( 53 // NetworkResourcePluginPolicyNameStatic is the policy name of static network resource plugin 54 NetworkResourcePluginPolicyNameStatic = string(apiconsts.ResourcePluginPolicyNameStatic) 55 56 NetworkPluginStateFileName = "network_plugin_state" 57 58 // IPsSeparator is used to split merged IPs string 59 IPsSeparator = "," 60 ) 61 62 // StaticPolicy is the static network policy 63 type StaticPolicy struct { 64 sync.Mutex 65 66 name string 67 stopCh chan struct{} 68 started bool 69 qosConfig *generic.QoSConfiguration 70 qrmConfig *qrm.QRMPluginsConfiguration 71 emitter metrics.MetricEmitter 72 metaServer *metaserver.MetaServer 73 agentCtx *agent.GenericContext 74 nics []machine.InterfaceInfo 75 state state.State 76 77 CgroupV2Env bool 78 qosLevelToNetClassMap map[string]uint32 79 applyNetClassFunc func(podUID, containerID string, data *common.NetClsData) error 80 podLevelNetClassAnnoKey string 81 podLevelNetAttributesAnnoKeys []string 82 ipv4ResourceAllocationAnnotationKey string 83 ipv6ResourceAllocationAnnotationKey string 84 netNSPathResourceAllocationAnnotationKey string 85 netInterfaceNameResourceAllocationAnnotationKey string 86 netClassIDResourceAllocationAnnotationKey string 87 netBandwidthResourceAllocationAnnotationKey string 88 } 89 90 // NewStaticPolicy returns a static network policy 91 func NewStaticPolicy(agentCtx *agent.GenericContext, conf *config.Configuration, 92 _ interface{}, agentName string, 93 ) (bool, agent.Component, error) { 94 wrappedEmitter := agentCtx.EmitterPool.GetDefaultMetricsEmitter().WithTags(agentName, metrics.MetricTag{ 95 Key: util.QRMPluginPolicyTagName, 96 Val: NetworkResourcePluginPolicyNameStatic, 97 }) 98 99 // it is incorrect to reserve bandwidth on those diabled NICs. 100 // we only count active NICs as available network devices and allocate bandwidth on them 101 enabledNICs := filterNICsByAvailability(agentCtx.KatalystMachineInfo.ExtraNetworkInfo.Interface, nil, nil) 102 if len(enabledNICs) != 0 { 103 // the NICs should be in order by interface name so that we can adopt specific policies for bandwidth reservation or allocation 104 // e.g. reserve bandwidth for high-priority tasks on the first NIC 105 sort.SliceStable(enabledNICs, func(i, j int) bool { 106 return enabledNICs[i].Iface < enabledNICs[j].Iface 107 }) 108 } else { 109 general.Infof("no valid nics on this node") 110 } 111 112 // we only support one spreading policy for now: reserve the bandwidth on the first NIC. 113 // TODO: make the reservation policy configurable 114 reservation, err := getReservedBandwidth(enabledNICs, conf.ReservedBandwidth, FirstNIC) 115 if err != nil { 116 return false, agent.ComponentStub{}, fmt.Errorf("getReservedBandwidth failed with error: %v", err) 117 } 118 119 stateImpl, err := state.NewCheckpointState(conf.QRMPluginsConfiguration, conf.GenericQRMPluginConfiguration.StateFileDirectory, NetworkPluginStateFileName, 120 NetworkResourcePluginPolicyNameStatic, agentCtx.MachineInfo, enabledNICs, reservation, conf.SkipNetworkStateCorruption) 121 if err != nil { 122 return false, agent.ComponentStub{}, fmt.Errorf("NewCheckpointState failed with error: %v", err) 123 } 124 125 policyImplement := &StaticPolicy{ 126 nics: enabledNICs, 127 qosConfig: conf.QoSConfiguration, 128 qrmConfig: conf.QRMPluginsConfiguration, 129 emitter: wrappedEmitter, 130 metaServer: agentCtx.MetaServer, 131 agentCtx: agentCtx, 132 state: stateImpl, 133 stopCh: make(chan struct{}), 134 name: fmt.Sprintf("%s_%s", agentName, NetworkResourcePluginPolicyNameStatic), 135 qosLevelToNetClassMap: make(map[string]uint32), 136 } 137 138 if common.CheckCgroup2UnifiedMode() { 139 policyImplement.CgroupV2Env = true 140 policyImplement.applyNetClassFunc = agentCtx.MetaServer.ExternalManager.ApplyNetClass 141 } else { 142 policyImplement.CgroupV2Env = false 143 policyImplement.applyNetClassFunc = cgroupcmutils.ApplyNetClsForContainer 144 } 145 146 policyImplement.ApplyConfig(conf.StaticAgentConfiguration) 147 148 pluginWrapper, err := skeleton.NewRegistrationPluginWrapper(policyImplement, conf.QRMPluginSocketDirs, 149 func(key string, value int64) { 150 _ = wrappedEmitter.StoreInt64(key, value, metrics.MetricTypeNameRaw) 151 }) 152 if err != nil { 153 return false, agent.ComponentStub{}, fmt.Errorf("static policy new plugin wrapper failed with error: %v", err) 154 } 155 156 return true, &agent.PluginWrapper{GenericPlugin: pluginWrapper}, nil 157 } 158 159 // ApplyConfig applies config to StaticPolicy 160 func (p *StaticPolicy) ApplyConfig(conf *agentconfig.StaticAgentConfiguration) { 161 p.Lock() 162 defer p.Unlock() 163 164 p.qosLevelToNetClassMap[apiconsts.PodAnnotationQoSLevelReclaimedCores] = conf.NetClass.ReclaimedCores 165 p.qosLevelToNetClassMap[apiconsts.PodAnnotationQoSLevelSharedCores] = conf.NetClass.SharedCores 166 p.qosLevelToNetClassMap[apiconsts.PodAnnotationQoSLevelDedicatedCores] = conf.NetClass.DedicatedCores 167 p.qosLevelToNetClassMap[apiconsts.PodAnnotationQoSLevelSystemCores] = conf.NetClass.SystemCores 168 169 p.podLevelNetClassAnnoKey = conf.PodLevelNetClassAnnoKey 170 p.podLevelNetAttributesAnnoKeys = strings.Split(conf.PodLevelNetAttributesAnnoKeys, ",") 171 p.ipv4ResourceAllocationAnnotationKey = conf.IPv4ResourceAllocationAnnotationKey 172 p.ipv6ResourceAllocationAnnotationKey = conf.IPv6ResourceAllocationAnnotationKey 173 p.netNSPathResourceAllocationAnnotationKey = conf.NetNSPathResourceAllocationAnnotationKey 174 p.netInterfaceNameResourceAllocationAnnotationKey = conf.NetInterfaceNameResourceAllocationAnnotationKey 175 p.netClassIDResourceAllocationAnnotationKey = conf.NetClassIDResourceAllocationAnnotationKey 176 p.netBandwidthResourceAllocationAnnotationKey = conf.NetBandwidthResourceAllocationAnnotationKey 177 178 general.Infof("apply configs, "+ 179 "qosLevelToNetClassMap: %+v, "+ 180 "podLevelNetClassAnnoKey: %s, "+ 181 "podLevelNetAttributesAnnoKeys: %+v", 182 p.qosLevelToNetClassMap, 183 p.podLevelNetClassAnnoKey, 184 p.podLevelNetAttributesAnnoKeys) 185 } 186 187 // Start starts this plugin 188 func (p *StaticPolicy) Start() (err error) { 189 general.Infof("called") 190 191 p.Lock() 192 defer func() { 193 if !p.started { 194 if err == nil { 195 p.started = true 196 } else { 197 close(p.stopCh) 198 } 199 } 200 p.Unlock() 201 }() 202 203 if p.started { 204 general.Infof("already started") 205 return nil 206 } 207 208 p.stopCh = make(chan struct{}) 209 210 go wait.Until(func() { 211 _ = p.emitter.StoreInt64(util.MetricNameHeartBeat, 1, metrics.MetricTypeNameRaw) 212 }, time.Second*30, p.stopCh) 213 go wait.Until(p.applyNetClass, 5*time.Second, p.stopCh) 214 215 return nil 216 } 217 218 // Stop stops this plugin 219 func (p *StaticPolicy) Stop() error { 220 p.Lock() 221 defer func() { 222 p.started = false 223 p.Unlock() 224 general.Infof("stopped") 225 }() 226 227 if !p.started { 228 general.Warningf("already stopped") 229 return nil 230 } 231 close(p.stopCh) 232 return nil 233 } 234 235 // Name returns the name of this plugin 236 func (p *StaticPolicy) Name() string { 237 return p.name 238 } 239 240 // ResourceName returns resource names managed by this plugin 241 func (p *StaticPolicy) ResourceName() string { 242 return string(apiconsts.ResourceNetBandwidth) 243 } 244 245 // GetTopologyHints returns hints of corresponding resources 246 func (p *StaticPolicy) GetTopologyHints(_ context.Context, 247 req *pluginapi.ResourceRequest, 248 ) (resp *pluginapi.ResourceHintsResponse, err error) { 249 if req == nil { 250 return nil, fmt.Errorf("GetTopologyHints got nil req") 251 } 252 253 qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req) 254 if err != nil { 255 err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", 256 req.PodNamespace, req.PodName, req.ContainerName, err) 257 general.Errorf("%s", err.Error()) 258 return nil, err 259 } 260 261 reqInt, _, err := util.GetQuantityFromResourceReq(req) 262 if err != nil { 263 return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) 264 } 265 266 general.InfoS("called", 267 "podNamespace", req.PodNamespace, 268 "podName", req.PodName, 269 "containerName", req.ContainerName, 270 "qosLevel", qosLevel, 271 "resourceRequests", req.ResourceRequests, 272 "reqAnnotations", req.Annotations, 273 "netBandwidthReq(Mbps)", reqInt) 274 275 p.Lock() 276 defer func() { 277 p.Unlock() 278 if err != nil { 279 _ = p.emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw) 280 } 281 }() 282 283 if req.ContainerType == pluginapi.ContainerType_INIT || 284 req.ContainerType == pluginapi.ContainerType_SIDECAR { 285 return util.PackResourceHintsResponse(req, p.ResourceName(), map[string]*pluginapi.ListOfTopologyHints{ 286 p.ResourceName(): nil, // indicates that there is no numa preference 287 }) 288 } 289 290 hints, err := p.calculateHints(req) 291 if err != nil { 292 err = fmt.Errorf("calculateHints for pod: %s/%s, container: %s failed with error: %v", 293 req.PodNamespace, req.PodName, req.ContainerName, err) 294 general.Errorf("%s", err.Error()) 295 return nil, err 296 } 297 298 return util.PackResourceHintsResponse(req, p.ResourceName(), hints) 299 } 300 301 func (p *StaticPolicy) RemovePod(_ context.Context, 302 req *pluginapi.RemovePodRequest, 303 ) (*pluginapi.RemovePodResponse, error) { 304 if req == nil { 305 return nil, fmt.Errorf("RemovePod got nil req") 306 } 307 308 p.Lock() 309 defer p.Unlock() 310 311 if err := p.removePod(req.PodUid); err != nil { 312 general.ErrorS(err, "remove pod failed with error", "podUID", req.PodUid) 313 return nil, err 314 } 315 316 return &pluginapi.RemovePodResponse{}, nil 317 } 318 319 // GetResourcesAllocation returns allocation results of corresponding resources 320 func (p *StaticPolicy) GetResourcesAllocation(_ context.Context, 321 _ *pluginapi.GetResourcesAllocationRequest, 322 ) (*pluginapi.GetResourcesAllocationResponse, error) { 323 // no need to implement this function, because NeedReconcile is false 324 return &pluginapi.GetResourcesAllocationResponse{}, nil 325 } 326 327 // GetTopologyAwareResources returns allocation results of corresponding resources as topology aware format 328 func (p *StaticPolicy) GetTopologyAwareResources(_ context.Context, 329 req *pluginapi.GetTopologyAwareResourcesRequest, 330 ) (*pluginapi.GetTopologyAwareResourcesResponse, error) { 331 if req == nil { 332 return nil, fmt.Errorf("GetTopologyAwareResources got nil req") 333 } 334 335 p.Lock() 336 defer p.Unlock() 337 338 allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName) 339 if allocationInfo == nil { 340 return &pluginapi.GetTopologyAwareResourcesResponse{}, nil 341 } 342 343 socket, err := p.getSocketIDByNIC(allocationInfo.IfName) 344 if err != nil { 345 return nil, fmt.Errorf("failed to find topologyNode for pod %s, container %s : %v", req.PodUid, req.ContainerName, err) 346 } 347 348 nic := p.getNICByName(allocationInfo.IfName) 349 topologyAwareQuantityList := []*pluginapi.TopologyAwareQuantity{ 350 { 351 ResourceValue: float64(allocationInfo.Egress), 352 Node: uint64(socket), 353 Name: allocationInfo.IfName, 354 Type: string(apinode.TopologyTypeNIC), 355 TopologyLevel: pluginapi.TopologyLevel_SOCKET, 356 Annotations: map[string]string{ 357 apiconsts.ResourceAnnotationKeyResourceIdentifier: getResourceIdentifier(nic.NSName, allocationInfo.IfName), 358 apiconsts.ResourceAnnotationKeyNICNetNSName: nic.NSName, 359 }, 360 }, 361 } 362 resp := &pluginapi.GetTopologyAwareResourcesResponse{ 363 PodUid: allocationInfo.PodUid, 364 PodName: allocationInfo.PodName, 365 PodNamespace: allocationInfo.PodNamespace, 366 ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ 367 ContainerName: allocationInfo.ContainerName, 368 }, 369 } 370 371 if allocationInfo.CheckSideCar() { 372 resp.ContainerTopologyAwareResources.AllocatedResources = map[string]*pluginapi.TopologyAwareResource{ 373 string(apiconsts.ResourceNetBandwidth): { 374 IsNodeResource: true, 375 IsScalarResource: true, 376 AggregatedQuantity: 0, 377 OriginalAggregatedQuantity: 0, 378 TopologyAwareQuantityList: nil, 379 OriginalTopologyAwareQuantityList: nil, 380 }, 381 } 382 } else { 383 resp.ContainerTopologyAwareResources.AllocatedResources = map[string]*pluginapi.TopologyAwareResource{ 384 string(apiconsts.ResourceNetBandwidth): { 385 IsNodeResource: true, 386 IsScalarResource: true, 387 AggregatedQuantity: float64(allocationInfo.Egress), 388 OriginalAggregatedQuantity: float64(allocationInfo.Egress), 389 TopologyAwareQuantityList: topologyAwareQuantityList, 390 OriginalTopologyAwareQuantityList: topologyAwareQuantityList, 391 }, 392 } 393 } 394 395 return resp, nil 396 } 397 398 // GetTopologyAwareAllocatableResources returns corresponding allocatable resources as topology aware format 399 func (p *StaticPolicy) GetTopologyAwareAllocatableResources(_ context.Context, 400 _ *pluginapi.GetTopologyAwareAllocatableResourcesRequest, 401 ) (*pluginapi.GetTopologyAwareAllocatableResourcesResponse, error) { 402 p.Lock() 403 defer p.Unlock() 404 405 machineState := p.state.GetMachineState() 406 407 topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) 408 topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) 409 410 var aggregatedAllocatableQuantity, aggregatedCapacityQuantity uint32 = 0, 0 411 for _, iface := range p.nics { 412 nicState := machineState[iface.Iface] 413 if nicState == nil { 414 return nil, fmt.Errorf("nil nicState for NIC: %s", iface.Iface) 415 } 416 417 topologyNode, err := p.getSocketIDByNIC(iface.Iface) 418 if err != nil { 419 return nil, fmt.Errorf("failed to find topologyNode: %v", err) 420 } 421 422 resourceIdentifier := getResourceIdentifier(iface.NSName, iface.Iface) 423 topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ 424 ResourceValue: float64(general.MinUInt32(nicState.EgressState.Allocatable, nicState.IngressState.Allocatable)), 425 Node: uint64(topologyNode), 426 Name: iface.Iface, 427 Type: string(apinode.TopologyTypeNIC), 428 TopologyLevel: pluginapi.TopologyLevel_SOCKET, 429 Annotations: map[string]string{ 430 apiconsts.ResourceAnnotationKeyResourceIdentifier: resourceIdentifier, 431 apiconsts.ResourceAnnotationKeyNICNetNSName: iface.NSName, 432 }, 433 }) 434 topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ 435 ResourceValue: float64(general.MinUInt32(nicState.EgressState.Capacity, nicState.IngressState.Capacity)), 436 Node: uint64(topologyNode), 437 Name: iface.Iface, 438 Type: string(apinode.TopologyTypeNIC), 439 TopologyLevel: pluginapi.TopologyLevel_SOCKET, 440 Annotations: map[string]string{ 441 apiconsts.ResourceAnnotationKeyResourceIdentifier: resourceIdentifier, 442 apiconsts.ResourceAnnotationKeyNICNetNSName: iface.NSName, 443 }, 444 }) 445 aggregatedAllocatableQuantity += general.MinUInt32(nicState.EgressState.Allocatable, nicState.IngressState.Allocatable) 446 aggregatedCapacityQuantity += general.MinUInt32(nicState.EgressState.Capacity, nicState.IngressState.Capacity) 447 } 448 449 return &pluginapi.GetTopologyAwareAllocatableResourcesResponse{ 450 AllocatableResources: map[string]*pluginapi.AllocatableTopologyAwareResource{ 451 string(apiconsts.ResourceNetBandwidth): { 452 IsNodeResource: true, 453 IsScalarResource: true, 454 AggregatedAllocatableQuantity: float64(aggregatedAllocatableQuantity), 455 TopologyAwareAllocatableQuantityList: topologyAwareAllocatableQuantityList, 456 AggregatedCapacityQuantity: float64(aggregatedCapacityQuantity), 457 TopologyAwareCapacityQuantityList: topologyAwareCapacityQuantityList, 458 }, 459 }, 460 }, nil 461 } 462 463 // GetResourcePluginOptions returns options to be communicated with Resource Manager 464 func (p *StaticPolicy) GetResourcePluginOptions(context.Context, 465 *pluginapi.Empty, 466 ) (*pluginapi.ResourcePluginOptions, error) { 467 return &pluginapi.ResourcePluginOptions{ 468 PreStartRequired: false, 469 WithTopologyAlignment: true, 470 NeedReconcile: false, 471 }, nil 472 } 473 474 // Allocate is called during pod admit so that the resource 475 // plugin can allocate corresponding resource for the container 476 // according to resource request 477 func (p *StaticPolicy) Allocate(_ context.Context, 478 req *pluginapi.ResourceRequest, 479 ) (resp *pluginapi.ResourceAllocationResponse, err error) { 480 if req == nil { 481 return nil, fmt.Errorf("GetTopologyHints got nil req") 482 } 483 484 // since qos config util will filter out annotation keys not related to katalyst QoS, 485 // we copy original pod annotations here to use them later 486 podAnnotations := maputil.CopySS(req.Annotations) 487 488 qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req) 489 if err != nil { 490 err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", 491 req.PodNamespace, req.PodName, req.ContainerName, err) 492 general.Errorf("%s", err.Error()) 493 return nil, err 494 } 495 496 reqInt, _, err := util.GetQuantityFromResourceReq(req) 497 if err != nil { 498 return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) 499 } 500 501 general.InfoS("called", 502 "podNamespace", req.PodNamespace, 503 "podName", req.PodName, 504 "containerName", req.ContainerName, 505 "qosLevel", qosLevel, 506 "reqAnnotations", req.Annotations, 507 "netBandwidthReq(Mbps)", reqInt) 508 509 p.Lock() 510 defer func() { 511 p.Unlock() 512 if err != nil { 513 _ = p.emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw) 514 } 515 }() 516 517 emptyResponse := &pluginapi.ResourceAllocationResponse{ 518 PodUid: req.PodUid, 519 PodNamespace: req.PodNamespace, 520 PodName: req.PodName, 521 ContainerName: req.ContainerName, 522 ContainerType: req.ContainerType, 523 ContainerIndex: req.ContainerIndex, 524 PodRole: req.PodRole, 525 PodType: req.PodType, 526 ResourceName: p.ResourceName(), 527 Labels: general.DeepCopyMap(req.Labels), 528 Annotations: general.DeepCopyMap(req.Annotations), 529 } 530 531 // currently, not to deal with init containers 532 if req.ContainerType == pluginapi.ContainerType_INIT { 533 return emptyResponse, nil 534 } else if req.ContainerType == pluginapi.ContainerType_SIDECAR { 535 // not to deal with sidecars, and return a trivial allocationResult to avoid re-allocating 536 return packAllocationResponse(req, &state.AllocationInfo{}, nil, nil) 537 } 538 539 // check allocationInfo is nil or not 540 podEntries := p.state.GetPodEntries() 541 allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName) 542 543 if allocationInfo != nil { 544 if allocationInfo.Egress >= uint32(reqInt) && allocationInfo.Ingress >= uint32(reqInt) { 545 general.InfoS("already allocated and meet requirement", 546 "podNamespace", req.PodNamespace, 547 "podName", req.PodName, 548 "containerName", req.ContainerName, 549 "bandwidthReq(Mbps)", reqInt, 550 "currentResult(Mbps)", allocationInfo.Egress) 551 552 resourceAllocationAnnotations, err := p.getResourceAllocationAnnotations(podAnnotations, allocationInfo) 553 if err != nil { 554 err = fmt.Errorf("getResourceAllocationAnnotations for pod: %s/%s, container: %s failed with error: %v", 555 req.PodNamespace, req.PodName, req.ContainerName, err) 556 general.Errorf("%s", err.Error()) 557 return nil, err 558 } 559 560 resp, packErr := packAllocationResponse(req, allocationInfo, req.Hint, resourceAllocationAnnotations) 561 if packErr != nil { 562 general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v", 563 req.PodNamespace, req.PodName, req.ContainerName, packErr) 564 return nil, fmt.Errorf("packAllocationResponse failed with error: %v", packErr) 565 } 566 return resp, nil 567 } else { 568 general.InfoS("not meet requirement, clear record and re-allocate", 569 "podNamespace", req.PodNamespace, 570 "podName", req.PodName, 571 "containerName", req.ContainerName, 572 "bandwidthReq(Mbps)", reqInt, 573 "currentResult(Mbps)", allocationInfo.Egress) 574 delete(podEntries, req.PodUid) 575 576 _, stateErr := state.GenerateMachineStateFromPodEntries(p.qrmConfig, p.nics, podEntries, p.state.GetReservedBandwidth()) 577 if stateErr != nil { 578 general.ErrorS(stateErr, "generateNetworkMachineStateByPodEntries failed", 579 "podNamespace", req.PodNamespace, 580 "podName", req.PodName, 581 "containerName", req.ContainerName, 582 "bandwidthReq(Mbps)", reqInt, 583 "currentResult(Mbps)", allocationInfo.Egress) 584 return nil, fmt.Errorf("generateNetworkMachineStateByPodEntries failed with error: %v", stateErr) 585 } 586 } 587 } 588 589 candidateNICs, err := p.selectNICsByReq(req) 590 if err != nil { 591 err = fmt.Errorf("selectNICsByReq for pod: %s/%s, container: %s, reqInt: %d, failed with error: %v", 592 req.PodNamespace, req.PodName, req.ContainerName, reqInt, err) 593 general.Errorf("%s", err.Error()) 594 return nil, err 595 } 596 597 if len(candidateNICs) == 0 { 598 general.ErrorS(err, "insufficient bandwidth on this node to satisfy the request", 599 "podNamespace", req.PodNamespace, 600 "podName", req.PodName, 601 "containerName", req.ContainerName, 602 "netBandwidthReq(Mbps)", reqInt, 603 "nicState", p.state.GetMachineState().String()) 604 return nil, fmt.Errorf("failed to meet the bandwidth requirement of %d Mbps", reqInt) 605 } 606 607 // we only support one policy and hard code it for now 608 // TODO: make the policy configurable 609 selectedNIC := selectOneNIC(candidateNICs, RandomOne) 610 general.Infof("select NIC %s to allocate bandwidth (%dMbps)", selectedNIC.Iface, reqInt) 611 612 siblingNUMAs, err := machine.GetSiblingNUMAs(selectedNIC.NumaNode, p.agentCtx.CPUTopology) 613 if err != nil { 614 general.Errorf("get siblingNUMAs for nic: %s failed with error: %v. Incorrect NumaNodes in machineState allocationInfo", selectedNIC.Iface, err) 615 } 616 617 // generate the response hint 618 // it could be different from the req.Hint if the affinitive NIC does not have sufficient bandwidth 619 nicPreference, err := checkNICPreferenceOfReq(selectedNIC, req.Annotations) 620 if err != nil { 621 return nil, fmt.Errorf("checkNICPreferenceOfReq for nic: %s failed with error: %v", selectedNIC.Iface, err) 622 } 623 624 respHint := &pluginapi.TopologyHint{ 625 Nodes: siblingNUMAs.ToSliceUInt64(), 626 Preferred: nicPreference, 627 } 628 629 // generate allocationInfo and update the checkpoint accordingly 630 newAllocation := &state.AllocationInfo{ 631 PodUid: req.PodUid, 632 PodNamespace: req.PodNamespace, 633 PodName: req.PodName, 634 ContainerName: req.ContainerName, 635 ContainerType: req.ContainerType.String(), 636 ContainerIndex: req.ContainerIndex, 637 PodRole: req.PodRole, 638 PodType: req.PodType, 639 Egress: uint32(reqInt), 640 Ingress: uint32(reqInt), 641 IfName: selectedNIC.Iface, 642 NumaNodes: siblingNUMAs, 643 Labels: general.DeepCopyMap(req.Labels), 644 Annotations: general.DeepCopyMap(req.Annotations), 645 } 646 647 resourceAllocationAnnotations, err := p.getResourceAllocationAnnotations(podAnnotations, newAllocation) 648 if err != nil { 649 err = fmt.Errorf("getResourceAllocationAnnotations for pod: %s/%s, container: %s failed with error: %v", 650 req.PodNamespace, req.PodName, req.ContainerName, err) 651 general.Errorf("%s", err.Error()) 652 return nil, err 653 } 654 655 // update PodEntries 656 p.state.SetAllocationInfo(req.PodUid, req.ContainerName, newAllocation) 657 658 machineState, stateErr := state.GenerateMachineStateFromPodEntries(p.qrmConfig, p.nics, p.state.GetPodEntries(), p.state.GetReservedBandwidth()) 659 if stateErr != nil { 660 general.ErrorS(stateErr, "generateNetworkMachineStateByPodEntries failed", 661 "podNamespace", req.PodNamespace, 662 "podName", req.PodName, 663 "containerName", req.ContainerName, 664 "bandwidthReq(Mbps)", reqInt, 665 "currentResult(Mbps)", allocationInfo.Egress) 666 return nil, fmt.Errorf("generateNetworkMachineStateByPodEntries failed with error: %v", stateErr) 667 } 668 669 // update state cache 670 p.state.SetMachineState(machineState) 671 672 return packAllocationResponse(req, newAllocation, respHint, resourceAllocationAnnotations) 673 } 674 675 // PreStartContainer is called, if indicated by resource plugin during registration phase, 676 // before each container start. Resource plugin can run resource specific operations 677 // such as resetting the resource before making resources available to the container 678 func (p *StaticPolicy) PreStartContainer(context.Context, 679 *pluginapi.PreStartContainerRequest, 680 ) (*pluginapi.PreStartContainerResponse, error) { 681 return &pluginapi.PreStartContainerResponse{}, nil 682 } 683 684 func (p *StaticPolicy) applyNetClass() { 685 if p.metaServer == nil { 686 general.Errorf("nil metaServer") 687 return 688 } 689 690 podList, err := p.metaServer.GetPodList(context.Background(), nil) 691 if err != nil { 692 general.Errorf("get pod list failed, err: %v", err) 693 return 694 } 695 696 for _, pod := range podList { 697 if pod == nil { 698 general.Errorf("get nil pod from metaServer") 699 continue 700 } 701 702 classID, err := p.getNetClassID(pod.GetAnnotations(), p.podLevelNetClassAnnoKey) 703 if err != nil { 704 general.Errorf("get net class id failed, pod: %s, err: %s", native.GenerateUniqObjectNameKey(pod), err) 705 continue 706 } 707 netClsData := &common.NetClsData{ 708 ClassID: classID, 709 Attributes: native.FilterPodAnnotations(p.podLevelNetAttributesAnnoKeys, pod), 710 } 711 712 for _, container := range pod.Spec.Containers { 713 go func(podUID, containerName string, netClsData *common.NetClsData) { 714 containerID, err := p.metaServer.GetContainerID(podUID, containerName) 715 if err != nil { 716 general.Errorf("get container id failed, pod: %s, container: %s(%s), err: %v", 717 podUID, containerName, containerID, err) 718 return 719 } 720 721 if exist, err := common.IsContainerCgroupExist(podUID, containerID); err != nil { 722 general.Errorf("check if container cgroup exists failed, pod: %s, container: %s(%s), err: %v", 723 podUID, containerName, containerID, err) 724 return 725 } else if !exist { 726 general.Infof("container cgroup does not exist, pod: %s, container: %s(%s)", podUID, containerName, containerID) 727 return 728 } 729 730 if p.CgroupV2Env { 731 cgID, err := p.metaServer.ExternalManager.GetCgroupIDForContainer(podUID, containerID) 732 if err != nil { 733 general.Errorf("get cgroup id failed, pod: %s, container: %s(%s), err: %v", 734 podUID, containerName, containerID, err) 735 return 736 } 737 netClsData.CgroupID = cgID 738 } 739 740 if err = p.applyNetClassFunc(podUID, containerID, netClsData); err != nil { 741 general.Errorf("apply net class failed, pod: %s, container: %s(%s), netClsData: %+v, err: %v", 742 podUID, containerName, containerID, *netClsData, err) 743 return 744 } 745 746 general.Infof("apply net class successfully, pod: %s, container: %s(%s), netClsData: %+v", 747 podUID, containerName, containerID, *netClsData) 748 }(string(pod.UID), container.Name, netClsData) 749 } 750 } 751 } 752 753 func (p *StaticPolicy) filterAvailableNICsByBandwidth(nics []machine.InterfaceInfo, req *pluginapi.ResourceRequest, _ *agent.GenericContext) []machine.InterfaceInfo { 754 filteredNICs := make([]machine.InterfaceInfo, 0, len(nics)) 755 756 if req == nil { 757 general.Infof("filterNICsByBandwidth got nil req") 758 return nil 759 } 760 761 reqInt, _, err := util.GetQuantityFromResourceReq(req) 762 if err != nil { 763 general.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) 764 return nil 765 } 766 767 machineState := p.state.GetMachineState() 768 if len(machineState) == 0 || len(nics) == 0 { 769 general.Errorf("filterNICsByBandwidth with 0 NIC") 770 return nil 771 } 772 773 // filter NICs by available bandwidth 774 for _, iface := range nics { 775 if machineState[iface.Iface].EgressState.Free >= uint32(reqInt) && machineState[iface.Iface].IngressState.Free >= uint32(reqInt) { 776 filteredNICs = append(filteredNICs, iface) 777 } 778 } 779 780 // no nic meets the bandwidth request 781 if len(filteredNICs) == 0 { 782 general.InfoS("nic list returned by filtereNICsByBandwidth is empty", 783 "podNamespace", req.PodNamespace, 784 "podName", req.PodName, 785 "containerName", req.ContainerName) 786 } 787 788 return filteredNICs 789 } 790 791 func (p *StaticPolicy) calculateHints(req *pluginapi.ResourceRequest) (map[string]*pluginapi.ListOfTopologyHints, error) { 792 // resp.hints: 1) empty, means no resource (i.e. NIC) meeting requirements found; 2) nil, does not care about the hints 793 // since NIC is a kind of topology-aware resource, it is incorrect to return nil 794 hints := map[string]*pluginapi.ListOfTopologyHints{ 795 p.ResourceName(): { 796 Hints: []*pluginapi.TopologyHint{}, 797 }, 798 } 799 800 // return empty hints immediately if no valid nics on this node 801 if len(p.nics) == 0 { 802 return hints, nil 803 } 804 805 candidateNICs, err := p.selectNICsByReq(req) 806 if err != nil { 807 return hints, fmt.Errorf("failed to select available NICs: %v", err) 808 } 809 810 if len(candidateNICs) == 0 { 811 general.InfoS("candidateNICs is empty", 812 "podNamespace", req.PodNamespace, 813 "podName", req.PodName, 814 "containerName", req.ContainerName) 815 // if the req.NS asks to allocate on the 1st NIC which does not have sufficient bandwidth, candidateNICs is empty. 816 // however, we should not return directly here. To indicate the option of the 2nd NIC if no restricted affinity or ns requested, we return [0,1,2,3] instead. 817 } 818 819 numasToHintMap := make(map[string]*pluginapi.TopologyHint) 820 for _, nic := range candidateNICs { 821 siblingNUMAs, err := machine.GetSiblingNUMAs(nic.NumaNode, p.agentCtx.CPUTopology) 822 if err != nil { 823 return nil, fmt.Errorf("get siblingNUMAs for nic: %s failed with error: %v", nic.Iface, err) 824 } 825 826 nicPreference, err := checkNICPreferenceOfReq(nic, req.Annotations) 827 if err != nil { 828 return nil, fmt.Errorf("checkNICPreferenceOfReq for nic: %s failed with error: %v", nic.Iface, err) 829 } 830 831 siblingNUMAsStr := siblingNUMAs.String() 832 if numasToHintMap[siblingNUMAsStr] == nil { 833 numasToHintMap[siblingNUMAsStr] = &pluginapi.TopologyHint{ 834 Nodes: siblingNUMAs.ToSliceUInt64(), 835 } 836 } 837 838 if nicPreference { 839 general.InfoS("set nic preferred to true", 840 "podNamespace", req.PodNamespace, 841 "podName", req.PodName, 842 "containerName", req.ContainerName, 843 "nic", nic.Iface) 844 numasToHintMap[siblingNUMAsStr].Preferred = nicPreference 845 } 846 } 847 848 for _, hint := range numasToHintMap { 849 hints[p.ResourceName()].Hints = append(hints[p.ResourceName()].Hints, hint) 850 } 851 852 // check if restricted affinity or ns requested 853 if !isReqAffinityRestricted(req.Annotations) && !isReqNamespaceRestricted(req.Annotations) { 854 general.InfoS("add all NUMAs to hint to avoid affinity error", 855 "podNamespace", req.PodNamespace, 856 "podName", req.PodName, 857 "containerName", req.ContainerName, 858 req.Annotations[apiconsts.PodAnnotationNetworkEnhancementAffinityRestricted], 859 apiconsts.PodAnnotationNetworkEnhancementAffinityRestrictedTrue) 860 861 hints[p.ResourceName()].Hints = append(hints[p.ResourceName()].Hints, &pluginapi.TopologyHint{ 862 Nodes: p.agentCtx.CPUDetails.NUMANodes().ToSliceUInt64(), 863 }) 864 } 865 866 return hints, nil 867 } 868 869 /* 870 The NIC selection depends on the following three aspects: available Bandwidth on each NIC, Namespace parameter in request, and req.Hints. 871 1) The availability of sufficient bandwidth on the NIC is a prerequisite for determining whether the card can be selected. 872 If there is insufficient bandwidth on a NIC, it cannot be included in the candidate list. 873 874 2) We may put NICs into separate net namespaces in order to use both NICs simultaneously (Host network mode). 875 If a container wants to request a specific NIC through the namespace parameter, this requirement must also be met. 876 If the specified NIC has insufficient bandwidth, it cannot be included in the candidate list. 877 878 3) The req.Hints parameter represents the affinity of a NIC. For example, a socket container running on a specific socket 879 may use req.Hints to prioritize the selection of a NIC connected to that socket. However, this requirement is only satisfied as much as possible. 880 If the NIC connected to the socket has sufficient bandwidth, only this NIC is returned. Otherwise, other cards with sufficient bandwidth will be returned. 881 */ 882 func (p *StaticPolicy) selectNICsByReq(req *pluginapi.ResourceRequest) ([]machine.InterfaceInfo, error) { 883 nicFilters := []NICFilter{ 884 p.filterAvailableNICsByBandwidth, 885 filterNICsByNamespaceType, 886 filterNICsByHint, 887 } 888 889 if len(p.nics) == 0 { 890 return []machine.InterfaceInfo{}, nil 891 } 892 893 candidateNICs, err := filterAvailableNICsByReq(p.nics, req, p.agentCtx, nicFilters) 894 if err != nil { 895 return nil, fmt.Errorf("filterAvailableNICsByReq failed with error: %v", err) 896 } 897 898 // this node can not meet the combined requests 899 if len(candidateNICs) == 0 { 900 general.InfoS("nic list returned by filterAvailableNICsByReq is empty", 901 "podNamespace", req.PodNamespace, 902 "podName", req.PodName, 903 "containerName", req.ContainerName) 904 } 905 906 return candidateNICs, nil 907 } 908 909 func (p *StaticPolicy) getResourceAllocationAnnotations(podAnnotations map[string]string, allocation *state.AllocationInfo) (map[string]string, error) { 910 netClsID, err := p.getNetClassID(podAnnotations, p.podLevelNetClassAnnoKey) 911 if err != nil { 912 return nil, fmt.Errorf("getNetClassID failed with error: %v", err) 913 } 914 915 selectedNIC := p.getNICByName(allocation.IfName) 916 917 resourceAllocationAnnotations := map[string]string{ 918 p.ipv4ResourceAllocationAnnotationKey: strings.Join(selectedNIC.GetNICIPs(machine.IPVersionV4), IPsSeparator), 919 p.ipv6ResourceAllocationAnnotationKey: strings.Join(selectedNIC.GetNICIPs(machine.IPVersionV6), IPsSeparator), 920 p.netInterfaceNameResourceAllocationAnnotationKey: selectedNIC.Iface, 921 p.netClassIDResourceAllocationAnnotationKey: fmt.Sprintf("%d", netClsID), 922 // TODO: support differentiated Egress/Ingress bandwidth later 923 p.netBandwidthResourceAllocationAnnotationKey: strconv.Itoa(int(allocation.Egress)), 924 } 925 926 if len(selectedNIC.NSAbsolutePath) > 0 { 927 resourceAllocationAnnotations[p.netNSPathResourceAllocationAnnotationKey] = selectedNIC.NSAbsolutePath 928 } 929 930 return resourceAllocationAnnotations, nil 931 } 932 933 func (p *StaticPolicy) removePod(podUID string) error { 934 if p.CgroupV2Env { 935 cgIDList, err := p.metaServer.ExternalManager.ListCgroupIDsForPod(podUID) 936 if err != nil { 937 if general.IsErrNotFound(err) { 938 general.Warningf("cgroup ids for pod not found") 939 return nil 940 } 941 return fmt.Errorf("[NetworkStaticPolicy.removePod] list cgroup ids of pod: %s failed with error: %v", podUID, err) 942 } 943 944 for _, cgID := range cgIDList { 945 go func(cgID uint64) { 946 if err := p.metaServer.ExternalManager.ClearNetClass(cgID); err != nil { 947 general.Errorf("delete net class failed, cgID: %v, err: %v", cgID, err) 948 return 949 } 950 }(cgID) 951 } 952 } 953 954 // update state cache 955 podEntries := p.state.GetPodEntries() 956 delete(podEntries, podUID) 957 958 machineState, err := state.GenerateMachineStateFromPodEntries(p.qrmConfig, p.nics, podEntries, p.state.GetReservedBandwidth()) 959 if err != nil { 960 general.Errorf("pod: %s, GenerateMachineStateFromPodEntries failed with error: %v", podUID, err) 961 return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) 962 } 963 964 p.state.SetPodEntries(podEntries) 965 p.state.SetMachineState(machineState) 966 967 return nil 968 } 969 970 func (p *StaticPolicy) getNetClassID(podAnnotations map[string]string, podLevelNetClassAnnoKey string) (uint32, error) { 971 isPodLevelNetClassExist, classID, err := qos.GetPodNetClassID(podAnnotations, podLevelNetClassAnnoKey) 972 if err != nil { 973 return 0, err 974 } 975 if isPodLevelNetClassExist { 976 return classID, nil 977 } 978 979 qosLevel, err := p.qosConfig.GetQoSLevel(nil, podAnnotations) 980 if err != nil { 981 return 0, err 982 } 983 return p.getNetClassIDByQoSLevel(qosLevel) 984 } 985 986 func (p *StaticPolicy) getNetClassIDByQoSLevel(qosLevel string) (uint32, error) { 987 if netClsID, found := p.qosLevelToNetClassMap[qosLevel]; found { 988 return netClsID, nil 989 } else { 990 return 0, fmt.Errorf("netClsID for qosLevel: %s isn't found", qosLevel) 991 } 992 } 993 994 func (p *StaticPolicy) getNICByName(ifName string) machine.InterfaceInfo { 995 for idx := range p.nics { 996 if p.nics[idx].Iface == ifName { 997 return p.nics[idx] 998 } 999 } 1000 1001 return machine.InterfaceInfo{} 1002 } 1003 1004 // return the Socket id/index that the specified NIC attached to 1005 func (p *StaticPolicy) getSocketIDByNIC(ifName string) (int, error) { 1006 for _, iface := range p.nics { 1007 if iface.Iface == ifName { 1008 socketIDs := p.agentCtx.KatalystMachineInfo.CPUDetails.SocketsInNUMANodes(iface.NumaNode) 1009 if socketIDs.Size() == 0 { 1010 return -1, fmt.Errorf("failed to find the associated socket ID for the specified NIC %s - numanode: %d, cpuDetails: %v", ifName, iface.NumaNode, p.agentCtx.KatalystMachineInfo.CPUDetails) 1011 } 1012 1013 return socketIDs.ToSliceInt()[0], nil 1014 } 1015 } 1016 1017 return -1, fmt.Errorf("invalid NIC name - failed to find a matched NIC") 1018 }