github.com/kubewharf/katalyst-core@v0.5.3/pkg/scheduler/plugins/qosawarenoderesources/fit.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package qosawarenoderesources 18 19 import ( 20 "context" 21 "fmt" 22 23 v1 "k8s.io/api/core/v1" 24 "k8s.io/apimachinery/pkg/runtime" 25 "k8s.io/klog/v2" 26 kubeschedulerconfig "k8s.io/kubernetes/pkg/scheduler/apis/config" 27 "k8s.io/kubernetes/pkg/scheduler/framework" 28 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature" 29 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources" 30 31 "github.com/kubewharf/katalyst-api/pkg/apis/scheduling/config" 32 "github.com/kubewharf/katalyst-api/pkg/apis/scheduling/config/validation" 33 "github.com/kubewharf/katalyst-api/pkg/consts" 34 "github.com/kubewharf/katalyst-core/pkg/scheduler/cache" 35 "github.com/kubewharf/katalyst-core/pkg/scheduler/eventhandlers" 36 "github.com/kubewharf/katalyst-core/pkg/scheduler/util" 37 "github.com/kubewharf/katalyst-core/pkg/util/native" 38 ) 39 40 var ( 41 _ framework.PreFilterPlugin = &Fit{} 42 _ framework.FilterPlugin = &Fit{} 43 _ framework.EnqueueExtensions = &Fit{} 44 _ framework.ScorePlugin = &Fit{} 45 _ framework.ReservePlugin = &Fit{} 46 ) 47 48 const ( 49 // FitName is the name of the plugin used in the plugin registry and configurations. 50 FitName = "QoSAwareNodeResourcesFit" 51 52 // preFilterStateKey is the key in CycleState to NodeResourcesFit pre-computed data. 53 // Using the name of the plugin will likely help us avoid collisions with other plugins. 54 preFilterStateKey = "PreFilter" + FitName 55 ) 56 57 // nodeResourceStrategyTypeMap maps strategy to scorer implementation 58 var nodeResourceStrategyTypeMap = map[kubeschedulerconfig.ScoringStrategyType]scorer{ 59 kubeschedulerconfig.LeastAllocated: func(args *config.QoSAwareNodeResourcesFitArgs) *resourceAllocationScorer { 60 resToWeightMap := resourcesToWeightMap(args.ScoringStrategy.ReclaimedResources) 61 return &resourceAllocationScorer{ 62 Name: string(kubeschedulerconfig.LeastAllocated), 63 scorer: leastResourceScorer(resToWeightMap), 64 resourceToWeightMap: resToWeightMap, 65 } 66 }, 67 kubeschedulerconfig.MostAllocated: func(args *config.QoSAwareNodeResourcesFitArgs) *resourceAllocationScorer { 68 resToWeightMap := resourcesToWeightMap(args.ScoringStrategy.ReclaimedResources) 69 return &resourceAllocationScorer{ 70 Name: string(kubeschedulerconfig.MostAllocated), 71 scorer: mostResourceScorer(resToWeightMap), 72 resourceToWeightMap: resToWeightMap, 73 } 74 }, 75 kubeschedulerconfig.RequestedToCapacityRatio: func(args *config.QoSAwareNodeResourcesFitArgs) *resourceAllocationScorer { 76 resToWeightMap := resourcesToWeightMap(args.ScoringStrategy.ReclaimedResources) 77 return &resourceAllocationScorer{ 78 Name: string(kubeschedulerconfig.RequestedToCapacityRatio), 79 scorer: requestedToCapacityRatioScorer(resToWeightMap, args.ScoringStrategy.ReclaimedRequestedToCapacityRatio.Shape), 80 resourceToWeightMap: resToWeightMap, 81 } 82 }, 83 } 84 85 // Fit is a plugin that checks if a node has sufficient resources. 86 type Fit struct { 87 handle framework.Handle 88 resourceAllocationScorer 89 nativeFit *noderesources.Fit 90 } 91 92 // ScoreExtensions of the Score plugin. 93 func (f *Fit) ScoreExtensions() framework.ScoreExtensions { 94 return nil 95 } 96 97 // preFilterState computed at PreFilter and used at Filter. 98 type preFilterState struct { 99 native.QoSResource 100 } 101 102 // Clone the prefilter state. 103 func (s *preFilterState) Clone() framework.StateData { 104 return s 105 } 106 107 // Name returns name of the plugin. It is used in logs, etc. 108 func (f *Fit) Name() string { 109 return FitName 110 } 111 112 // NewFit initializes a new plugin and returns it. 113 func NewFit(plArgs runtime.Object, h framework.Handle) (framework.Plugin, error) { 114 args, ok := plArgs.(*config.QoSAwareNodeResourcesFitArgs) 115 if !ok { 116 return nil, fmt.Errorf("want args to be of type NodeQoSResourcesFitArgs, got %T", plArgs) 117 } 118 if err := validation.ValidateQoSAwareNodeResourcesFitArgs(nil, args); err != nil { 119 return nil, err 120 } 121 122 if args.ScoringStrategy == nil { 123 return nil, fmt.Errorf("scoring strategy not specified") 124 } 125 126 strategy := args.ScoringStrategy.Type 127 scorePlugin, exists := nodeResourceStrategyTypeMap[strategy] 128 if !exists { 129 return nil, fmt.Errorf("scoring strategy %s is not supported", strategy) 130 } 131 132 nativeFit, err := newNativeFit(args, h) 133 if err != nil { 134 return nil, err 135 } 136 137 eventhandlers.RegisterCommonPodHandler() 138 eventhandlers.RegisterCommonCNRHandler() 139 140 return &Fit{ 141 handle: h, 142 resourceAllocationScorer: *scorePlugin(args), 143 nativeFit: nativeFit, 144 }, nil 145 } 146 147 func newNativeFit(args *config.QoSAwareNodeResourcesFitArgs, h framework.Handle) (*noderesources.Fit, error) { 148 scoringStrategy := &kubeschedulerconfig.ScoringStrategy{ 149 Type: args.ScoringStrategy.Type, 150 Resources: args.ScoringStrategy.Resources, 151 RequestedToCapacityRatio: args.ScoringStrategy.RequestedToCapacityRatio, 152 } 153 154 nativeFitPlugin, err := noderesources.NewFit( 155 &kubeschedulerconfig.NodeResourcesFitArgs{ 156 ScoringStrategy: scoringStrategy, 157 }, h, feature.Features{}, 158 ) 159 if err != nil { 160 return nil, err 161 } 162 163 nativeFit, ok := nativeFitPlugin.(*noderesources.Fit) 164 if !ok { 165 return nil, fmt.Errorf("assert nativeFit type error, got %T", nativeFitPlugin) 166 } 167 168 return nativeFit, nil 169 } 170 171 // PreFilter invoked at the prefilter extension point. 172 func (f *Fit) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) { 173 if !util.IsReclaimedPod(pod) { 174 return nil, nil 175 } 176 cycleState.Write(preFilterStateKey, computePodQoSResourceRequest(pod)) 177 return nil, nil 178 } 179 180 // PreFilterExtensions returns prefilter extensions, pod add and remove. 181 func (f *Fit) PreFilterExtensions() framework.PreFilterExtensions { 182 return nil 183 } 184 185 // computePodQoSResourceRequest returns a framework.Resource that covers the largest 186 // width in each resource dimension. Because init-containers run sequentially, we collect 187 // the max in each dimension iteratively. In contrast, we sum the resource vectors for 188 // regular containers since they run simultaneously. 189 // 190 // the resources defined for Overhead should be added to the calculated QoSResource request sum 191 // 192 // example: 193 /* 194 // Pod: 195 // InitContainers 196 // IC1: 197 // CPU: 2 198 // Memory: 1G 199 // IC2: 200 // CPU: 2 201 // Memory: 3G 202 // Containers 203 // C1: 204 // CPU: 2 205 // Memory: 1G 206 // C2: 207 // CPU: 1 208 // Memory: 1G 209 // 210 // Result: CPU: 3, Memory: 3G 211 */ 212 func computePodQoSResourceRequest(pod *v1.Pod) *preFilterState { 213 result := &preFilterState{} 214 for _, container := range pod.Spec.Containers { 215 result.Add(container.Resources.Requests) 216 } 217 218 // take max_resource(sum_pod, any_init_container) 219 for _, container := range pod.Spec.InitContainers { 220 result.SetMaxResource(container.Resources.Requests) 221 } 222 223 // If Overhead is being utilized, add to the total requests for the pod 224 if pod.Spec.Overhead != nil { 225 result.Add(pod.Spec.Overhead) 226 } 227 return result 228 } 229 230 func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) { 231 c, err := cycleState.Read(preFilterStateKey) 232 if err != nil { 233 // preFilterState doesn't exist, likely PreFilter wasn't invoked. 234 return nil, fmt.Errorf("error reading %q from cycleState: %w", preFilterStateKey, err) 235 } 236 237 s, ok := c.(*preFilterState) 238 if !ok { 239 return nil, fmt.Errorf("%+v convert to NodeQoSResourcesFit.preFilterState error", c) 240 } 241 return s, nil 242 } 243 244 // EventsToRegister returns the possible events that may make a Pod 245 // failed by this plugin schedulable. 246 // NOTE: if in-place-update (KEP 1287) gets implemented, then PodUpdate event 247 // should be registered for this plugin since a Pod update may free up resources 248 // that make other Pods schedulable. 249 func (f *Fit) EventsToRegister() []framework.ClusterEvent { 250 return []framework.ClusterEvent{ 251 {Resource: framework.Pod, ActionType: framework.Delete}, 252 {Resource: framework.Node, ActionType: framework.Add}, 253 } 254 } 255 256 // Filter invoked at the filter extension point. 257 // Checks if a node has sufficient resources, such as cpu, memory, gpu, opaque int resources etc to run a pod. 258 // It returns a list of insufficient resources, if empty, then the node has all the resources requested by the pod. 259 func (f *Fit) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status { 260 if !util.IsReclaimedPod(pod) { 261 return nil 262 } 263 264 s, err := getPreFilterState(cycleState) 265 if err != nil { 266 return framework.AsStatus(err) 267 } 268 269 insufficientResources := fitsRequest(s, nodeInfo) 270 271 if len(insufficientResources) != 0 { 272 // We will keep all failure reasons. 273 failureReasons := make([]string, 0, len(insufficientResources)) 274 for i := range insufficientResources { 275 failureReasons = append(failureReasons, insufficientResources[i].Reason) 276 } 277 return framework.NewStatus(framework.Unschedulable, failureReasons...) 278 } 279 280 return nil 281 } 282 283 // InsufficientResource describes what kind of resource limit is hit and caused the pod to not fit the node. 284 type InsufficientResource struct { 285 ResourceName v1.ResourceName 286 // We explicitly have a parameter for reason to avoid formatting a message on the fly 287 // for common resources, which is expensive for cluster autoscaler simulations. 288 Reason string 289 Requested int64 290 Used int64 291 Capacity int64 292 } 293 294 func fitsRequest(podRequest *preFilterState, nodeInfo *framework.NodeInfo) []InsufficientResource { 295 insufficientResources := make([]InsufficientResource, 0, 2) 296 297 if podRequest.ReclaimedMilliCPU == 0 && 298 podRequest.ReclaimedMemory == 0 { 299 return insufficientResources 300 } 301 302 extendedNodeInfo, err := cache.GetCache().GetNodeInfo(nodeInfo.Node().GetName()) 303 if err != nil { 304 insufficientResources = append(insufficientResources, 305 InsufficientResource{ 306 Reason: err.Error(), 307 }, 308 ) 309 return insufficientResources 310 } 311 312 extendedNodeInfo.Mutex.RLock() 313 defer extendedNodeInfo.Mutex.RUnlock() 314 315 if podRequest.ReclaimedMilliCPU > (extendedNodeInfo.QoSResourcesAllocatable.ReclaimedMilliCPU - extendedNodeInfo.QoSResourcesRequested.ReclaimedMilliCPU) { 316 insufficientResources = append(insufficientResources, InsufficientResource{ 317 ResourceName: consts.ReclaimedResourceMilliCPU, 318 Reason: fmt.Sprintf("Insufficient %s", consts.ReclaimedResourceMilliCPU), 319 Requested: podRequest.ReclaimedMilliCPU, 320 Used: extendedNodeInfo.QoSResourcesRequested.ReclaimedMilliCPU, 321 Capacity: extendedNodeInfo.QoSResourcesAllocatable.ReclaimedMilliCPU, 322 }) 323 } 324 if podRequest.ReclaimedMemory > (extendedNodeInfo.QoSResourcesAllocatable.ReclaimedMemory - extendedNodeInfo.QoSResourcesRequested.ReclaimedMemory) { 325 insufficientResources = append(insufficientResources, InsufficientResource{ 326 ResourceName: consts.ReclaimedResourceMemory, 327 Reason: fmt.Sprintf("Insufficient %s", consts.ReclaimedResourceMemory), 328 Requested: podRequest.ReclaimedMemory, 329 Used: extendedNodeInfo.QoSResourcesRequested.ReclaimedMemory, 330 Capacity: extendedNodeInfo.QoSResourcesAllocatable.ReclaimedMemory, 331 }) 332 } 333 334 return insufficientResources 335 } 336 337 // Score invoked at the Score extension point. 338 func (f *Fit) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) { 339 if util.IsReclaimedPod(pod) { 340 extendedNodeInfo, err := cache.GetCache().GetNodeInfo(nodeName) 341 if err != nil { 342 return 0, framework.AsStatus(fmt.Errorf("getting node %q error: %w", nodeName, err)) 343 } 344 345 return f.score(pod, extendedNodeInfo, nodeName) 346 } 347 348 return f.nativeFit.Score(ctx, state, pod, nodeName) 349 } 350 351 // Reserve is the functions invoked by the framework at "Reserve" extension point. 352 func (f *Fit) Reserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status { 353 if !util.IsReclaimedPod(pod) || nodeName == "" || native.PodIsTerminated(pod) { 354 return nil 355 } 356 357 newPod := pod.DeepCopy() 358 newPod.Spec.NodeName = nodeName 359 360 if err := cache.GetCache().AddPod(newPod); err != nil { 361 return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("extended cache reserve failed, err: %s", err.Error())) 362 } 363 364 return nil 365 } 366 367 // Unreserve is the functions invoked by the framework at "Unreserve" extension point. 368 func (f *Fit) Unreserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) { 369 if !util.IsReclaimedPod(pod) || nodeName == "" { 370 return 371 } 372 373 newPod := pod.DeepCopy() 374 newPod.Spec.NodeName = nodeName 375 376 if err := cache.GetCache().RemovePod(newPod); err != nil { 377 klog.ErrorS(err, "Unreserve failed to RemovePod", 378 "pod", klog.KObj(pod), "node", nodeName) 379 } 380 }