github.com/kubewharf/katalyst-core@v0.5.3/pkg/scheduler/plugins/qosawarenoderesources/fit.go

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package qosawarenoderesources
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  
    23  	v1 "k8s.io/api/core/v1"
    24  	"k8s.io/apimachinery/pkg/runtime"
    25  	"k8s.io/klog/v2"
    26  	kubeschedulerconfig "k8s.io/kubernetes/pkg/scheduler/apis/config"
    27  	"k8s.io/kubernetes/pkg/scheduler/framework"
    28  	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
    29  	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources"
    30  
    31  	"github.com/kubewharf/katalyst-api/pkg/apis/scheduling/config"
    32  	"github.com/kubewharf/katalyst-api/pkg/apis/scheduling/config/validation"
    33  	"github.com/kubewharf/katalyst-api/pkg/consts"
    34  	"github.com/kubewharf/katalyst-core/pkg/scheduler/cache"
    35  	"github.com/kubewharf/katalyst-core/pkg/scheduler/eventhandlers"
    36  	"github.com/kubewharf/katalyst-core/pkg/scheduler/util"
    37  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    38  )
    39  
    40  var (
    41  	_ framework.PreFilterPlugin   = &Fit{}
    42  	_ framework.FilterPlugin      = &Fit{}
    43  	_ framework.EnqueueExtensions = &Fit{}
    44  	_ framework.ScorePlugin       = &Fit{}
    45  	_ framework.ReservePlugin     = &Fit{}
    46  )
    47  
    48  const (
    49  	// FitName is the name of the plugin used in the plugin registry and configurations.
    50  	FitName = "QoSAwareNodeResourcesFit"
    51  
    52  	// preFilterStateKey is the key in CycleState to QoSAwareNodeResourcesFit pre-computed data.
    53  	// Using the name of the plugin will likely help us avoid collisions with other plugins.
    54  	preFilterStateKey = "PreFilter" + FitName
    55  )
    56  
    57  // nodeResourceStrategyTypeMap maps strategy to scorer implementation
    58  var nodeResourceStrategyTypeMap = map[kubeschedulerconfig.ScoringStrategyType]scorer{
    59  	kubeschedulerconfig.LeastAllocated: func(args *config.QoSAwareNodeResourcesFitArgs) *resourceAllocationScorer {
    60  		resToWeightMap := resourcesToWeightMap(args.ScoringStrategy.ReclaimedResources)
    61  		return &resourceAllocationScorer{
    62  			Name:                string(kubeschedulerconfig.LeastAllocated),
    63  			scorer:              leastResourceScorer(resToWeightMap),
    64  			resourceToWeightMap: resToWeightMap,
    65  		}
    66  	},
    67  	kubeschedulerconfig.MostAllocated: func(args *config.QoSAwareNodeResourcesFitArgs) *resourceAllocationScorer {
    68  		resToWeightMap := resourcesToWeightMap(args.ScoringStrategy.ReclaimedResources)
    69  		return &resourceAllocationScorer{
    70  			Name:                string(kubeschedulerconfig.MostAllocated),
    71  			scorer:              mostResourceScorer(resToWeightMap),
    72  			resourceToWeightMap: resToWeightMap,
    73  		}
    74  	},
    75  	kubeschedulerconfig.RequestedToCapacityRatio: func(args *config.QoSAwareNodeResourcesFitArgs) *resourceAllocationScorer {
    76  		resToWeightMap := resourcesToWeightMap(args.ScoringStrategy.ReclaimedResources)
    77  		return &resourceAllocationScorer{
    78  			Name:                string(kubeschedulerconfig.RequestedToCapacityRatio),
    79  			scorer:              requestedToCapacityRatioScorer(resToWeightMap, args.ScoringStrategy.ReclaimedRequestedToCapacityRatio.Shape),
    80  			resourceToWeightMap: resToWeightMap,
    81  		}
    82  	},
    83  }
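
        // Illustrative lookup sketch (not executed by the plugin): NewFit below resolves
        // the configured strategy through this map and embeds the resulting scorer into
        // the returned Fit plugin; the args value stands for an already-validated
        // *config.QoSAwareNodeResourcesFitArgs.
        //
        //	factory, exists := nodeResourceStrategyTypeMap[args.ScoringStrategy.Type]
        //	if !exists {
        //		// unsupported strategy; NewFit returns an error in this case
        //	}
        //	allocationScorer := factory(args) // scorer bound to the configured reclaimed-resource weights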
    84  
    85  // Fit is a plugin that checks if a node has sufficient resources.
    86  type Fit struct {
    87  	handle framework.Handle
    88  	resourceAllocationScorer
    89  	nativeFit *noderesources.Fit
    90  }
    91  
    92  // ScoreExtensions of the Score plugin.
    93  func (f *Fit) ScoreExtensions() framework.ScoreExtensions {
    94  	return nil
    95  }
    96  
    97  // preFilterState computed at PreFilter and used at Filter.
    98  type preFilterState struct {
    99  	native.QoSResource
   100  }
   101  
   102  // Clone the prefilter state.
   103  func (s *preFilterState) Clone() framework.StateData {
   104  	return s
   105  }
   106  
   107  // Name returns name of the plugin. It is used in logs, etc.
   108  func (f *Fit) Name() string {
   109  	return FitName
   110  }
   111  
   112  // NewFit initializes a new plugin and returns it.
   113  func NewFit(plArgs runtime.Object, h framework.Handle) (framework.Plugin, error) {
   114  	args, ok := plArgs.(*config.QoSAwareNodeResourcesFitArgs)
   115  	if !ok {
   116  		return nil, fmt.Errorf("want args to be of type QoSAwareNodeResourcesFitArgs, got %T", plArgs)
   117  	}
   118  	if err := validation.ValidateQoSAwareNodeResourcesFitArgs(nil, args); err != nil {
   119  		return nil, err
   120  	}
   121  
   122  	if args.ScoringStrategy == nil {
   123  		return nil, fmt.Errorf("scoring strategy not specified")
   124  	}
   125  
   126  	strategy := args.ScoringStrategy.Type
   127  	scorePlugin, exists := nodeResourceStrategyTypeMap[strategy]
   128  	if !exists {
   129  		return nil, fmt.Errorf("scoring strategy %s is not supported", strategy)
   130  	}
   131  
   132  	nativeFit, err := newNativeFit(args, h)
   133  	if err != nil {
   134  		return nil, err
   135  	}
   136  
   137  	eventhandlers.RegisterCommonPodHandler()
   138  	eventhandlers.RegisterCommonCNRHandler()
   139  
   140  	return &Fit{
   141  		handle:                   h,
   142  		resourceAllocationScorer: *scorePlugin(args),
   143  		nativeFit:                nativeFit,
   144  	}, nil
   145  }
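
        // Registration sketch (illustrative only; the actual wiring inside katalyst-core
        // may differ): an out-of-tree plugin like this one is conventionally added to a
        // scheduler binary through the kube-scheduler app options, for example:
        //
        //	import schedapp "k8s.io/kubernetes/cmd/kube-scheduler/app"
        //
        //	cmd := schedapp.NewSchedulerCommand(
        //		schedapp.WithPlugin(FitName, NewFit),
        //	)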
   146  
   147  func newNativeFit(args *config.QoSAwareNodeResourcesFitArgs, h framework.Handle) (*noderesources.Fit, error) {
   148  	scoringStrategy := &kubeschedulerconfig.ScoringStrategy{
   149  		Type:                     args.ScoringStrategy.Type,
   150  		Resources:                args.ScoringStrategy.Resources,
   151  		RequestedToCapacityRatio: args.ScoringStrategy.RequestedToCapacityRatio,
   152  	}
   153  
   154  	nativeFitPlugin, err := noderesources.NewFit(
   155  		&kubeschedulerconfig.NodeResourcesFitArgs{
   156  			ScoringStrategy: scoringStrategy,
   157  		}, h, feature.Features{},
   158  	)
   159  	if err != nil {
   160  		return nil, err
   161  	}
   162  
   163  	nativeFit, ok := nativeFitPlugin.(*noderesources.Fit)
   164  	if !ok {
   165  		return nil, fmt.Errorf("failed to assert plugin type to *noderesources.Fit, got %T", nativeFitPlugin)
   166  	}
   167  
   168  	return nativeFit, nil
   169  }
   170  
   171  // PreFilter invoked at the prefilter extension point.
   172  func (f *Fit) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
   173  	if !util.IsReclaimedPod(pod) {
   174  		return nil, nil
   175  	}
   176  	cycleState.Write(preFilterStateKey, computePodQoSResourceRequest(pod))
   177  	return nil, nil
   178  }
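
        // CycleState flow sketch (illustrative): for a reclaimed pod, PreFilter stores the
        // aggregated QoS resource request under preFilterStateKey, and Filter later reads
        // it back through getPreFilterState before calling fitsRequest.
        //
        //	state := computePodQoSResourceRequest(pod) // ReclaimedMilliCPU / ReclaimedMemory totals
        //	cycleState.Write(preFilterStateKey, state)
        //	// ... later, in Filter:
        //	s, err := getPreFilterState(cycleState)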
   179  
   180  // PreFilterExtensions returns prefilter extensions (pod add and remove callbacks); this plugin needs none, so it returns nil.
   181  func (f *Fit) PreFilterExtensions() framework.PreFilterExtensions {
   182  	return nil
   183  }
   184  
   185  // computePodQoSResourceRequest returns a preFilterState that covers the largest
   186  // width in each QoS resource dimension. Because init-containers run sequentially, we collect
   187  // the max in each dimension iteratively. In contrast, we sum the resource vectors for
   188  // regular containers since they run simultaneously.
   189  //
   190  // The resources defined for Overhead are added to the calculated QoSResource request sum.
   191  //
   192  // example:
   193  /*
   194  // Pod:
   195  //   InitContainers
   196  //     IC1:
   197  //       CPU: 2
   198  //       Memory: 1G
   199  //     IC2:
   200  //       CPU: 2
   201  //       Memory: 3G
   202  //   Containers
   203  //     C1:
   204  //       CPU: 2
   205  //       Memory: 1G
   206  //     C2:
   207  //       CPU: 1
   208  //       Memory: 1G
   209  //
   210  // Result: CPU: 3, Memory: 3G
   211  */
   212  func computePodQoSResourceRequest(pod *v1.Pod) *preFilterState {
   213  	result := &preFilterState{}
   214  	for _, container := range pod.Spec.Containers {
   215  		result.Add(container.Resources.Requests)
   216  	}
   217  
   218  	// take max_resource(sum_pod, any_init_container)
   219  	for _, container := range pod.Spec.InitContainers {
   220  		result.SetMaxResource(container.Resources.Requests)
   221  	}
   222  
   223  	// If Overhead is being utilized, add to the total requests for the pod
   224  	if pod.Spec.Overhead != nil {
   225  		result.Add(pod.Spec.Overhead)
   226  	}
   227  	return result
   228  }
   229  
   230  func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
   231  	c, err := cycleState.Read(preFilterStateKey)
   232  	if err != nil {
   233  		// preFilterState doesn't exist, likely PreFilter wasn't invoked.
   234  		return nil, fmt.Errorf("error reading %q from cycleState: %w", preFilterStateKey, err)
   235  	}
   236  
   237  	s, ok := c.(*preFilterState)
   238  	if !ok {
   239  		return nil, fmt.Errorf("%+v cannot be converted to QoSAwareNodeResourcesFit.preFilterState", c)
   240  	}
   241  	return s, nil
   242  }
   243  
   244  // EventsToRegister returns the possible events that may make a Pod
   245  // failed by this plugin schedulable.
   246  // NOTE: if in-place-update (KEP 1287) gets implemented, then PodUpdate event
   247  // should be registered for this plugin since a Pod update may free up resources
   248  // that make other Pods schedulable.
   249  func (f *Fit) EventsToRegister() []framework.ClusterEvent {
   250  	return []framework.ClusterEvent{
   251  		{Resource: framework.Pod, ActionType: framework.Delete},
   252  		{Resource: framework.Node, ActionType: framework.Add},
   253  	}
   254  }
   255  
   256  // Filter invoked at the filter extension point.
   257  // Checks if a node has sufficient reclaimed resources (e.g. reclaimed millicpu and reclaimed memory) to run a reclaimed pod.
   258  // If any resource is insufficient, an Unschedulable status carrying all of the failure reasons is returned.
   259  func (f *Fit) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
   260  	if !util.IsReclaimedPod(pod) {
   261  		return nil
   262  	}
   263  
   264  	s, err := getPreFilterState(cycleState)
   265  	if err != nil {
   266  		return framework.AsStatus(err)
   267  	}
   268  
   269  	insufficientResources := fitsRequest(s, nodeInfo)
   270  
   271  	if len(insufficientResources) != 0 {
   272  		// We will keep all failure reasons.
   273  		failureReasons := make([]string, 0, len(insufficientResources))
   274  		for i := range insufficientResources {
   275  			failureReasons = append(failureReasons, insufficientResources[i].Reason)
   276  		}
   277  		return framework.NewStatus(framework.Unschedulable, failureReasons...)
   278  	}
   279  
   280  	return nil
   281  }
   282  
   283  // InsufficientResource describes what kind of resource limit is hit and caused the pod to not fit the node.
   284  type InsufficientResource struct {
   285  	ResourceName v1.ResourceName
   286  	// We explicitly have a parameter for reason to avoid formatting a message on the fly
   287  	// for common resources, which is expensive for cluster autoscaler simulations.
   288  	Reason    string
   289  	Requested int64
   290  	Used      int64
   291  	Capacity  int64
   292  }
   293  
   294  func fitsRequest(podRequest *preFilterState, nodeInfo *framework.NodeInfo) []InsufficientResource {
   295  	insufficientResources := make([]InsufficientResource, 0, 2)
   296  
   297  	if podRequest.ReclaimedMilliCPU == 0 &&
   298  		podRequest.ReclaimedMemory == 0 {
   299  		return insufficientResources
   300  	}
   301  
   302  	extendedNodeInfo, err := cache.GetCache().GetNodeInfo(nodeInfo.Node().GetName())
   303  	if err != nil {
   304  		insufficientResources = append(insufficientResources,
   305  			InsufficientResource{
   306  				Reason: err.Error(),
   307  			},
   308  		)
   309  		return insufficientResources
   310  	}
   311  
   312  	extendedNodeInfo.Mutex.RLock()
   313  	defer extendedNodeInfo.Mutex.RUnlock()
   314  
   315  	if podRequest.ReclaimedMilliCPU > (extendedNodeInfo.QoSResourcesAllocatable.ReclaimedMilliCPU - extendedNodeInfo.QoSResourcesRequested.ReclaimedMilliCPU) {
   316  		insufficientResources = append(insufficientResources, InsufficientResource{
   317  			ResourceName: consts.ReclaimedResourceMilliCPU,
   318  			Reason:       fmt.Sprintf("Insufficient %s", consts.ReclaimedResourceMilliCPU),
   319  			Requested:    podRequest.ReclaimedMilliCPU,
   320  			Used:         extendedNodeInfo.QoSResourcesRequested.ReclaimedMilliCPU,
   321  			Capacity:     extendedNodeInfo.QoSResourcesAllocatable.ReclaimedMilliCPU,
   322  		})
   323  	}
   324  	if podRequest.ReclaimedMemory > (extendedNodeInfo.QoSResourcesAllocatable.ReclaimedMemory - extendedNodeInfo.QoSResourcesRequested.ReclaimedMemory) {
   325  		insufficientResources = append(insufficientResources, InsufficientResource{
   326  			ResourceName: consts.ReclaimedResourceMemory,
   327  			Reason:       fmt.Sprintf("Insufficient %s", consts.ReclaimedResourceMemory),
   328  			Requested:    podRequest.ReclaimedMemory,
   329  			Used:         extendedNodeInfo.QoSResourcesRequested.ReclaimedMemory,
   330  			Capacity:     extendedNodeInfo.QoSResourcesAllocatable.ReclaimedMemory,
   331  		})
   332  	}
   333  
   334  	return insufficientResources
   335  }
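
        // Worked example with illustrative numbers: if a node's reclaimed-CPU allocatable
        // is 4000 millicores and 3600 millicores are already requested, a pod asking for
        // 600 reclaimed millicores does not fit (600 > 4000-3600), so an
        // InsufficientResource entry for consts.ReclaimedResourceMilliCPU is appended;
        // the same comparison is performed for reclaimed memory.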
   336  
   337  // Score invoked at the Score extension point.
   338  func (f *Fit) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
   339  	if util.IsReclaimedPod(pod) {
   340  		extendedNodeInfo, err := cache.GetCache().GetNodeInfo(nodeName)
   341  		if err != nil {
   342  			return 0, framework.AsStatus(fmt.Errorf("getting node %q error: %w", nodeName, err))
   343  		}
   344  
   345  		return f.score(pod, extendedNodeInfo, nodeName)
   346  	}
   347  
   348  	return f.nativeFit.Score(ctx, state, pod, nodeName)
   349  }
   350  
   351  // Reserve is the function invoked by the framework at the "Reserve" extension point.
   352  func (f *Fit) Reserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
   353  	if !util.IsReclaimedPod(pod) || nodeName == "" || native.PodIsTerminated(pod) {
   354  		return nil
   355  	}
   356  
   357  	newPod := pod.DeepCopy()
   358  	newPod.Spec.NodeName = nodeName
   359  
   360  	if err := cache.GetCache().AddPod(newPod); err != nil {
   361  		return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("extended cache reserve failed, err: %s", err.Error()))
   362  	}
   363  
   364  	return nil
   365  }
   366  
   367  // Unreserve is the function invoked by the framework at the "Unreserve" extension point.
   368  func (f *Fit) Unreserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) {
   369  	if !util.IsReclaimedPod(pod) || nodeName == "" {
   370  		return
   371  	}
   372  
   373  	newPod := pod.DeepCopy()
   374  	newPod.Spec.NodeName = nodeName
   375  
   376  	if err := cache.GetCache().RemovePod(newPod); err != nil {
   377  		klog.ErrorS(err, "Unreserve failed to RemovePod",
   378  			"pod", klog.KObj(pod), "node", nodeName)
   379  	}
   380  }