github.com/kubewharf/katalyst-core@v0.5.3/pkg/scheduler/plugins/nodeovercommitment/cache/cache.go

github.com/kubewharf/katalyst-core@v0.5.3/pkg/scheduler/plugins/nodeovercommitment/cache/cache.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package cache
    18  
    19  import (
    20  	"fmt"
    21  	"sync"
    22  
    23  	v1 "k8s.io/api/core/v1"
    24  	"k8s.io/klog/v2"
    25  	"k8s.io/kubernetes/pkg/features"
    26  	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
    27  	"k8s.io/kubernetes/pkg/scheduler/framework"
    28  
    29  	"github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1"
    30  	"github.com/kubewharf/katalyst-api/pkg/consts"
    31  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    32  )
    33  
    34  var cache *overcommitCache
    35  
    36  func init() {
    37  	cache = &overcommitCache{
    38  		nodeCaches: map[string]*NodeCache{},
    39  	}
    40  }
    41  
    42  // cache stored node native topology providers and guaranteed requested resource.
    43  // only used in overcommit scenario when kubelet uses native topology strategy.
    44  type overcommitCache struct {
    45  	sync.RWMutex
    46  	nodeCaches map[string]*NodeCache
    47  }
    48  
    49  func GetCache() *overcommitCache {
    50  	return cache
    51  }
    52  
    53  func (c *overcommitCache) GetNode(name string) (*NodeCache, error) {
    54  	c.RLock()
    55  	defer c.RUnlock()
    56  
    57  	node, ok := c.nodeCaches[name]
    58  	if !ok {
    59  		return nil, fmt.Errorf("node %v not found", name)
    60  	}
    61  
    62  	return node, nil
    63  }
    64  
    65  func (c *overcommitCache) AddPod(pod *v1.Pod) error {
    66  	key, err := framework.GetPodKey(pod)
    67  	if err != nil {
    68  		return err
    69  	}
    70  
    71  	c.Lock()
    72  	defer c.Unlock()
    73  
    74  	n, ok := c.nodeCaches[pod.Spec.NodeName]
    75  	if !ok {
    76  		n = New()
    77  		c.nodeCaches[pod.Spec.NodeName] = n
    78  	}
    79  	n.AddPod(key, pod)
    80  
    81  	return nil
    82  }
    83  
    84  func (c *overcommitCache) RemovePod(pod *v1.Pod) error {
    85  	key, err := framework.GetPodKey(pod)
    86  	if err != nil {
    87  		return err
    88  	}
    89  
    90  	c.Lock()
    91  	defer c.Unlock()
    92  
    93  	n, ok := c.nodeCaches[pod.Spec.NodeName]
    94  	if !ok {
    95  		klog.ErrorS(nil, "Node not found when trying to remove pod", "node", klog.KRef("", pod.Spec.NodeName), "pod", klog.KObj(pod))
    96  	} else {
    97  		n.RemovePod(key, pod)
    98  	}
    99  
   100  	return nil
   101  }
   102  
   103  func (c *overcommitCache) AddOrUpdateCNR(cnr *v1alpha1.CustomNodeResource) {
   104  	c.Lock()
   105  	defer c.Unlock()
   106  
   107  	n, ok := c.nodeCaches[cnr.Name]
   108  	if !ok {
   109  		n = New()
   110  		c.nodeCaches[cnr.Name] = n
   111  	}
   112  
   113  	n.updateTopologyProvider(cnr)
   114  }
   115  
   116  func (c *overcommitCache) RemoveCNR(cnr *v1alpha1.CustomNodeResource) {
   117  	c.Lock()
   118  	defer c.Unlock()
   119  
   120  	delete(c.nodeCaches, cnr.Name)
   121  }
   122  
   123  type NodeCache struct {
   124  	sync.RWMutex
   125  
   126  	PodResources map[string]int
   127  
   128  	// kubelet topology hint providers from CNR annotation.
   129  	// provider will be cached only if provider policy is available.
   130  	// only used for node resource overcommitment.
   131  	HintProviders map[string]struct{}
   132  
   133  	// total guaranteed cpus on node
   134  	GuaranteedCPUs int
   135  }
   136  
   137  func New() *NodeCache {
   138  	return &NodeCache{
   139  		PodResources:  map[string]int{},
   140  		HintProviders: map[string]struct{}{},
   141  	}
   142  }
   143  
   144  func (n *NodeCache) AddPod(key string, pod *v1.Pod) {
   145  	n.RemovePod(key, pod)
   146  	guaranteedCPUs := native.PodGuaranteedCPUs(pod)
   147  
   148  	n.Lock()
   149  	defer n.Unlock()
   150  
   151  	n.PodResources[key] = guaranteedCPUs
   152  	n.GuaranteedCPUs += guaranteedCPUs
   153  }
   154  
   155  func (n *NodeCache) RemovePod(key string, pod *v1.Pod) {
   156  	n.Lock()
   157  	defer n.Unlock()
   158  	podResource, ok := n.PodResources[key]
   159  	if !ok {
   160  		return
   161  	}
   162  
   163  	n.GuaranteedCPUs -= podResource
   164  	delete(n.PodResources, key)
   165  }
   166  
   167  func (n *NodeCache) updateTopologyProvider(cnr *v1alpha1.CustomNodeResource) {
   168  	if len(cnr.Annotations) <= 0 {
   169  		return
   170  	}
   171  
   172  	if CPUManagerPolicy, ok := cnr.Annotations[consts.KCNRAnnotationCPUManager]; ok {
   173  		if CPUManagerPolicy == string(cpumanager.PolicyStatic) {
   174  			n.HintProviders[string(features.CPUManager)] = struct{}{}
   175  		}
   176  	}
   177  
   178  	if memoryManagerPolicy, ok := cnr.Annotations[consts.KCNRAnnotationMemoryManager]; ok {
   179  		if memoryManagerPolicy == "Static" {
   180  			n.HintProviders[string(features.MemoryManager)] = struct{}{}
   181  		}
   182  	}
   183  }
   184  
   185  func (n *NodeCache) HintProvidersAvailable() (CPUManager, MemoryManager bool) {
   186  	n.RLock()
   187  	defer n.RUnlock()
   188  
   189  	_, ok := n.HintProviders[string(features.CPUManager)]
   190  	if ok {
   191  		CPUManager = true
   192  	}
   193  
   194  	_, ok = n.HintProviders[string(features.MemoryManager)]
   195  	if ok {
   196  		MemoryManager = true
   197  	}
   198  
   199  	return
   200  }
   201  
   202  func (n *NodeCache) GetGuaranteedCPUs() int {
   203  	n.RLock()
   204  	defer n.RUnlock()
   205  
   206  	return n.GuaranteedCPUs
   207  }