k8s.io/kubernetes@v1.29.3/pkg/scheduler/internal/cache/cache.go

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package cache
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	v1 "k8s.io/api/core/v1"
    26  	"k8s.io/apimachinery/pkg/util/sets"
    27  	"k8s.io/apimachinery/pkg/util/wait"
    28  	"k8s.io/klog/v2"
    29  	"k8s.io/kubernetes/pkg/scheduler/framework"
    30  	"k8s.io/kubernetes/pkg/scheduler/metrics"
    31  )
    32  
    33  var (
    34  	cleanAssumedPeriod = 1 * time.Second
    35  )
    36  
    37  // New returns a Cache implementation.
    38  // It automatically starts a goroutine that manages the expiration of assumed pods.
    39  // "ttl" is how long an assumed pod may stay in the cache before it expires.
    40  // "ctx" is the context whose cancellation stops the background goroutine.
    41  func New(ctx context.Context, ttl time.Duration) Cache {
    42  	logger := klog.FromContext(ctx)
    43  	cache := newCache(ctx, ttl, cleanAssumedPeriod)
    44  	cache.run(logger)
    45  	return cache
    46  }
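
// The sketch below is illustrative and not part of the original file: it shows
// how a caller might construct the cache and keep the cancel function that stops
// the assumed-pod expiration goroutine started by New. The 30-second TTL and the
// name exampleNewUsage are arbitrary, assumed values.
func exampleNewUsage() (Cache, context.CancelFunc) {
	// Cancelling ctx closes the stop channel, which terminates the cleanup goroutine.
	ctx, cancel := context.WithCancel(context.Background())
	return New(ctx, 30*time.Second), cancel
}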
    47  
    48  // nodeInfoListItem holds a NodeInfo pointer and acts as an item in a doubly
    49  // linked list. When a NodeInfo is updated, it goes to the head of the list.
    50  // The items closer to the head are the most recently updated items.
    51  type nodeInfoListItem struct {
    52  	info *framework.NodeInfo
    53  	next *nodeInfoListItem
    54  	prev *nodeInfoListItem
    55  }
    56  
    57  type cacheImpl struct {
    58  	stop   <-chan struct{}
    59  	ttl    time.Duration
    60  	period time.Duration
    61  
    62  	// This mutex guards all fields within this cache struct.
    63  	mu sync.RWMutex
    64  	// a set of assumed pod keys.
    65  	// The key could further be used to get an entry in podStates.
    66  	assumedPods sets.Set[string]
    67  	// a map from pod key to podState.
    68  	podStates map[string]*podState
    69  	nodes     map[string]*nodeInfoListItem
    70  	// headNode points to the most recently updated NodeInfo in "nodes". It is the
    71  	// head of the linked list.
    72  	headNode *nodeInfoListItem
    73  	nodeTree *nodeTree
    74  	// A map from image name to its ImageStateSummary.
    75  	imageStates map[string]*framework.ImageStateSummary
    76  }
    77  
    78  type podState struct {
    79  	pod *v1.Pod
    80  	// Used to determine when an assumed pod expires.
    81  	// If deadline is nil, assumedPod will never expire.
    82  	deadline *time.Time
    83  	// Used to block the cache from expiring the assumed pod while binding is still in progress.
    84  	bindingFinished bool
    85  }
    86  
    87  func newCache(ctx context.Context, ttl, period time.Duration) *cacheImpl {
    88  	logger := klog.FromContext(ctx)
    89  	return &cacheImpl{
    90  		ttl:    ttl,
    91  		period: period,
    92  		stop:   ctx.Done(),
    93  
    94  		nodes:       make(map[string]*nodeInfoListItem),
    95  		nodeTree:    newNodeTree(logger, nil),
    96  		assumedPods: sets.New[string](),
    97  		podStates:   make(map[string]*podState),
    98  		imageStates: make(map[string]*framework.ImageStateSummary),
    99  	}
   100  }
   101  
   102  // newNodeInfoListItem initializes a new nodeInfoListItem.
   103  func newNodeInfoListItem(ni *framework.NodeInfo) *nodeInfoListItem {
   104  	return &nodeInfoListItem{
   105  		info: ni,
   106  	}
   107  }
   108  
   109  // moveNodeInfoToHead moves a NodeInfo to the head of "cache.nodes" doubly
   110  // linked list. The head is the most recently updated NodeInfo.
   111  // We assume cache lock is already acquired.
   112  func (cache *cacheImpl) moveNodeInfoToHead(logger klog.Logger, name string) {
   113  	ni, ok := cache.nodes[name]
   114  	if !ok {
   115  		logger.Error(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
   116  		return
   117  	}
   118  	// if the node info list item is already at the head, we are done.
   119  	if ni == cache.headNode {
   120  		return
   121  	}
   122  
   123  	if ni.prev != nil {
   124  		ni.prev.next = ni.next
   125  	}
   126  	if ni.next != nil {
   127  		ni.next.prev = ni.prev
   128  	}
   129  	if cache.headNode != nil {
   130  		cache.headNode.prev = ni
   131  	}
   132  	ni.next = cache.headNode
   133  	ni.prev = nil
   134  	cache.headNode = ni
   135  }
   136  
   137  // removeNodeInfoFromList removes a NodeInfo from the "cache.nodes" doubly
   138  // linked list.
   139  // We assume cache lock is already acquired.
   140  func (cache *cacheImpl) removeNodeInfoFromList(logger klog.Logger, name string) {
   141  	ni, ok := cache.nodes[name]
   142  	if !ok {
   143  		logger.Error(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
   144  		return
   145  	}
   146  
   147  	if ni.prev != nil {
   148  		ni.prev.next = ni.next
   149  	}
   150  	if ni.next != nil {
   151  		ni.next.prev = ni.prev
   152  	}
   153  	// if the removed item was at the head, we must update the head.
   154  	if ni == cache.headNode {
   155  		cache.headNode = ni.next
   156  	}
   157  	delete(cache.nodes, name)
   158  }
   159  
   160  // Dump produces a dump of the current scheduler cache. This is used for
   161  // debugging purposes only and shouldn't be confused with the UpdateSnapshot
   162  // function.
   163  // This method is expensive and should only be used off the critical path.
   164  func (cache *cacheImpl) Dump() *Dump {
   165  	cache.mu.RLock()
   166  	defer cache.mu.RUnlock()
   167  
   168  	nodes := make(map[string]*framework.NodeInfo, len(cache.nodes))
   169  	for k, v := range cache.nodes {
   170  		nodes[k] = v.info.Snapshot()
   171  	}
   172  
   173  	return &Dump{
   174  		Nodes:       nodes,
   175  		AssumedPods: cache.assumedPods.Union(nil),
   176  	}
   177  }
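
// Illustrative sketch, not part of the original file: Dump returns copies of the
// cached NodeInfos and assumed-pod keys, so the result can be inspected or logged
// for troubleshooting without holding any cache lock. The verbosity level and the
// name exampleDumpForDebugging are arbitrary, assumed choices.
func exampleDumpForDebugging(logger klog.Logger, c Cache) {
	dump := c.Dump()
	logger.V(4).Info("Scheduler cache dump",
		"nodes", len(dump.Nodes),
		"assumedPods", dump.AssumedPods.Len())
}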
   178  
   179  // UpdateSnapshot takes a snapshot of the cached NodeInfo map. This is called at
   180  // the beginning of every scheduling cycle.
   181  // The snapshot only includes Nodes that are not deleted at the time this function is called.
   182  // nodeInfo.Node() is guaranteed to be non-nil for all nodes in the snapshot.
   183  // This function tracks generation number of NodeInfo and updates only the
   184  // entries of an existing snapshot that have changed after the snapshot was taken.
   185  func (cache *cacheImpl) UpdateSnapshot(logger klog.Logger, nodeSnapshot *Snapshot) error {
   186  	cache.mu.Lock()
   187  	defer cache.mu.Unlock()
   188  
   189  	// Get the last generation of the snapshot.
   190  	snapshotGeneration := nodeSnapshot.generation
   191  
   192  	// NodeInfoList and HavePodsWithAffinityNodeInfoList must be re-created if a node was added
   193  	// or removed from the cache.
   194  	updateAllLists := false
   195  	// HavePodsWithAffinityNodeInfoList must be re-created if a node changed its
   196  	// status from having pods with affinity to NOT having pods with affinity or the other
   197  	// way around.
   198  	updateNodesHavePodsWithAffinity := false
   199  	// HavePodsWithRequiredAntiAffinityNodeInfoList must be re-created if a node changed its
   200  	// status from having pods with required anti-affinity to NOT having pods with required
   201  	// anti-affinity or the other way around.
   202  	updateNodesHavePodsWithRequiredAntiAffinity := false
   203  	// usedPVCSet must be re-created whenever the head node generation is greater than
   204  	// the last snapshot generation.
   205  	updateUsedPVCSet := false
   206  
   207  	// Start from the head of the NodeInfo doubly linked list and update snapshot
   208  	// of NodeInfos updated after the last snapshot.
   209  	for node := cache.headNode; node != nil; node = node.next {
   210  		if node.info.Generation <= snapshotGeneration {
   211  			// All the remaining nodes were updated before the existing snapshot was taken. We are done.
   212  			break
   213  		}
   214  		if np := node.info.Node(); np != nil {
   215  			existing, ok := nodeSnapshot.nodeInfoMap[np.Name]
   216  			if !ok {
   217  				updateAllLists = true
   218  				existing = &framework.NodeInfo{}
   219  				nodeSnapshot.nodeInfoMap[np.Name] = existing
   220  			}
   221  			clone := node.info.Snapshot()
   222  			// We track nodes that have pods with affinity; here we check if this node changed its
   223  			// status from having pods with affinity to NOT having pods with affinity or the other
   224  			// way around.
   225  			if (len(existing.PodsWithAffinity) > 0) != (len(clone.PodsWithAffinity) > 0) {
   226  				updateNodesHavePodsWithAffinity = true
   227  			}
   228  			if (len(existing.PodsWithRequiredAntiAffinity) > 0) != (len(clone.PodsWithRequiredAntiAffinity) > 0) {
   229  				updateNodesHavePodsWithRequiredAntiAffinity = true
   230  			}
   231  			if !updateUsedPVCSet {
   232  				if len(existing.PVCRefCounts) != len(clone.PVCRefCounts) {
   233  					updateUsedPVCSet = true
   234  				} else {
   235  					for pvcKey := range clone.PVCRefCounts {
   236  						if _, found := existing.PVCRefCounts[pvcKey]; !found {
   237  							updateUsedPVCSet = true
   238  							break
   239  						}
   240  					}
   241  				}
   242  			}
   243  			// We need to preserve the original pointer of the NodeInfo struct since it
   244  			// is used in the NodeInfoList, which we may not update.
   245  			*existing = *clone
   246  		}
   247  	}
   248  	// Update the snapshot generation with the latest NodeInfo generation.
   249  	if cache.headNode != nil {
   250  		nodeSnapshot.generation = cache.headNode.info.Generation
   251  	}
   252  
   253  	// Compare against the nodes in nodeTree.
   254  	// Deleted nodes get removed from the tree, but they might remain in the nodes map
   255  	// if they still have non-deleted Pods.
   256  	if len(nodeSnapshot.nodeInfoMap) > cache.nodeTree.numNodes {
   257  		cache.removeDeletedNodesFromSnapshot(nodeSnapshot)
   258  		updateAllLists = true
   259  	}
   260  
   261  	if updateAllLists || updateNodesHavePodsWithAffinity || updateNodesHavePodsWithRequiredAntiAffinity || updateUsedPVCSet {
   262  		cache.updateNodeInfoSnapshotList(logger, nodeSnapshot, updateAllLists)
   263  	}
   264  
   265  	if len(nodeSnapshot.nodeInfoList) != cache.nodeTree.numNodes {
   266  		err := fmt.Errorf("snapshot state is not consistent, length of NodeInfoList=%v not equal to length of nodes in tree=%v"+
   267  			", length of NodeInfoMap=%v, length of nodes in cache=%v"+
   268  			", trying to recover",
   269  			len(nodeSnapshot.nodeInfoList), cache.nodeTree.numNodes,
   270  			len(nodeSnapshot.nodeInfoMap), len(cache.nodes))
   271  		logger.Error(nil, err.Error())
   272  		// We will try to recover by re-creating the lists for the next scheduling cycle, but still return an
   273  		// error to surface the problem; the error will likely cause the current scheduling cycle to fail.
   274  		cache.updateNodeInfoSnapshotList(logger, nodeSnapshot, true)
   275  		return err
   276  	}
   277  
   278  	return nil
   279  }
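
// Illustrative sketch, not part of the original file: the scheduler keeps one
// long-lived Snapshot and refreshes it with UpdateSnapshot at the start of each
// scheduling cycle, so only NodeInfos whose generation changed since the last
// call are copied again. NewEmptySnapshot comes from this package's snapshot.go;
// the name exampleSnapshotRefresh is an assumed, hypothetical wrapper.
func exampleSnapshotRefresh(logger klog.Logger, c Cache, snapshot *Snapshot) (*Snapshot, error) {
	if snapshot == nil {
		// First cycle: start from an empty snapshot; later cycles reuse it.
		snapshot = NewEmptySnapshot()
	}
	if err := c.UpdateSnapshot(logger, snapshot); err != nil {
		return snapshot, err
	}
	return snapshot, nil
}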
   280  
   281  func (cache *cacheImpl) updateNodeInfoSnapshotList(logger klog.Logger, snapshot *Snapshot, updateAll bool) {
   282  	snapshot.havePodsWithAffinityNodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
   283  	snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
   284  	snapshot.usedPVCSet = sets.New[string]()
   285  	if updateAll {
   286  		// Take a snapshot of the order of the nodes in the tree.
   287  		snapshot.nodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
   288  		nodesList, err := cache.nodeTree.list()
   289  		if err != nil {
   290  			logger.Error(err, "Error occurred while retrieving the list of names of the nodes from node tree")
   291  		}
   292  		for _, nodeName := range nodesList {
   293  			if nodeInfo := snapshot.nodeInfoMap[nodeName]; nodeInfo != nil {
   294  				snapshot.nodeInfoList = append(snapshot.nodeInfoList, nodeInfo)
   295  				if len(nodeInfo.PodsWithAffinity) > 0 {
   296  					snapshot.havePodsWithAffinityNodeInfoList = append(snapshot.havePodsWithAffinityNodeInfoList, nodeInfo)
   297  				}
   298  				if len(nodeInfo.PodsWithRequiredAntiAffinity) > 0 {
   299  					snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
   300  				}
   301  				for key := range nodeInfo.PVCRefCounts {
   302  					snapshot.usedPVCSet.Insert(key)
   303  				}
   304  			} else {
   305  				logger.Error(nil, "Node exists in nodeTree but not in NodeInfoMap, this should not happen", "node", klog.KRef("", nodeName))
   306  			}
   307  		}
   308  	} else {
   309  		for _, nodeInfo := range snapshot.nodeInfoList {
   310  			if len(nodeInfo.PodsWithAffinity) > 0 {
   311  				snapshot.havePodsWithAffinityNodeInfoList = append(snapshot.havePodsWithAffinityNodeInfoList, nodeInfo)
   312  			}
   313  			if len(nodeInfo.PodsWithRequiredAntiAffinity) > 0 {
   314  				snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
   315  			}
   316  			for key := range nodeInfo.PVCRefCounts {
   317  				snapshot.usedPVCSet.Insert(key)
   318  			}
   319  		}
   320  	}
   321  }
   322  
   323  // If certain nodes were deleted after the last snapshot was taken, we should remove them from the snapshot.
   324  func (cache *cacheImpl) removeDeletedNodesFromSnapshot(snapshot *Snapshot) {
   325  	toDelete := len(snapshot.nodeInfoMap) - cache.nodeTree.numNodes
   326  	for name := range snapshot.nodeInfoMap {
   327  		if toDelete <= 0 {
   328  			break
   329  		}
   330  		if n, ok := cache.nodes[name]; !ok || n.info.Node() == nil {
   331  			delete(snapshot.nodeInfoMap, name)
   332  			toDelete--
   333  		}
   334  	}
   335  }
   336  
   337  // NodeCount returns the number of nodes in the cache.
   338  // DO NOT use outside of tests.
   339  func (cache *cacheImpl) NodeCount() int {
   340  	cache.mu.RLock()
   341  	defer cache.mu.RUnlock()
   342  	return len(cache.nodes)
   343  }
   344  
   345  // PodCount returns the number of pods in the cache (including those from deleted nodes).
   346  // DO NOT use outside of tests.
   347  func (cache *cacheImpl) PodCount() (int, error) {
   348  	cache.mu.RLock()
   349  	defer cache.mu.RUnlock()
   350  	// Sum the pods on every cached node, including nodes that have already been
   351  	// deleted from the cluster but still hold pods whose deletion events have not
   352  	// arrived yet.
   353  	count := 0
   354  	for _, n := range cache.nodes {
   355  		count += len(n.info.Pods)
   356  	}
   357  	return count, nil
   358  }
   359  
   360  func (cache *cacheImpl) AssumePod(logger klog.Logger, pod *v1.Pod) error {
   361  	key, err := framework.GetPodKey(pod)
   362  	if err != nil {
   363  		return err
   364  	}
   365  
   366  	cache.mu.Lock()
   367  	defer cache.mu.Unlock()
   368  	if _, ok := cache.podStates[key]; ok {
   369  		return fmt.Errorf("pod %v(%v) is in the cache, so can't be assumed", key, klog.KObj(pod))
   370  	}
   371  
   372  	return cache.addPod(logger, pod, true)
   373  }
   374  
   375  func (cache *cacheImpl) FinishBinding(logger klog.Logger, pod *v1.Pod) error {
   376  	return cache.finishBinding(logger, pod, time.Now())
   377  }
   378  
   379  // finishBinding exists to make tests deterministic by injecting now as an argument
   380  func (cache *cacheImpl) finishBinding(logger klog.Logger, pod *v1.Pod, now time.Time) error {
   381  	key, err := framework.GetPodKey(pod)
   382  	if err != nil {
   383  		return err
   384  	}
   385  
   386  	cache.mu.RLock()
   387  	defer cache.mu.RUnlock()
   388  
   389  	logger.V(5).Info("Finished binding for pod, can be expired", "podKey", key, "pod", klog.KObj(pod))
   390  	currState, ok := cache.podStates[key]
   391  	if ok && cache.assumedPods.Has(key) {
   392  		if cache.ttl == time.Duration(0) {
   393  			currState.deadline = nil
   394  		} else {
   395  			dl := now.Add(cache.ttl)
   396  			currState.deadline = &dl
   397  		}
   398  		currState.bindingFinished = true
   399  	}
   400  	return nil
   401  }
   402  
   403  func (cache *cacheImpl) ForgetPod(logger klog.Logger, pod *v1.Pod) error {
   404  	key, err := framework.GetPodKey(pod)
   405  	if err != nil {
   406  		return err
   407  	}
   408  
   409  	cache.mu.Lock()
   410  	defer cache.mu.Unlock()
   411  
   412  	currState, ok := cache.podStates[key]
   413  	if ok && currState.pod.Spec.NodeName != pod.Spec.NodeName {
   414  		return fmt.Errorf("pod %v(%v) was assumed on %v but assigned to %v", key, klog.KObj(pod), pod.Spec.NodeName, currState.pod.Spec.NodeName)
   415  	}
   416  
   417  	// Only an assumed pod can be forgotten.
   418  	if ok && cache.assumedPods.Has(key) {
   419  		return cache.removePod(logger, pod)
   420  	}
   421  	return fmt.Errorf("pod %v(%v) wasn't assumed so cannot be forgotten", key, klog.KObj(pod))
   422  }
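
// Illustrative sketch, not part of the original file: the optimistic assume/bind
// flow these methods support. AssumePod reserves the pod's resources before the
// bind call, FinishBinding starts the expiration clock once binding succeeds, and
// ForgetPod rolls the assumption back if binding fails. The bind parameter and the
// name exampleAssumeBindFlow are hypothetical stand-ins for the real binding step.
func exampleAssumeBindFlow(logger klog.Logger, c Cache, assumedPod *v1.Pod, bind func(*v1.Pod) error) error {
	if err := c.AssumePod(logger, assumedPod); err != nil {
		return err
	}
	if err := bind(assumedPod); err != nil {
		// Binding failed: drop the assumption so the reserved resources free up.
		if forgetErr := c.ForgetPod(logger, assumedPod); forgetErr != nil {
			logger.Error(forgetErr, "Failed to forget pod after failed binding", "pod", klog.KObj(assumedPod))
		}
		return err
	}
	// Binding succeeded: let the assumed pod expire after the TTL in case the
	// corresponding Add event never arrives.
	return c.FinishBinding(logger, assumedPod)
}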
   423  
   424  // Assumes that lock is already acquired.
   425  func (cache *cacheImpl) addPod(logger klog.Logger, pod *v1.Pod, assumePod bool) error {
   426  	key, err := framework.GetPodKey(pod)
   427  	if err != nil {
   428  		return err
   429  	}
   430  	n, ok := cache.nodes[pod.Spec.NodeName]
   431  	if !ok {
   432  		n = newNodeInfoListItem(framework.NewNodeInfo())
   433  		cache.nodes[pod.Spec.NodeName] = n
   434  	}
   435  	n.info.AddPod(pod)
   436  	cache.moveNodeInfoToHead(logger, pod.Spec.NodeName)
   437  	ps := &podState{
   438  		pod: pod,
   439  	}
   440  	cache.podStates[key] = ps
   441  	if assumePod {
   442  		cache.assumedPods.Insert(key)
   443  	}
   444  	return nil
   445  }
   446  
   447  // Assumes that lock is already acquired.
   448  func (cache *cacheImpl) updatePod(logger klog.Logger, oldPod, newPod *v1.Pod) error {
   449  	if err := cache.removePod(logger, oldPod); err != nil {
   450  		return err
   451  	}
   452  	return cache.addPod(logger, newPod, false)
   453  }
   454  
   455  // Assumes that lock is already acquired.
   456  // Removes a pod from the cached node info. If the node information was already
   457  // removed and there are no more pods left in the node, cleans up the node from
   458  // the cache.
   459  func (cache *cacheImpl) removePod(logger klog.Logger, pod *v1.Pod) error {
   460  	key, err := framework.GetPodKey(pod)
   461  	if err != nil {
   462  		return err
   463  	}
   464  
   465  	n, ok := cache.nodes[pod.Spec.NodeName]
   466  	if !ok {
   467  		logger.Error(nil, "Node not found when trying to remove pod", "node", klog.KRef("", pod.Spec.NodeName), "podKey", key, "pod", klog.KObj(pod))
   468  	} else {
   469  		if err := n.info.RemovePod(logger, pod); err != nil {
   470  			return err
   471  		}
   472  		if len(n.info.Pods) == 0 && n.info.Node() == nil {
   473  			cache.removeNodeInfoFromList(logger, pod.Spec.NodeName)
   474  		} else {
   475  			cache.moveNodeInfoToHead(logger, pod.Spec.NodeName)
   476  		}
   477  	}
   478  
   479  	delete(cache.podStates, key)
   480  	delete(cache.assumedPods, key)
   481  	return nil
   482  }
   483  
   484  func (cache *cacheImpl) AddPod(logger klog.Logger, pod *v1.Pod) error {
   485  	key, err := framework.GetPodKey(pod)
   486  	if err != nil {
   487  		return err
   488  	}
   489  
   490  	cache.mu.Lock()
   491  	defer cache.mu.Unlock()
   492  
   493  	currState, ok := cache.podStates[key]
   494  	switch {
   495  	case ok && cache.assumedPods.Has(key):
   496  		// When assuming, we've already added the Pod to the cache.
   497  		// Just update it here to make sure the Pod's status is up-to-date.
   498  		if err = cache.updatePod(logger, currState.pod, pod); err != nil {
   499  			logger.Error(err, "Error occurred while updating pod")
   500  		}
   501  		if currState.pod.Spec.NodeName != pod.Spec.NodeName {
   502  			// The pod was added to a different node than it was assumed to.
   503  			logger.Info("Pod was added to a different node than it was assumed", "podKey", key, "pod", klog.KObj(pod), "assumedNode", klog.KRef("", currState.pod.Spec.NodeName), "currentNode", klog.KRef("", pod.Spec.NodeName))
   504  			return nil
   505  		}
   506  	case !ok:
   507  		// Pod was expired. We should add it back.
   508  		if err = cache.addPod(logger, pod, false); err != nil {
   509  			logger.Error(err, "Error occurred while adding pod")
   510  		}
   511  	default:
   512  		return fmt.Errorf("pod %v(%v) was already in added state", key, klog.KObj(pod))
   513  	}
   514  	return nil
   515  }
   516  
   517  func (cache *cacheImpl) UpdatePod(logger klog.Logger, oldPod, newPod *v1.Pod) error {
   518  	key, err := framework.GetPodKey(oldPod)
   519  	if err != nil {
   520  		return err
   521  	}
   522  
   523  	cache.mu.Lock()
   524  	defer cache.mu.Unlock()
   525  
   526  	currState, ok := cache.podStates[key]
   527  	if !ok {
   528  		return fmt.Errorf("pod %v(%v) is not added to scheduler cache, so cannot be updated", key, klog.KObj(oldPod))
   529  	}
   530  
   531  	// An assumed pod won't receive Update/Remove events. It needs an Add event
   532  	// first, which changes its state from Assumed to Added.
   533  	if cache.assumedPods.Has(key) {
   534  		return fmt.Errorf("assumed pod %v(%v) should not be updated", key, klog.KObj(oldPod))
   535  	}
   536  
   537  	if currState.pod.Spec.NodeName != newPod.Spec.NodeName {
   538  		logger.Error(nil, "Pod updated on a different node than previously added to", "podKey", key, "pod", klog.KObj(oldPod))
   539  		logger.Error(nil, "scheduler cache is corrupted and can badly affect scheduling decisions")
   540  		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
   541  	}
   542  	return cache.updatePod(logger, oldPod, newPod)
   543  }
   544  
   545  func (cache *cacheImpl) RemovePod(logger klog.Logger, pod *v1.Pod) error {
   546  	key, err := framework.GetPodKey(pod)
   547  	if err != nil {
   548  		return err
   549  	}
   550  
   551  	cache.mu.Lock()
   552  	defer cache.mu.Unlock()
   553  
   554  	currState, ok := cache.podStates[key]
   555  	if !ok {
   556  		return fmt.Errorf("pod %v(%v) is not found in scheduler cache, so cannot be removed from it", key, klog.KObj(pod))
   557  	}
   558  	if currState.pod.Spec.NodeName != pod.Spec.NodeName {
   559  		logger.Error(nil, "Pod was added to a different node than it was assumed", "podKey", key, "pod", klog.KObj(pod), "assumedNode", klog.KRef("", currState.pod.Spec.NodeName), "currentNode", klog.KRef("", pod.Spec.NodeName))
   560  		if pod.Spec.NodeName != "" {
   561  			// An empty NodeName is possible when the scheduler misses a Delete
   562  			// event and it gets the last known state from the informer cache.
   563  			logger.Error(nil, "scheduler cache is corrupted and can badly affect scheduling decisions")
   564  			klog.FlushAndExit(klog.ExitFlushTimeout, 1)
   565  		}
   566  	}
   567  	return cache.removePod(logger, currState.pod)
   568  }
   569  
   570  func (cache *cacheImpl) IsAssumedPod(pod *v1.Pod) (bool, error) {
   571  	key, err := framework.GetPodKey(pod)
   572  	if err != nil {
   573  		return false, err
   574  	}
   575  
   576  	cache.mu.RLock()
   577  	defer cache.mu.RUnlock()
   578  
   579  	return cache.assumedPods.Has(key), nil
   580  }
   581  
   582  // GetPod might return a pod for which its node has already been deleted from
   583  // the main cache. This is useful to properly process pod update events.
   584  func (cache *cacheImpl) GetPod(pod *v1.Pod) (*v1.Pod, error) {
   585  	key, err := framework.GetPodKey(pod)
   586  	if err != nil {
   587  		return nil, err
   588  	}
   589  
   590  	cache.mu.RLock()
   591  	defer cache.mu.RUnlock()
   592  
   593  	podState, ok := cache.podStates[key]
   594  	if !ok {
   595  		return nil, fmt.Errorf("pod %v(%v) does not exist in scheduler cache", key, klog.KObj(pod))
   596  	}
   597  
   598  	return podState.pod, nil
   599  }
   600  
   601  func (cache *cacheImpl) AddNode(logger klog.Logger, node *v1.Node) *framework.NodeInfo {
   602  	cache.mu.Lock()
   603  	defer cache.mu.Unlock()
   604  
   605  	n, ok := cache.nodes[node.Name]
   606  	if !ok {
   607  		n = newNodeInfoListItem(framework.NewNodeInfo())
   608  		cache.nodes[node.Name] = n
   609  	} else {
   610  		cache.removeNodeImageStates(n.info.Node())
   611  	}
   612  	cache.moveNodeInfoToHead(logger, node.Name)
   613  
   614  	cache.nodeTree.addNode(logger, node)
   615  	cache.addNodeImageStates(node, n.info)
   616  	n.info.SetNode(node)
   617  	return n.info.Snapshot()
   618  }
   619  
   620  func (cache *cacheImpl) UpdateNode(logger klog.Logger, oldNode, newNode *v1.Node) *framework.NodeInfo {
   621  	cache.mu.Lock()
   622  	defer cache.mu.Unlock()
   623  	n, ok := cache.nodes[newNode.Name]
   624  	if !ok {
   625  		n = newNodeInfoListItem(framework.NewNodeInfo())
   626  		cache.nodes[newNode.Name] = n
   627  		cache.nodeTree.addNode(logger, newNode)
   628  	} else {
   629  		cache.removeNodeImageStates(n.info.Node())
   630  	}
   631  	cache.moveNodeInfoToHead(logger, newNode.Name)
   632  
   633  	cache.nodeTree.updateNode(logger, oldNode, newNode)
   634  	cache.addNodeImageStates(newNode, n.info)
   635  	n.info.SetNode(newNode)
   636  	return n.info.Snapshot()
   637  }
   638  
   639  // RemoveNode removes a node from the cache's tree.
   640  // The node might still have pods because their deletion events didn't arrive
   641  // yet. Those pods are considered removed from the cache, as the node tree is
   642  // the source of truth.
   643  // However, we keep a ghost node with the list of pods until all pod deletion
   644  // events have arrived. A ghost node is skipped from snapshots.
   645  func (cache *cacheImpl) RemoveNode(logger klog.Logger, node *v1.Node) error {
   646  	cache.mu.Lock()
   647  	defer cache.mu.Unlock()
   648  
   649  	n, ok := cache.nodes[node.Name]
   650  	if !ok {
   651  		return fmt.Errorf("node %v is not found", node.Name)
   652  	}
   653  	n.info.RemoveNode()
   654  	// We remove NodeInfo for this node only if there aren't any pods on this node.
   655  	// We can't do it unconditionally, because notifications about pods are delivered
   656  	// in a different watch, and thus can potentially be observed later, even though
   657  	// they happened before node removal.
   658  	if len(n.info.Pods) == 0 {
   659  		cache.removeNodeInfoFromList(logger, node.Name)
   660  	} else {
   661  		cache.moveNodeInfoToHead(logger, node.Name)
   662  	}
   663  	if err := cache.nodeTree.removeNode(logger, node); err != nil {
   664  		return err
   665  	}
   666  	cache.removeNodeImageStates(node)
   667  	return nil
   668  }
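
// Illustrative sketch, not part of the original file: it spells out the event
// ordering described in the RemoveNode comment above. When the node's Delete
// event arrives before its pods' Delete events, RemoveNode leaves a ghost
// NodeInfo behind, and the later RemovePod calls clean it up once the last pod
// is gone. The name exampleGhostNodeCleanup and its parameters are hypothetical.
func exampleGhostNodeCleanup(logger klog.Logger, c Cache, node *v1.Node, lateArrivingPods []*v1.Pod) error {
	if err := c.RemoveNode(logger, node); err != nil {
		return err
	}
	// Pod Delete events observed after the node was already removed.
	for _, p := range lateArrivingPods {
		if err := c.RemovePod(logger, p); err != nil {
			return err
		}
	}
	return nil
}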
   669  
   670  // addNodeImageStates adds the states of the images on the given node to the given nodeInfo and updates the imageStates in
   671  // the scheduler cache. This function assumes the scheduler cache lock has already been acquired.
   672  func (cache *cacheImpl) addNodeImageStates(node *v1.Node, nodeInfo *framework.NodeInfo) {
   673  	newSum := make(map[string]*framework.ImageStateSummary)
   674  
   675  	for _, image := range node.Status.Images {
   676  		for _, name := range image.Names {
   677  			// update the entry in imageStates
   678  			state, ok := cache.imageStates[name]
   679  			if !ok {
   680  				state = &framework.ImageStateSummary{
   681  					Size:  image.SizeBytes,
   682  					Nodes: sets.New(node.Name),
   683  				}
   684  				cache.imageStates[name] = state
   685  			} else {
   686  				state.Nodes.Insert(node.Name)
   687  			}
   688  			// create the ImageStateSummary for this image
   689  			if _, ok := newSum[name]; !ok {
   690  				newSum[name] = state
   691  			}
   692  		}
   693  	}
   694  	nodeInfo.ImageStates = newSum
   695  }
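
// Illustrative sketch, not part of the original file: one way the ImageStateSummary
// entries maintained above might be consumed, for example to see how large an image
// is and how many nodes already have it. The name exampleImageSpread is hypothetical.
func exampleImageSpread(nodeInfo *framework.NodeInfo, imageName string) (sizeBytes int64, numNodes int) {
	if state, ok := nodeInfo.ImageStates[imageName]; ok {
		return state.Size, state.Nodes.Len()
	}
	return 0, 0
}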
   696  
   697  // removeNodeImageStates removes the given node record from image entries having the node
   698  // in imageStates cache. After the removal, if any image becomes free, i.e., the image
   699  // is no longer available on any node, the image entry will be removed from imageStates.
   700  func (cache *cacheImpl) removeNodeImageStates(node *v1.Node) {
   701  	if node == nil {
   702  		return
   703  	}
   704  
   705  	for _, image := range node.Status.Images {
   706  		for _, name := range image.Names {
   707  			state, ok := cache.imageStates[name]
   708  			if ok {
   709  				state.Nodes.Delete(node.Name)
   710  				if state.Nodes.Len() == 0 {
   711  					// Remove the unused image to make sure the length of
   712  					// imageStates represents the total number of different
   713  					// images on all nodes
   714  					delete(cache.imageStates, name)
   715  				}
   716  			}
   717  		}
   718  	}
   719  }
   720  
   721  func (cache *cacheImpl) run(logger klog.Logger) {
   722  	go wait.Until(func() {
   723  		cache.cleanupAssumedPods(logger, time.Now())
   724  	}, cache.period, cache.stop)
   725  }
   726  
   727  // cleanupAssumedPods exists to make tests deterministic by taking the current time as an input argument.
   728  // It also reports metrics on the cache size for nodes, pods, and assumed pods.
   729  func (cache *cacheImpl) cleanupAssumedPods(logger klog.Logger, now time.Time) {
   730  	cache.mu.Lock()
   731  	defer cache.mu.Unlock()
   732  	defer cache.updateMetrics()
   733  
   734  	// The size of assumedPods should be small
   735  	for key := range cache.assumedPods {
   736  		ps, ok := cache.podStates[key]
   737  		if !ok {
   738  			logger.Error(nil, "Key found in assumed set but not in podStates, potentially a logical error")
   739  			klog.FlushAndExit(klog.ExitFlushTimeout, 1)
   740  		}
   741  		if !ps.bindingFinished {
   742  			logger.V(5).Info("Could not expire cache for pod as binding is still in progress", "podKey", key, "pod", klog.KObj(ps.pod))
   743  			continue
   744  		}
   745  		if cache.ttl != 0 && now.After(*ps.deadline) {
   746  			logger.Info("Pod expired", "podKey", key, "pod", klog.KObj(ps.pod))
   747  			if err := cache.removePod(logger, ps.pod); err != nil {
   748  				logger.Error(err, "ExpirePod failed", "podKey", key, "pod", klog.KObj(ps.pod))
   749  			}
   750  		}
   751  	}
   752  }
   753  
   754  // updateMetrics updates cache size metric values for pods, assumed pods, and nodes
   755  func (cache *cacheImpl) updateMetrics() {
   756  	metrics.CacheSize.WithLabelValues("assumed_pods").Set(float64(len(cache.assumedPods)))
   757  	metrics.CacheSize.WithLabelValues("pods").Set(float64(len(cache.podStates)))
   758  	metrics.CacheSize.WithLabelValues("nodes").Set(float64(len(cache.nodes)))
   759  }