k8s.io/kubernetes@v1.29.3/pkg/scheduler/internal/cache/cache.go
/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cache

import (
    "context"
    "fmt"
    "sync"
    "time"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/util/sets"
    "k8s.io/apimachinery/pkg/util/wait"
    "k8s.io/klog/v2"
    "k8s.io/kubernetes/pkg/scheduler/framework"
    "k8s.io/kubernetes/pkg/scheduler/metrics"
)

var (
    cleanAssumedPeriod = 1 * time.Second
)

// New returns a Cache implementation.
// It automatically starts a goroutine that manages expiration of assumed pods.
// "ttl" is how long an assumed pod stays in the cache before it expires.
// "ctx" is the context whose cancellation stops the background goroutine.
func New(ctx context.Context, ttl time.Duration) Cache {
    logger := klog.FromContext(ctx)
    cache := newCache(ctx, ttl, cleanAssumedPeriod)
    cache.run(logger)
    return cache
}

// nodeInfoListItem holds a NodeInfo pointer and acts as an item in a doubly
// linked list. When a NodeInfo is updated, it goes to the head of the list.
// The items closer to the head are the most recently updated items.
type nodeInfoListItem struct {
    info *framework.NodeInfo
    next *nodeInfoListItem
    prev *nodeInfoListItem
}

type cacheImpl struct {
    stop   <-chan struct{}
    ttl    time.Duration
    period time.Duration

    // This mutex guards all fields within this cache struct.
    mu sync.RWMutex
    // a set of assumed pod keys.
    // The key could further be used to get an entry in podStates.
    assumedPods sets.Set[string]
    // a map from pod key to podState.
    podStates map[string]*podState
    nodes     map[string]*nodeInfoListItem
    // headNode points to the most recently updated NodeInfo in "nodes". It is the
    // head of the linked list.
    headNode *nodeInfoListItem
    nodeTree *nodeTree
    // A map from image name to its ImageStateSummary.
    imageStates map[string]*framework.ImageStateSummary
}

type podState struct {
    pod *v1.Pod
    // Used by assumedPod to determine expiration.
    // If deadline is nil, assumedPod will never expire.
    deadline *time.Time
    // Used to block cache from expiring assumedPod if binding still runs
    bindingFinished bool
}

func newCache(ctx context.Context, ttl, period time.Duration) *cacheImpl {
    logger := klog.FromContext(ctx)
    return &cacheImpl{
        ttl:    ttl,
        period: period,
        stop:   ctx.Done(),

        nodes:       make(map[string]*nodeInfoListItem),
        nodeTree:    newNodeTree(logger, nil),
        assumedPods: sets.New[string](),
        podStates:   make(map[string]*podState),
        imageStates: make(map[string]*framework.ImageStateSummary),
    }
}

// newNodeInfoListItem initializes a new nodeInfoListItem.
func newNodeInfoListItem(ni *framework.NodeInfo) *nodeInfoListItem {
    return &nodeInfoListItem{
        info: ni,
    }
}

// moveNodeInfoToHead moves a NodeInfo to the head of "cache.nodes" doubly
// linked list.
// The head is the most recently updated NodeInfo.
// We assume cache lock is already acquired.
func (cache *cacheImpl) moveNodeInfoToHead(logger klog.Logger, name string) {
    ni, ok := cache.nodes[name]
    if !ok {
        logger.Error(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
        return
    }
    // if the node info list item is already at the head, we are done.
    if ni == cache.headNode {
        return
    }

    if ni.prev != nil {
        ni.prev.next = ni.next
    }
    if ni.next != nil {
        ni.next.prev = ni.prev
    }
    if cache.headNode != nil {
        cache.headNode.prev = ni
    }
    ni.next = cache.headNode
    ni.prev = nil
    cache.headNode = ni
}

// removeNodeInfoFromList removes a NodeInfo from the "cache.nodes" doubly
// linked list.
// We assume cache lock is already acquired.
func (cache *cacheImpl) removeNodeInfoFromList(logger klog.Logger, name string) {
    ni, ok := cache.nodes[name]
    if !ok {
        logger.Error(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
        return
    }

    if ni.prev != nil {
        ni.prev.next = ni.next
    }
    if ni.next != nil {
        ni.next.prev = ni.prev
    }
    // if the removed item was at the head, we must update the head.
    if ni == cache.headNode {
        cache.headNode = ni.next
    }
    delete(cache.nodes, name)
}

// Dump produces a dump of the current scheduler cache. This is used for
// debugging purposes only and shouldn't be confused with the UpdateSnapshot
// function.
// This method is expensive and should only be used outside the critical path.
func (cache *cacheImpl) Dump() *Dump {
    cache.mu.RLock()
    defer cache.mu.RUnlock()

    nodes := make(map[string]*framework.NodeInfo, len(cache.nodes))
    for k, v := range cache.nodes {
        nodes[k] = v.info.Snapshot()
    }

    return &Dump{
        Nodes:       nodes,
        AssumedPods: cache.assumedPods.Union(nil),
    }
}

// UpdateSnapshot takes a snapshot of the cached NodeInfo map. This is called at the
// beginning of every scheduling cycle.
// The snapshot only includes Nodes that are not deleted at the time this function is called.
// nodeInfo.Node() is guaranteed to be not nil for all the nodes in the snapshot.
// This function tracks the generation number of NodeInfo and updates only the
// entries of an existing snapshot that have changed after the snapshot was taken.
func (cache *cacheImpl) UpdateSnapshot(logger klog.Logger, nodeSnapshot *Snapshot) error {
    cache.mu.Lock()
    defer cache.mu.Unlock()

    // Get the last generation of the snapshot.
    snapshotGeneration := nodeSnapshot.generation

    // NodeInfoList and HavePodsWithAffinityNodeInfoList must be re-created if a node was added
    // or removed from the cache.
    updateAllLists := false
    // HavePodsWithAffinityNodeInfoList must be re-created if a node changed its
    // status from having pods with affinity to NOT having pods with affinity or the other
    // way around.
    updateNodesHavePodsWithAffinity := false
    // HavePodsWithRequiredAntiAffinityNodeInfoList must be re-created if a node changed its
    // status from having pods with required anti-affinity to NOT having pods with required
    // anti-affinity or the other way around.
    updateNodesHavePodsWithRequiredAntiAffinity := false
    // usedPVCSet must be re-created whenever the head node generation is greater than
    // last snapshot generation.
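    // (Editor's sketch, not part of the upstream comments.) The flag below is flipped
    // by the loop that follows when an updated node's PVC reference counts differ from
    // the snapshot's, e.g. when a pod referencing a previously unseen claim was added
    // and the key lookup on the old snapshot entry fails:
    //
    //	_, found := existing.PVCRefCounts["some-namespace/some-claim"] // hypothetical key; !found triggers a rebuild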
    updateUsedPVCSet := false

    // Start from the head of the NodeInfo doubly linked list and update the snapshot
    // of NodeInfos updated after the last snapshot.
    for node := cache.headNode; node != nil; node = node.next {
        if node.info.Generation <= snapshotGeneration {
            // all remaining nodes were updated before the existing snapshot. We are done.
            break
        }
        if np := node.info.Node(); np != nil {
            existing, ok := nodeSnapshot.nodeInfoMap[np.Name]
            if !ok {
                updateAllLists = true
                existing = &framework.NodeInfo{}
                nodeSnapshot.nodeInfoMap[np.Name] = existing
            }
            clone := node.info.Snapshot()
            // We track nodes that have pods with affinity; here we check if this node changed its
            // status from having pods with affinity to NOT having pods with affinity or the other
            // way around.
            if (len(existing.PodsWithAffinity) > 0) != (len(clone.PodsWithAffinity) > 0) {
                updateNodesHavePodsWithAffinity = true
            }
            if (len(existing.PodsWithRequiredAntiAffinity) > 0) != (len(clone.PodsWithRequiredAntiAffinity) > 0) {
                updateNodesHavePodsWithRequiredAntiAffinity = true
            }
            if !updateUsedPVCSet {
                if len(existing.PVCRefCounts) != len(clone.PVCRefCounts) {
                    updateUsedPVCSet = true
                } else {
                    for pvcKey := range clone.PVCRefCounts {
                        if _, found := existing.PVCRefCounts[pvcKey]; !found {
                            updateUsedPVCSet = true
                            break
                        }
                    }
                }
            }
            // We need to preserve the original pointer of the NodeInfo struct since it
            // is used in the NodeInfoList, which we may not update.
            *existing = *clone
        }
    }
    // Update the snapshot generation with the latest NodeInfo generation.
    if cache.headNode != nil {
        nodeSnapshot.generation = cache.headNode.info.Generation
    }

    // Compare the number of nodes in the snapshot with the number in the node tree.
    // Deleted nodes get removed from the tree, but they might remain in the nodes map
    // if they still have non-deleted Pods.
    if len(nodeSnapshot.nodeInfoMap) > cache.nodeTree.numNodes {
        cache.removeDeletedNodesFromSnapshot(nodeSnapshot)
        updateAllLists = true
    }

    if updateAllLists || updateNodesHavePodsWithAffinity || updateNodesHavePodsWithRequiredAntiAffinity || updateUsedPVCSet {
        cache.updateNodeInfoSnapshotList(logger, nodeSnapshot, updateAllLists)
    }

    if len(nodeSnapshot.nodeInfoList) != cache.nodeTree.numNodes {
        errMsg := fmt.Sprintf("snapshot state is not consistent, length of NodeInfoList=%v not equal to length of nodes in tree=%v "+
            ", length of NodeInfoMap=%v, length of nodes in cache=%v"+
            ", trying to recover",
            len(nodeSnapshot.nodeInfoList), cache.nodeTree.numNodes,
            len(nodeSnapshot.nodeInfoMap), len(cache.nodes))
        logger.Error(nil, errMsg)
        // We will try to recover by re-creating the lists for the next scheduling cycle, but still return an
        // error to surface the problem; the error will likely cause the current scheduling cycle to fail.
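        // (Editor's sketch, not part of the upstream comments.) The invariant being
        // restored here is that the flattened snapshot list mirrors the node tree:
        //
        //	len(nodeSnapshot.nodeInfoList) == cache.nodeTree.numNodes
        //
        // which is why the rebuild below passes updateAll=true instead of patching
        // the existing lists.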
        cache.updateNodeInfoSnapshotList(logger, nodeSnapshot, true)
        return fmt.Errorf(errMsg)
    }

    return nil
}

func (cache *cacheImpl) updateNodeInfoSnapshotList(logger klog.Logger, snapshot *Snapshot, updateAll bool) {
    snapshot.havePodsWithAffinityNodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
    snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
    snapshot.usedPVCSet = sets.New[string]()
    if updateAll {
        // Take a snapshot of the nodes order in the tree
        snapshot.nodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
        nodesList, err := cache.nodeTree.list()
        if err != nil {
            logger.Error(err, "Error occurred while retrieving the list of names of the nodes from node tree")
        }
        for _, nodeName := range nodesList {
            if nodeInfo := snapshot.nodeInfoMap[nodeName]; nodeInfo != nil {
                snapshot.nodeInfoList = append(snapshot.nodeInfoList, nodeInfo)
                if len(nodeInfo.PodsWithAffinity) > 0 {
                    snapshot.havePodsWithAffinityNodeInfoList = append(snapshot.havePodsWithAffinityNodeInfoList, nodeInfo)
                }
                if len(nodeInfo.PodsWithRequiredAntiAffinity) > 0 {
                    snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
                }
                for key := range nodeInfo.PVCRefCounts {
                    snapshot.usedPVCSet.Insert(key)
                }
            } else {
                logger.Error(nil, "Node exists in nodeTree but not in NodeInfoMap, this should not happen", "node", klog.KRef("", nodeName))
            }
        }
    } else {
        for _, nodeInfo := range snapshot.nodeInfoList {
            if len(nodeInfo.PodsWithAffinity) > 0 {
                snapshot.havePodsWithAffinityNodeInfoList = append(snapshot.havePodsWithAffinityNodeInfoList, nodeInfo)
            }
            if len(nodeInfo.PodsWithRequiredAntiAffinity) > 0 {
                snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
            }
            for key := range nodeInfo.PVCRefCounts {
                snapshot.usedPVCSet.Insert(key)
            }
        }
    }
}

// If certain nodes were deleted after the last snapshot was taken, we should remove them from the snapshot.
func (cache *cacheImpl) removeDeletedNodesFromSnapshot(snapshot *Snapshot) {
    toDelete := len(snapshot.nodeInfoMap) - cache.nodeTree.numNodes
    for name := range snapshot.nodeInfoMap {
        if toDelete <= 0 {
            break
        }
        if n, ok := cache.nodes[name]; !ok || n.info.Node() == nil {
            delete(snapshot.nodeInfoMap, name)
            toDelete--
        }
    }
}

// NodeCount returns the number of nodes in the cache.
// DO NOT use outside of tests.
func (cache *cacheImpl) NodeCount() int {
    cache.mu.RLock()
    defer cache.mu.RUnlock()
    return len(cache.nodes)
}

// PodCount returns the number of pods in the cache (including those from deleted nodes).
// DO NOT use outside of tests.
func (cache *cacheImpl) PodCount() (int, error) {
    cache.mu.RLock()
    defer cache.mu.RUnlock()
    // Count the pods on every cached node, including ghost nodes whose Node object
    // has been deleted but which still hold pods whose delete events haven't arrived.
    count := 0
    for _, n := range cache.nodes {
        count += len(n.info.Pods)
    }
    return count, nil
}

func (cache *cacheImpl) AssumePod(logger klog.Logger, pod *v1.Pod) error {
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return err
    }

    cache.mu.Lock()
    defer cache.mu.Unlock()
    if _, ok := cache.podStates[key]; ok {
        return fmt.Errorf("pod %v(%v) is in the cache, so can't be assumed", key, klog.KObj(pod))
    }

    return cache.addPod(logger, pod, true)
}

func (cache *cacheImpl) FinishBinding(logger klog.Logger, pod *v1.Pod) error {
    return cache.finishBinding(logger, pod, time.Now())
}

// finishBinding exists to make tests deterministic by injecting "now" as an argument.
func (cache *cacheImpl) finishBinding(logger klog.Logger, pod *v1.Pod, now time.Time) error {
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return err
    }

    cache.mu.RLock()
    defer cache.mu.RUnlock()

    logger.V(5).Info("Finished binding for pod, can be expired", "podKey", key, "pod", klog.KObj(pod))
    currState, ok := cache.podStates[key]
    if ok && cache.assumedPods.Has(key) {
        if cache.ttl == time.Duration(0) {
            currState.deadline = nil
        } else {
            dl := now.Add(cache.ttl)
            currState.deadline = &dl
        }
        currState.bindingFinished = true
    }
    return nil
}

func (cache *cacheImpl) ForgetPod(logger klog.Logger, pod *v1.Pod) error {
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return err
    }

    cache.mu.Lock()
    defer cache.mu.Unlock()

    currState, ok := cache.podStates[key]
    if ok && currState.pod.Spec.NodeName != pod.Spec.NodeName {
        return fmt.Errorf("pod %v(%v) was assumed on %v but assigned to %v", key, klog.KObj(pod), pod.Spec.NodeName, currState.pod.Spec.NodeName)
    }

    // Only an assumed pod can be forgotten.
    if ok && cache.assumedPods.Has(key) {
        return cache.removePod(logger, pod)
    }
    return fmt.Errorf("pod %v(%v) wasn't assumed so cannot be forgotten", key, klog.KObj(pod))
}

// Assumes that lock is already acquired.
func (cache *cacheImpl) addPod(logger klog.Logger, pod *v1.Pod, assumePod bool) error {
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return err
    }
    n, ok := cache.nodes[pod.Spec.NodeName]
    if !ok {
        n = newNodeInfoListItem(framework.NewNodeInfo())
        cache.nodes[pod.Spec.NodeName] = n
    }
    n.info.AddPod(pod)
    cache.moveNodeInfoToHead(logger, pod.Spec.NodeName)
    ps := &podState{
        pod: pod,
    }
    cache.podStates[key] = ps
    if assumePod {
        cache.assumedPods.Insert(key)
    }
    return nil
}

// Assumes that lock is already acquired.
func (cache *cacheImpl) updatePod(logger klog.Logger, oldPod, newPod *v1.Pod) error {
    if err := cache.removePod(logger, oldPod); err != nil {
        return err
    }
    return cache.addPod(logger, newPod, false)
}

// Assumes that lock is already acquired.
// Removes a pod from the cached node info. If the node information was already
// removed and there are no more pods left on the node, cleans up the node from
// the cache.
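// A minimal sketch of the cleanup condition used below (editor's addition, not part
// of the upstream comment):
//
//	if len(n.info.Pods) == 0 && n.info.Node() == nil {
//		cache.removeNodeInfoFromList(logger, pod.Spec.NodeName) // drop the ghost node entry
//	}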
func (cache *cacheImpl) removePod(logger klog.Logger, pod *v1.Pod) error {
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return err
    }

    n, ok := cache.nodes[pod.Spec.NodeName]
    if !ok {
        logger.Error(nil, "Node not found when trying to remove pod", "node", klog.KRef("", pod.Spec.NodeName), "podKey", key, "pod", klog.KObj(pod))
    } else {
        if err := n.info.RemovePod(logger, pod); err != nil {
            return err
        }
        if len(n.info.Pods) == 0 && n.info.Node() == nil {
            cache.removeNodeInfoFromList(logger, pod.Spec.NodeName)
        } else {
            cache.moveNodeInfoToHead(logger, pod.Spec.NodeName)
        }
    }

    delete(cache.podStates, key)
    delete(cache.assumedPods, key)
    return nil
}

func (cache *cacheImpl) AddPod(logger klog.Logger, pod *v1.Pod) error {
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return err
    }

    cache.mu.Lock()
    defer cache.mu.Unlock()

    currState, ok := cache.podStates[key]
    switch {
    case ok && cache.assumedPods.Has(key):
        // When assuming, we've already added the Pod to the cache;
        // just update it here to make sure its status is up-to-date.
        if err = cache.updatePod(logger, currState.pod, pod); err != nil {
            logger.Error(err, "Error occurred while updating pod")
        }
        if currState.pod.Spec.NodeName != pod.Spec.NodeName {
            // The pod was added to a different node than it was assumed to.
            logger.Info("Pod was added to a different node than it was assumed", "podKey", key, "pod", klog.KObj(pod), "assumedNode", klog.KRef("", pod.Spec.NodeName), "currentNode", klog.KRef("", currState.pod.Spec.NodeName))
            return nil
        }
    case !ok:
        // Pod was expired. We should add it back.
        if err = cache.addPod(logger, pod, false); err != nil {
            logger.Error(err, "Error occurred while adding pod")
        }
    default:
        return fmt.Errorf("pod %v(%v) was already in added state", key, klog.KObj(pod))
    }
    return nil
}

func (cache *cacheImpl) UpdatePod(logger klog.Logger, oldPod, newPod *v1.Pod) error {
    key, err := framework.GetPodKey(oldPod)
    if err != nil {
        return err
    }

    cache.mu.Lock()
    defer cache.mu.Unlock()

    currState, ok := cache.podStates[key]
    if !ok {
        return fmt.Errorf("pod %v(%v) is not added to scheduler cache, so cannot be updated", key, klog.KObj(oldPod))
    }

    // An assumed pod won't receive Update/Remove events. It needs an Add event
    // before an Update event, at which point its state changes from Assumed to Added.
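    // (Editor's sketch, not part of the upstream comments.) The expected event order
    // for a scheduled pod is roughly:
    //
    //	AssumePod -> FinishBinding -> AddPod (Assumed becomes Added) -> UpdatePod* -> RemovePod
    //
    // so an assumed pod reaching this point indicates a skipped Add event.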
    if cache.assumedPods.Has(key) {
        return fmt.Errorf("assumed pod %v(%v) should not be updated", key, klog.KObj(oldPod))
    }

    if currState.pod.Spec.NodeName != newPod.Spec.NodeName {
        logger.Error(nil, "Pod updated on a different node than previously added to", "podKey", key, "pod", klog.KObj(oldPod))
        logger.Error(nil, "scheduler cache is corrupted and can badly affect scheduling decisions")
        klog.FlushAndExit(klog.ExitFlushTimeout, 1)
    }
    return cache.updatePod(logger, oldPod, newPod)
}

func (cache *cacheImpl) RemovePod(logger klog.Logger, pod *v1.Pod) error {
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return err
    }

    cache.mu.Lock()
    defer cache.mu.Unlock()

    currState, ok := cache.podStates[key]
    if !ok {
        return fmt.Errorf("pod %v(%v) is not found in scheduler cache, so cannot be removed from it", key, klog.KObj(pod))
    }
    if currState.pod.Spec.NodeName != pod.Spec.NodeName {
        logger.Error(nil, "Pod was added to a different node than it was assumed", "podKey", key, "pod", klog.KObj(pod), "assumedNode", klog.KRef("", pod.Spec.NodeName), "currentNode", klog.KRef("", currState.pod.Spec.NodeName))
        if pod.Spec.NodeName != "" {
            // An empty NodeName is possible when the scheduler misses a Delete
            // event and it gets the last known state from the informer cache.
            logger.Error(nil, "scheduler cache is corrupted and can badly affect scheduling decisions")
            klog.FlushAndExit(klog.ExitFlushTimeout, 1)
        }
    }
    return cache.removePod(logger, currState.pod)
}

func (cache *cacheImpl) IsAssumedPod(pod *v1.Pod) (bool, error) {
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return false, err
    }

    cache.mu.RLock()
    defer cache.mu.RUnlock()

    return cache.assumedPods.Has(key), nil
}

// GetPod might return a pod whose node has already been deleted from
// the main cache. This is useful to properly process pod update events.
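// A possible call site (editor's sketch; the surrounding handler and variable names
// are hypothetical, not taken from this package):
//
//	cached, err := schedCache.GetPod(newPod)
//	if err != nil {
//		// the pod was never added, or it already expired / was removed
//	}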
func (cache *cacheImpl) GetPod(pod *v1.Pod) (*v1.Pod, error) {
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return nil, err
    }

    cache.mu.RLock()
    defer cache.mu.RUnlock()

    podState, ok := cache.podStates[key]
    if !ok {
        return nil, fmt.Errorf("pod %v(%v) does not exist in scheduler cache", key, klog.KObj(pod))
    }

    return podState.pod, nil
}

func (cache *cacheImpl) AddNode(logger klog.Logger, node *v1.Node) *framework.NodeInfo {
    cache.mu.Lock()
    defer cache.mu.Unlock()

    n, ok := cache.nodes[node.Name]
    if !ok {
        n = newNodeInfoListItem(framework.NewNodeInfo())
        cache.nodes[node.Name] = n
    } else {
        cache.removeNodeImageStates(n.info.Node())
    }
    cache.moveNodeInfoToHead(logger, node.Name)

    cache.nodeTree.addNode(logger, node)
    cache.addNodeImageStates(node, n.info)
    n.info.SetNode(node)
    return n.info.Snapshot()
}

func (cache *cacheImpl) UpdateNode(logger klog.Logger, oldNode, newNode *v1.Node) *framework.NodeInfo {
    cache.mu.Lock()
    defer cache.mu.Unlock()
    n, ok := cache.nodes[newNode.Name]
    if !ok {
        n = newNodeInfoListItem(framework.NewNodeInfo())
        cache.nodes[newNode.Name] = n
        cache.nodeTree.addNode(logger, newNode)
    } else {
        cache.removeNodeImageStates(n.info.Node())
    }
    cache.moveNodeInfoToHead(logger, newNode.Name)

    cache.nodeTree.updateNode(logger, oldNode, newNode)
    cache.addNodeImageStates(newNode, n.info)
    n.info.SetNode(newNode)
    return n.info.Snapshot()
}

// RemoveNode removes a node from the cache's tree.
// The node might still have pods because their deletion events didn't arrive
// yet. Those pods are considered removed from the cache, as the node tree is
// the source of truth.
// However, we keep a ghost node with the list of pods until all pod deletion
// events have arrived. Ghost nodes are skipped when taking snapshots.
func (cache *cacheImpl) RemoveNode(logger klog.Logger, node *v1.Node) error {
    cache.mu.Lock()
    defer cache.mu.Unlock()

    n, ok := cache.nodes[node.Name]
    if !ok {
        return fmt.Errorf("node %v is not found", node.Name)
    }
    n.info.RemoveNode()
    // We remove NodeInfo for this node only if there aren't any pods on this node.
    // We can't do it unconditionally, because notifications about pods are delivered
    // in a different watch, and thus can potentially be observed later, even though
    // they happened before node removal.
    if len(n.info.Pods) == 0 {
        cache.removeNodeInfoFromList(logger, node.Name)
    } else {
        cache.moveNodeInfoToHead(logger, node.Name)
    }
    if err := cache.nodeTree.removeNode(logger, node); err != nil {
        return err
    }
    cache.removeNodeImageStates(node)
    return nil
}

// addNodeImageStates adds the state of the images on the given node to the given
// nodeInfo and updates the imageStates in the scheduler cache. This function assumes
// the lock to the scheduler cache has been acquired.
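// (Editor's note, not part of the upstream comment.) Entries are shared by pointer:
// the same *framework.ImageStateSummary stored in cache.imageStates[name] is also
// placed into NodeInfo.ImageStates, so, for a hypothetical image name:
//
//	cache.imageStates["registry.example.com/app:v1"].Nodes.Len() // number of nodes that have the image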
func (cache *cacheImpl) addNodeImageStates(node *v1.Node, nodeInfo *framework.NodeInfo) {
    newSum := make(map[string]*framework.ImageStateSummary)

    for _, image := range node.Status.Images {
        for _, name := range image.Names {
            // update the entry in imageStates
            state, ok := cache.imageStates[name]
            if !ok {
                state = &framework.ImageStateSummary{
                    Size:  image.SizeBytes,
                    Nodes: sets.New(node.Name),
                }
                cache.imageStates[name] = state
            } else {
                state.Nodes.Insert(node.Name)
            }
            // create the ImageStateSummary for this image
            if _, ok := newSum[name]; !ok {
                newSum[name] = state
            }
        }
    }
    nodeInfo.ImageStates = newSum
}

// removeNodeImageStates removes the given node from every image entry in the
// imageStates cache that references it. If an image is no longer available on
// any node after the removal, its entry is deleted from imageStates.
func (cache *cacheImpl) removeNodeImageStates(node *v1.Node) {
    if node == nil {
        return
    }

    for _, image := range node.Status.Images {
        for _, name := range image.Names {
            state, ok := cache.imageStates[name]
            if ok {
                state.Nodes.Delete(node.Name)
                if state.Nodes.Len() == 0 {
                    // Remove the unused image to make sure the length of
                    // imageStates represents the total number of different
                    // images on all nodes
                    delete(cache.imageStates, name)
                }
            }
        }
    }
}

func (cache *cacheImpl) run(logger klog.Logger) {
    go wait.Until(func() {
        cache.cleanupAssumedPods(logger, time.Now())
    }, cache.period, cache.stop)
}

// cleanupAssumedPods exists to make tests deterministic by taking the current time
// as an input argument.
// It also reports metrics on the cache size for nodes, pods, and assumed pods.
func (cache *cacheImpl) cleanupAssumedPods(logger klog.Logger, now time.Time) {
    cache.mu.Lock()
    defer cache.mu.Unlock()
    defer cache.updateMetrics()

    // The size of assumedPods should be small
    for key := range cache.assumedPods {
        ps, ok := cache.podStates[key]
        if !ok {
            logger.Error(nil, "Key found in assumed set but not in podStates, potentially a logical error")
            klog.FlushAndExit(klog.ExitFlushTimeout, 1)
        }
        if !ps.bindingFinished {
            logger.V(5).Info("Could not expire cache for pod as binding is still in progress", "podKey", key, "pod", klog.KObj(ps.pod))
            continue
        }
        if cache.ttl != 0 && now.After(*ps.deadline) {
            logger.Info("Pod expired", "podKey", key, "pod", klog.KObj(ps.pod))
            if err := cache.removePod(logger, ps.pod); err != nil {
                logger.Error(err, "ExpirePod failed", "podKey", key, "pod", klog.KObj(ps.pod))
            }
        }
    }
}

// updateMetrics updates cache size metric values for pods, assumed pods, and nodes
func (cache *cacheImpl) updateMetrics() {
    metrics.CacheSize.WithLabelValues("assumed_pods").Set(float64(len(cache.assumedPods)))
    metrics.CacheSize.WithLabelValues("pods").Set(float64(len(cache.podStates)))
    metrics.CacheSize.WithLabelValues("nodes").Set(float64(len(cache.nodes)))
}
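// exampleAssumeLifecycle is an editor-added sketch, not part of the upstream file.
// It illustrates how a hypothetical caller would drive the assume/bind lifecycle
// that the expiration logic above relies on. It assumes pod.Spec.NodeName has
// already been set to the chosen node and that "bind" performs the binding API call.
func exampleAssumeLifecycle(ctx context.Context, pod *v1.Pod, bind func() error) error {
    logger := klog.FromContext(ctx)
    // An arbitrary TTL chosen for illustration; the real value comes from scheduler setup.
    schedCache := New(ctx, 30*time.Second)

    // Reserve the pod's resources on the assumed node in the cache.
    if err := schedCache.AssumePod(logger, pod); err != nil {
        return err
    }

    if err := bind(); err != nil {
        // Binding failed: release the assumption right away instead of waiting
        // for the TTL-based cleanup.
        if forgetErr := schedCache.ForgetPod(logger, pod); forgetErr != nil {
            logger.Error(forgetErr, "Failed to forget assumed pod", "pod", klog.KObj(pod))
        }
        return err
    }

    // Binding succeeded: start the TTL countdown. The informer's Add event is
    // expected to confirm the pod via AddPod before the assumption expires.
    return schedCache.FinishBinding(logger, pod)
}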