k8s.io/kubernetes@v1.29.3/pkg/kubelet/status/state/state_checkpoint.go (about)

     1  /*
     2  Copyright 2021 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package state
    18  
    19  import (
    20  	"fmt"
    21  	"path"
    22  	"sync"
    23  
    24  	"k8s.io/api/core/v1"
    25  	"k8s.io/klog/v2"
    26  	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
    27  	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
    28  )
    29  
    30  var _ State = &stateCheckpoint{}
    31  
    32  type stateCheckpoint struct {
    33  	mux               sync.RWMutex
    34  	cache             State
    35  	checkpointManager checkpointmanager.CheckpointManager
    36  	checkpointName    string
    37  }
    38  
    39  // NewStateCheckpoint creates new State for keeping track of pod resource allocations with checkpoint backend
    40  func NewStateCheckpoint(stateDir, checkpointName string) (State, error) {
    41  	checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir)
    42  	if err != nil {
    43  		return nil, fmt.Errorf("failed to initialize checkpoint manager for pod allocation tracking: %v", err)
    44  	}
    45  	stateCheckpoint := &stateCheckpoint{
    46  		cache:             NewStateMemory(),
    47  		checkpointManager: checkpointManager,
    48  		checkpointName:    checkpointName,
    49  	}
    50  
    51  	if err := stateCheckpoint.restoreState(); err != nil {
    52  		//lint:ignore ST1005 user-facing error message
    53  		return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete pod allocation checkpoint file %q before restarting Kubelet", err, path.Join(stateDir, checkpointName))
    54  	}
    55  	return stateCheckpoint, nil
    56  }
    57  
    58  // restores state from a checkpoint and creates it if it doesn't exist
    59  func (sc *stateCheckpoint) restoreState() error {
    60  	sc.mux.Lock()
    61  	defer sc.mux.Unlock()
    62  	var err error
    63  
    64  	checkpoint := NewPodResourceAllocationCheckpoint()
    65  
    66  	if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpoint); err != nil {
    67  		if err == errors.ErrCheckpointNotFound {
    68  			return sc.storeState()
    69  		}
    70  		return err
    71  	}
    72  
    73  	sc.cache.SetPodResourceAllocation(checkpoint.AllocationEntries)
    74  	sc.cache.SetResizeStatus(checkpoint.ResizeStatusEntries)
    75  	klog.V(2).InfoS("State checkpoint: restored pod resource allocation state from checkpoint")
    76  	return nil
    77  }
    78  
    79  // saves state to a checkpoint, caller is responsible for locking
    80  func (sc *stateCheckpoint) storeState() error {
    81  	checkpoint := NewPodResourceAllocationCheckpoint()
    82  
    83  	podAllocation := sc.cache.GetPodResourceAllocation()
    84  	for pod := range podAllocation {
    85  		checkpoint.AllocationEntries[pod] = make(map[string]v1.ResourceList)
    86  		for container, alloc := range podAllocation[pod] {
    87  			checkpoint.AllocationEntries[pod][container] = alloc
    88  		}
    89  	}
    90  
    91  	podResizeStatus := sc.cache.GetResizeStatus()
    92  	checkpoint.ResizeStatusEntries = make(map[string]v1.PodResizeStatus)
    93  	for pUID, rStatus := range podResizeStatus {
    94  		checkpoint.ResizeStatusEntries[pUID] = rStatus
    95  	}
    96  
    97  	err := sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint)
    98  	if err != nil {
    99  		klog.ErrorS(err, "Failed to save pod allocation checkpoint")
   100  		return err
   101  	}
   102  	return nil
   103  }
   104  
   105  // GetContainerResourceAllocation returns current resources allocated to a pod's container
   106  func (sc *stateCheckpoint) GetContainerResourceAllocation(podUID string, containerName string) (v1.ResourceList, bool) {
   107  	sc.mux.RLock()
   108  	defer sc.mux.RUnlock()
   109  	return sc.cache.GetContainerResourceAllocation(podUID, containerName)
   110  }
   111  
   112  // GetPodResourceAllocation returns current pod resource allocation
   113  func (sc *stateCheckpoint) GetPodResourceAllocation() PodResourceAllocation {
   114  	sc.mux.RLock()
   115  	defer sc.mux.RUnlock()
   116  	return sc.cache.GetPodResourceAllocation()
   117  }
   118  
   119  // GetPodResizeStatus returns the last resize decision for a pod
   120  func (sc *stateCheckpoint) GetPodResizeStatus(podUID string) (v1.PodResizeStatus, bool) {
   121  	sc.mux.RLock()
   122  	defer sc.mux.RUnlock()
   123  	return sc.cache.GetPodResizeStatus(podUID)
   124  }
   125  
   126  // GetResizeStatus returns the set of resize decisions made
   127  func (sc *stateCheckpoint) GetResizeStatus() PodResizeStatus {
   128  	sc.mux.RLock()
   129  	defer sc.mux.RUnlock()
   130  	return sc.cache.GetResizeStatus()
   131  }
   132  
   133  // SetContainerResourceAllocation sets resources allocated to a pod's container
   134  func (sc *stateCheckpoint) SetContainerResourceAllocation(podUID string, containerName string, alloc v1.ResourceList) error {
   135  	sc.mux.Lock()
   136  	defer sc.mux.Unlock()
   137  	sc.cache.SetContainerResourceAllocation(podUID, containerName, alloc)
   138  	return sc.storeState()
   139  }
   140  
   141  // SetPodResourceAllocation sets pod resource allocation
   142  func (sc *stateCheckpoint) SetPodResourceAllocation(a PodResourceAllocation) error {
   143  	sc.mux.Lock()
   144  	defer sc.mux.Unlock()
   145  	sc.cache.SetPodResourceAllocation(a)
   146  	return sc.storeState()
   147  }
   148  
   149  // SetPodResizeStatus sets the last resize decision for a pod
   150  func (sc *stateCheckpoint) SetPodResizeStatus(podUID string, resizeStatus v1.PodResizeStatus) error {
   151  	sc.mux.Lock()
   152  	defer sc.mux.Unlock()
   153  	sc.cache.SetPodResizeStatus(podUID, resizeStatus)
   154  	return sc.storeState()
   155  }
   156  
   157  // SetResizeStatus sets the resize decisions
   158  func (sc *stateCheckpoint) SetResizeStatus(rs PodResizeStatus) error {
   159  	sc.mux.Lock()
   160  	defer sc.mux.Unlock()
   161  	sc.cache.SetResizeStatus(rs)
   162  	return sc.storeState()
   163  }
   164  
   165  // Delete deletes allocations for specified pod
   166  func (sc *stateCheckpoint) Delete(podUID string, containerName string) error {
   167  	sc.mux.Lock()
   168  	defer sc.mux.Unlock()
   169  	sc.cache.Delete(podUID, containerName)
   170  	return sc.storeState()
   171  }
   172  
   173  // ClearState clears the state and saves it in a checkpoint
   174  func (sc *stateCheckpoint) ClearState() error {
   175  	sc.mux.Lock()
   176  	defer sc.mux.Unlock()
   177  	sc.cache.ClearState()
   178  	return sc.storeState()
   179  }
   180  
   181  type noopStateCheckpoint struct{}
   182  
   183  // NewNoopStateCheckpoint creates a dummy state checkpoint manager
   184  func NewNoopStateCheckpoint() State {
   185  	return &noopStateCheckpoint{}
   186  }
   187  
   188  func (sc *noopStateCheckpoint) GetContainerResourceAllocation(_ string, _ string) (v1.ResourceList, bool) {
   189  	return nil, false
   190  }
   191  
   192  func (sc *noopStateCheckpoint) GetPodResourceAllocation() PodResourceAllocation {
   193  	return nil
   194  }
   195  
   196  func (sc *noopStateCheckpoint) GetPodResizeStatus(_ string) (v1.PodResizeStatus, bool) {
   197  	return "", false
   198  }
   199  
   200  func (sc *noopStateCheckpoint) GetResizeStatus() PodResizeStatus {
   201  	return nil
   202  }
   203  
   204  func (sc *noopStateCheckpoint) SetContainerResourceAllocation(_ string, _ string, _ v1.ResourceList) error {
   205  	return nil
   206  }
   207  
   208  func (sc *noopStateCheckpoint) SetPodResourceAllocation(_ PodResourceAllocation) error {
   209  	return nil
   210  }
   211  
   212  func (sc *noopStateCheckpoint) SetPodResizeStatus(_ string, _ v1.PodResizeStatus) error {
   213  	return nil
   214  }
   215  
   216  func (sc *noopStateCheckpoint) SetResizeStatus(_ PodResizeStatus) error {
   217  	return nil
   218  }
   219  
   220  func (sc *noopStateCheckpoint) Delete(_ string, _ string) error {
   221  	return nil
   222  }
   223  
   224  func (sc *noopStateCheckpoint) ClearState() error {
   225  	return nil
   226  }