k8s.io/kubernetes@v1.29.3/pkg/kubelet/status/state/state_checkpoint.go (about) 1 /* 2 Copyright 2021 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package state 18 19 import ( 20 "fmt" 21 "path" 22 "sync" 23 24 "k8s.io/api/core/v1" 25 "k8s.io/klog/v2" 26 "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" 27 "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" 28 ) 29 30 var _ State = &stateCheckpoint{} 31 32 type stateCheckpoint struct { 33 mux sync.RWMutex 34 cache State 35 checkpointManager checkpointmanager.CheckpointManager 36 checkpointName string 37 } 38 39 // NewStateCheckpoint creates new State for keeping track of pod resource allocations with checkpoint backend 40 func NewStateCheckpoint(stateDir, checkpointName string) (State, error) { 41 checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir) 42 if err != nil { 43 return nil, fmt.Errorf("failed to initialize checkpoint manager for pod allocation tracking: %v", err) 44 } 45 stateCheckpoint := &stateCheckpoint{ 46 cache: NewStateMemory(), 47 checkpointManager: checkpointManager, 48 checkpointName: checkpointName, 49 } 50 51 if err := stateCheckpoint.restoreState(); err != nil { 52 //lint:ignore ST1005 user-facing error message 53 return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete pod allocation checkpoint file %q before restarting Kubelet", err, path.Join(stateDir, checkpointName)) 54 } 55 return stateCheckpoint, nil 56 } 57 58 // restores state from a checkpoint and creates it if it doesn't exist 59 func (sc *stateCheckpoint) restoreState() error { 60 sc.mux.Lock() 61 defer sc.mux.Unlock() 62 var err error 63 64 checkpoint := NewPodResourceAllocationCheckpoint() 65 66 if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpoint); err != nil { 67 if err == errors.ErrCheckpointNotFound { 68 return sc.storeState() 69 } 70 return err 71 } 72 73 sc.cache.SetPodResourceAllocation(checkpoint.AllocationEntries) 74 sc.cache.SetResizeStatus(checkpoint.ResizeStatusEntries) 75 klog.V(2).InfoS("State checkpoint: restored pod resource allocation state from checkpoint") 76 return nil 77 } 78 79 // saves state to a checkpoint, caller is responsible for locking 80 func (sc *stateCheckpoint) storeState() error { 81 checkpoint := NewPodResourceAllocationCheckpoint() 82 83 podAllocation := sc.cache.GetPodResourceAllocation() 84 for pod := range podAllocation { 85 checkpoint.AllocationEntries[pod] = make(map[string]v1.ResourceList) 86 for container, alloc := range podAllocation[pod] { 87 checkpoint.AllocationEntries[pod][container] = alloc 88 } 89 } 90 91 podResizeStatus := sc.cache.GetResizeStatus() 92 checkpoint.ResizeStatusEntries = make(map[string]v1.PodResizeStatus) 93 for pUID, rStatus := range podResizeStatus { 94 checkpoint.ResizeStatusEntries[pUID] = rStatus 95 } 96 97 err := sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint) 98 if err != nil { 99 klog.ErrorS(err, "Failed to save pod allocation checkpoint") 100 return err 101 } 102 return nil 103 } 104 105 // GetContainerResourceAllocation returns current resources allocated to a pod's container 106 func (sc *stateCheckpoint) GetContainerResourceAllocation(podUID string, containerName string) (v1.ResourceList, bool) { 107 sc.mux.RLock() 108 defer sc.mux.RUnlock() 109 return sc.cache.GetContainerResourceAllocation(podUID, containerName) 110 } 111 112 // GetPodResourceAllocation returns current pod resource allocation 113 func (sc *stateCheckpoint) GetPodResourceAllocation() PodResourceAllocation { 114 sc.mux.RLock() 115 defer sc.mux.RUnlock() 116 return sc.cache.GetPodResourceAllocation() 117 } 118 119 // GetPodResizeStatus returns the last resize decision for a pod 120 func (sc *stateCheckpoint) GetPodResizeStatus(podUID string) (v1.PodResizeStatus, bool) { 121 sc.mux.RLock() 122 defer sc.mux.RUnlock() 123 return sc.cache.GetPodResizeStatus(podUID) 124 } 125 126 // GetResizeStatus returns the set of resize decisions made 127 func (sc *stateCheckpoint) GetResizeStatus() PodResizeStatus { 128 sc.mux.RLock() 129 defer sc.mux.RUnlock() 130 return sc.cache.GetResizeStatus() 131 } 132 133 // SetContainerResourceAllocation sets resources allocated to a pod's container 134 func (sc *stateCheckpoint) SetContainerResourceAllocation(podUID string, containerName string, alloc v1.ResourceList) error { 135 sc.mux.Lock() 136 defer sc.mux.Unlock() 137 sc.cache.SetContainerResourceAllocation(podUID, containerName, alloc) 138 return sc.storeState() 139 } 140 141 // SetPodResourceAllocation sets pod resource allocation 142 func (sc *stateCheckpoint) SetPodResourceAllocation(a PodResourceAllocation) error { 143 sc.mux.Lock() 144 defer sc.mux.Unlock() 145 sc.cache.SetPodResourceAllocation(a) 146 return sc.storeState() 147 } 148 149 // SetPodResizeStatus sets the last resize decision for a pod 150 func (sc *stateCheckpoint) SetPodResizeStatus(podUID string, resizeStatus v1.PodResizeStatus) error { 151 sc.mux.Lock() 152 defer sc.mux.Unlock() 153 sc.cache.SetPodResizeStatus(podUID, resizeStatus) 154 return sc.storeState() 155 } 156 157 // SetResizeStatus sets the resize decisions 158 func (sc *stateCheckpoint) SetResizeStatus(rs PodResizeStatus) error { 159 sc.mux.Lock() 160 defer sc.mux.Unlock() 161 sc.cache.SetResizeStatus(rs) 162 return sc.storeState() 163 } 164 165 // Delete deletes allocations for specified pod 166 func (sc *stateCheckpoint) Delete(podUID string, containerName string) error { 167 sc.mux.Lock() 168 defer sc.mux.Unlock() 169 sc.cache.Delete(podUID, containerName) 170 return sc.storeState() 171 } 172 173 // ClearState clears the state and saves it in a checkpoint 174 func (sc *stateCheckpoint) ClearState() error { 175 sc.mux.Lock() 176 defer sc.mux.Unlock() 177 sc.cache.ClearState() 178 return sc.storeState() 179 } 180 181 type noopStateCheckpoint struct{} 182 183 // NewNoopStateCheckpoint creates a dummy state checkpoint manager 184 func NewNoopStateCheckpoint() State { 185 return &noopStateCheckpoint{} 186 } 187 188 func (sc *noopStateCheckpoint) GetContainerResourceAllocation(_ string, _ string) (v1.ResourceList, bool) { 189 return nil, false 190 } 191 192 func (sc *noopStateCheckpoint) GetPodResourceAllocation() PodResourceAllocation { 193 return nil 194 } 195 196 func (sc *noopStateCheckpoint) GetPodResizeStatus(_ string) (v1.PodResizeStatus, bool) { 197 return "", false 198 } 199 200 func (sc *noopStateCheckpoint) GetResizeStatus() PodResizeStatus { 201 return nil 202 } 203 204 func (sc *noopStateCheckpoint) SetContainerResourceAllocation(_ string, _ string, _ v1.ResourceList) error { 205 return nil 206 } 207 208 func (sc *noopStateCheckpoint) SetPodResourceAllocation(_ PodResourceAllocation) error { 209 return nil 210 } 211 212 func (sc *noopStateCheckpoint) SetPodResizeStatus(_ string, _ v1.PodResizeStatus) error { 213 return nil 214 } 215 216 func (sc *noopStateCheckpoint) SetResizeStatus(_ PodResizeStatus) error { 217 return nil 218 } 219 220 func (sc *noopStateCheckpoint) Delete(_ string, _ string) error { 221 return nil 222 } 223 224 func (sc *noopStateCheckpoint) ClearState() error { 225 return nil 226 }