github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/qrm-plugins/cpu/dynamicpolicy/state/state_checkpoint.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package state 18 19 import ( 20 "fmt" 21 "path" 22 "reflect" 23 "sync" 24 25 "k8s.io/klog/v2" 26 "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" 27 "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" 28 29 "github.com/kubewharf/katalyst-core/pkg/util/machine" 30 ) 31 32 // stateCheckpoint is an in-memory implementation of State; 33 // everytime we want to read or write states, those requests will always 34 // go to in-memory State, and then go to disk State, i.e. in write-back mode 35 type stateCheckpoint struct { 36 sync.RWMutex 37 cache State 38 policyName string 39 checkpointManager checkpointmanager.CheckpointManager 40 checkpointName string 41 // when we add new properties to checkpoint, 42 // it will cause checkpoint corruption, and we should skip it 43 skipStateCorruption bool 44 } 45 46 var _ State = &stateCheckpoint{} 47 48 func NewCheckpointState(stateDir, checkpointName, policyName string, 49 topology *machine.CPUTopology, skipStateCorruption bool, 50 ) (State, error) { 51 checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir) 52 if err != nil { 53 return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err) 54 } 55 56 sc := &stateCheckpoint{ 57 cache: NewCPUPluginState(topology), 58 policyName: policyName, 59 checkpointManager: checkpointManager, 60 checkpointName: checkpointName, 61 skipStateCorruption: skipStateCorruption, 62 } 63 64 if err := sc.restoreState(topology); err != nil { 65 return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete "+ 66 "the cpu plugin checkpoint file %q before restarting Kubelet", err, path.Join(stateDir, checkpointName)) 67 } 68 return sc, nil 69 } 70 71 func (sc *stateCheckpoint) restoreState(topology *machine.CPUTopology) error { 72 sc.Lock() 73 defer sc.Unlock() 74 var err error 75 var foundAndSkippedStateCorruption bool 76 77 checkpoint := NewCPUPluginCheckpoint() 78 if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpoint); err != nil { 79 if err == errors.ErrCheckpointNotFound { 80 return sc.storeState() 81 } else if err == errors.ErrCorruptCheckpoint { 82 if !sc.skipStateCorruption { 83 return err 84 } 85 86 foundAndSkippedStateCorruption = true 87 klog.Warningf("[cpu_plugin] restore checkpoint failed with err: %s, but we skip it", err) 88 } else { 89 return err 90 } 91 } 92 93 if sc.policyName != checkpoint.PolicyName && !sc.skipStateCorruption { 94 return fmt.Errorf("[cpu_plugin] configured policy %q differs from state checkpoint policy %q", sc.policyName, checkpoint.PolicyName) 95 } 96 97 generatedMachineState, err := GenerateMachineStateFromPodEntries(topology, checkpoint.PodEntries, sc.policyName) 98 if err != nil { 99 return fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) 100 } 101 102 sc.cache.SetMachineState(generatedMachineState) 103 sc.cache.SetPodEntries(checkpoint.PodEntries) 104 105 if !reflect.DeepEqual(generatedMachineState, checkpoint.MachineState) { 106 klog.Warningf("[cpu_plugin] machine state changed: generatedMachineState: %s; checkpointMachineState: %s", 107 generatedMachineState.String(), checkpoint.MachineState.String()) 108 err = sc.storeState() 109 if err != nil { 110 return fmt.Errorf("storeState when machine state changed failed with error: %v", err) 111 } 112 } 113 114 if foundAndSkippedStateCorruption { 115 klog.Infof("[cpu_plugin] found and skipped state corruption, we should store to rectify the checksum") 116 err = sc.storeState() 117 if err != nil { 118 return fmt.Errorf("storeState failed with error: %v", err) 119 } 120 } 121 122 klog.InfoS("[cpu_plugin] State checkpoint: restored state from checkpoint") 123 return nil 124 } 125 126 func (sc *stateCheckpoint) storeState() error { 127 checkpoint := NewCPUPluginCheckpoint() 128 checkpoint.PolicyName = sc.policyName 129 checkpoint.MachineState = sc.cache.GetMachineState() 130 checkpoint.PodEntries = sc.cache.GetPodEntries() 131 132 err := sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint) 133 if err != nil { 134 klog.ErrorS(err, "Could not save checkpoint") 135 return err 136 } 137 return nil 138 } 139 140 func (sc *stateCheckpoint) GetMachineState() NUMANodeMap { 141 sc.RLock() 142 defer sc.RUnlock() 143 144 return sc.cache.GetMachineState() 145 } 146 147 func (sc *stateCheckpoint) GetAllocationInfo(podUID string, containerName string) *AllocationInfo { 148 sc.RLock() 149 defer sc.RUnlock() 150 151 return sc.cache.GetAllocationInfo(podUID, containerName) 152 } 153 154 func (sc *stateCheckpoint) GetPodEntries() PodEntries { 155 sc.RLock() 156 defer sc.RUnlock() 157 158 return sc.cache.GetPodEntries() 159 } 160 161 func (sc *stateCheckpoint) SetMachineState(numaNodeMap NUMANodeMap) { 162 sc.Lock() 163 defer sc.Unlock() 164 165 sc.cache.SetMachineState(numaNodeMap) 166 err := sc.storeState() 167 if err != nil { 168 klog.ErrorS(err, "[cpu_plugin] store machineState to checkpoint error") 169 } 170 } 171 172 func (sc *stateCheckpoint) SetAllocationInfo(podUID string, containerName string, allocationInfo *AllocationInfo) { 173 sc.Lock() 174 defer sc.Unlock() 175 176 sc.cache.SetAllocationInfo(podUID, containerName, allocationInfo) 177 err := sc.storeState() 178 if err != nil { 179 klog.ErrorS(err, "[cpu_plugin] store allocationInfo to checkpoint error") 180 } 181 } 182 183 func (sc *stateCheckpoint) SetPodEntries(podEntries PodEntries) { 184 sc.Lock() 185 defer sc.Unlock() 186 187 sc.cache.SetPodEntries(podEntries) 188 err := sc.storeState() 189 if err != nil { 190 klog.ErrorS(err, "[cpu_plugin] store pod entries to checkpoint error", "err") 191 } 192 } 193 194 func (sc *stateCheckpoint) Delete(podUID string, containerName string) { 195 sc.Lock() 196 defer sc.Unlock() 197 198 sc.cache.Delete(podUID, containerName) 199 err := sc.storeState() 200 if err != nil { 201 klog.ErrorS(err, "[cpu_plugin] store state after delete operation to checkpoint error") 202 } 203 } 204 205 func (sc *stateCheckpoint) ClearState() { 206 sc.Lock() 207 defer sc.Unlock() 208 209 sc.cache.ClearState() 210 err := sc.storeState() 211 if err != nil { 212 klog.ErrorS(err, "[cpu_plugin] store state after clear operation to checkpoint error") 213 } 214 }