github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/qrm-plugins/network/state/state_checkpoint.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package state 18 19 import ( 20 "fmt" 21 "path" 22 "reflect" 23 "sync" 24 25 info "github.com/google/cadvisor/info/v1" 26 27 "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm" 28 "github.com/kubewharf/katalyst-core/pkg/util/general" 29 "github.com/kubewharf/katalyst-core/pkg/util/machine" 30 31 "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" 32 "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" 33 ) 34 35 var ( 36 _ State = &stateCheckpoint{} 37 generalLog general.Logger = general.LoggerWithPrefix("network_plugin", general.LoggingPKGFull) 38 ) 39 40 // stateCheckpoint is an in-memory implementation of State; 41 // everytime we want to read or write states, those requests will always 42 // go to in-memory State, and then go to disk State, i.e. in write-back mode 43 type stateCheckpoint struct { 44 sync.RWMutex 45 cache State 46 policyName string 47 checkpointManager checkpointmanager.CheckpointManager 48 checkpointName string 49 // when we add new properties to checkpoint, 50 // it will cause checkpoint corruption and we should skip it 51 skipStateCorruption bool 52 } 53 54 func NewCheckpointState(conf *qrm.QRMPluginsConfiguration, stateDir, checkpointName, policyName string, 55 machineInfo *info.MachineInfo, nics []machine.InterfaceInfo, reservedBandwidth map[string]uint32, 56 skipStateCorruption bool, 57 ) (State, error) { 58 checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir) 59 if err != nil { 60 return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err) 61 } 62 63 defaultCache, err := NewNetworkPluginState(conf, machineInfo, nics, reservedBandwidth) 64 if err != nil { 65 return nil, fmt.Errorf("NewNetworkPluginState failed with error: %v", err) 66 } 67 68 stateCheckpoint := &stateCheckpoint{ 69 cache: defaultCache, 70 policyName: policyName, 71 checkpointManager: checkpointManager, 72 checkpointName: checkpointName, 73 skipStateCorruption: skipStateCorruption, 74 } 75 76 if err := stateCheckpoint.restoreState(conf, nics, reservedBandwidth); err != nil { 77 return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete the network plugin checkpoint file %q before restarting Kubelet", 78 err, path.Join(stateDir, checkpointName)) 79 } 80 81 return stateCheckpoint, nil 82 } 83 84 func (sc *stateCheckpoint) restoreState(conf *qrm.QRMPluginsConfiguration, nics []machine.InterfaceInfo, reservedBandwidth map[string]uint32) error { 85 sc.Lock() 86 defer sc.Unlock() 87 var err error 88 var foundAndSkippedStateCorruption bool 89 90 checkpoint := NewNetworkPluginCheckpoint() 91 if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpoint); err != nil { 92 if err == errors.ErrCheckpointNotFound { 93 return sc.storeState() 94 } else if err == errors.ErrCorruptCheckpoint { 95 if !sc.skipStateCorruption { 96 return err 97 } 98 99 foundAndSkippedStateCorruption = true 100 generalLog.Infof("restore checkpoint failed with err: %s, but we skip it", err) 101 } else { 102 return err 103 } 104 } 105 106 if sc.policyName != checkpoint.PolicyName && !sc.skipStateCorruption { 107 return fmt.Errorf("[network_plugin] configured policy %q differs from state checkpoint policy %q", sc.policyName, checkpoint.PolicyName) 108 } 109 110 generatedNetworkState, err := GenerateMachineStateFromPodEntries(conf, nics, checkpoint.PodEntries, reservedBandwidth) 111 if err != nil { 112 return fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err) 113 } 114 115 sc.cache.SetMachineState(generatedNetworkState) 116 sc.cache.SetPodEntries(checkpoint.PodEntries) 117 118 if !reflect.DeepEqual(generatedNetworkState, checkpoint.MachineState) { 119 generalLog.Warningf("machine state changed: "+ 120 "generatedNetworkState: %s; checkpointMachineState: %s", 121 generatedNetworkState.String(), checkpoint.MachineState.String()) 122 123 err = sc.storeState() 124 if err != nil { 125 return fmt.Errorf("storeState when machine state changed failed with error: %v", err) 126 } 127 } 128 129 if foundAndSkippedStateCorruption { 130 generalLog.Infof("found and skipped state corruption, we shoud store to rectify the checksum") 131 132 err = sc.storeState() 133 if err != nil { 134 return fmt.Errorf("storeState failed with error: %v", err) 135 } 136 } 137 138 generalLog.InfoS("state checkpoint: restored state from checkpoint") 139 140 return nil 141 } 142 143 func (sc *stateCheckpoint) storeState() error { 144 checkpoint := NewNetworkPluginCheckpoint() 145 checkpoint.PolicyName = sc.policyName 146 checkpoint.MachineState = sc.cache.GetMachineState() 147 checkpoint.PodEntries = sc.cache.GetPodEntries() 148 149 err := sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint) 150 if err != nil { 151 generalLog.ErrorS(err, "could not save checkpoint") 152 return err 153 } 154 return nil 155 } 156 157 func (sc *stateCheckpoint) GetReservedBandwidth() map[string]uint32 { 158 sc.RLock() 159 defer sc.RUnlock() 160 161 return sc.cache.GetReservedBandwidth() 162 } 163 164 func (sc *stateCheckpoint) GetMachineInfo() *info.MachineInfo { 165 sc.RLock() 166 defer sc.RUnlock() 167 168 return sc.cache.GetMachineInfo() 169 } 170 171 func (sc *stateCheckpoint) GetEnabledNICs() []machine.InterfaceInfo { 172 sc.RLock() 173 defer sc.RUnlock() 174 175 return sc.cache.GetEnabledNICs() 176 } 177 178 func (sc *stateCheckpoint) GetMachineState() NICMap { 179 sc.RLock() 180 defer sc.RUnlock() 181 182 return sc.cache.GetMachineState() 183 } 184 185 func (sc *stateCheckpoint) GetAllocationInfo(podUID, containerName string) *AllocationInfo { 186 sc.RLock() 187 defer sc.RUnlock() 188 189 return sc.cache.GetAllocationInfo(podUID, containerName) 190 } 191 192 func (sc *stateCheckpoint) GetPodEntries() PodEntries { 193 sc.RLock() 194 defer sc.RUnlock() 195 196 return sc.cache.GetPodEntries() 197 } 198 199 func (sc *stateCheckpoint) SetMachineState(nicMap NICMap) { 200 sc.Lock() 201 defer sc.Unlock() 202 203 sc.cache.SetMachineState(nicMap) 204 err := sc.storeState() 205 if err != nil { 206 generalLog.ErrorS(err, "store machineState to checkpoint error") 207 } 208 } 209 210 func (sc *stateCheckpoint) SetAllocationInfo(podUID, containerName string, allocationInfo *AllocationInfo) { 211 sc.Lock() 212 defer sc.Unlock() 213 214 sc.cache.SetAllocationInfo(podUID, containerName, allocationInfo) 215 err := sc.storeState() 216 if err != nil { 217 generalLog.ErrorS(err, "store allocationInfo to checkpoint error") 218 } 219 } 220 221 func (sc *stateCheckpoint) SetPodEntries(podEntries PodEntries) { 222 sc.Lock() 223 defer sc.Unlock() 224 225 sc.cache.SetPodEntries(podEntries) 226 err := sc.storeState() 227 if err != nil { 228 generalLog.ErrorS(err, "store pod entries to checkpoint error", "err") 229 } 230 } 231 232 func (sc *stateCheckpoint) Delete(podUID, containerName string) { 233 sc.Lock() 234 defer sc.Unlock() 235 236 sc.cache.Delete(podUID, containerName) 237 err := sc.storeState() 238 if err != nil { 239 generalLog.ErrorS(err, "store state after delete operation to checkpoint error") 240 } 241 } 242 243 func (sc *stateCheckpoint) ClearState() { 244 sc.Lock() 245 defer sc.Unlock() 246 247 sc.cache.ClearState() 248 err := sc.storeState() 249 if err != nil { 250 generalLog.ErrorS(err, "store state after clear operation to checkpoint error") 251 } 252 }