github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/qrm-plugins/network/state/state_checkpoint.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package state
    18  
    19  import (
    20  	"fmt"
    21  	"path"
    22  	"reflect"
    23  	"sync"
    24  
    25  	info "github.com/google/cadvisor/info/v1"
    26  
    27  	"github.com/kubewharf/katalyst-core/pkg/config/agent/qrm"
    28  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    29  	"github.com/kubewharf/katalyst-core/pkg/util/machine"
    30  
    31  	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
    32  	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
    33  )
    34  
    35  var (
    36  	_          State          = &stateCheckpoint{}
    37  	generalLog general.Logger = general.LoggerWithPrefix("network_plugin", general.LoggingPKGFull)
    38  )
    39  
    40  // stateCheckpoint is an in-memory implementation of State;
    41  // everytime we want to read or write states, those requests will always
    42  // go to in-memory State, and then go to disk State, i.e. in write-back mode
    43  type stateCheckpoint struct {
    44  	sync.RWMutex
    45  	cache             State
    46  	policyName        string
    47  	checkpointManager checkpointmanager.CheckpointManager
    48  	checkpointName    string
    49  	// when we add new properties to checkpoint,
    50  	// it will cause checkpoint corruption and we should skip it
    51  	skipStateCorruption bool
    52  }
    53  
    54  func NewCheckpointState(conf *qrm.QRMPluginsConfiguration, stateDir, checkpointName, policyName string,
    55  	machineInfo *info.MachineInfo, nics []machine.InterfaceInfo, reservedBandwidth map[string]uint32,
    56  	skipStateCorruption bool,
    57  ) (State, error) {
    58  	checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir)
    59  	if err != nil {
    60  		return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
    61  	}
    62  
    63  	defaultCache, err := NewNetworkPluginState(conf, machineInfo, nics, reservedBandwidth)
    64  	if err != nil {
    65  		return nil, fmt.Errorf("NewNetworkPluginState failed with error: %v", err)
    66  	}
    67  
    68  	stateCheckpoint := &stateCheckpoint{
    69  		cache:               defaultCache,
    70  		policyName:          policyName,
    71  		checkpointManager:   checkpointManager,
    72  		checkpointName:      checkpointName,
    73  		skipStateCorruption: skipStateCorruption,
    74  	}
    75  
    76  	if err := stateCheckpoint.restoreState(conf, nics, reservedBandwidth); err != nil {
    77  		return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete the network plugin checkpoint file %q before restarting Kubelet",
    78  			err, path.Join(stateDir, checkpointName))
    79  	}
    80  
    81  	return stateCheckpoint, nil
    82  }
    83  
    84  func (sc *stateCheckpoint) restoreState(conf *qrm.QRMPluginsConfiguration, nics []machine.InterfaceInfo, reservedBandwidth map[string]uint32) error {
    85  	sc.Lock()
    86  	defer sc.Unlock()
    87  	var err error
    88  	var foundAndSkippedStateCorruption bool
    89  
    90  	checkpoint := NewNetworkPluginCheckpoint()
    91  	if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpoint); err != nil {
    92  		if err == errors.ErrCheckpointNotFound {
    93  			return sc.storeState()
    94  		} else if err == errors.ErrCorruptCheckpoint {
    95  			if !sc.skipStateCorruption {
    96  				return err
    97  			}
    98  
    99  			foundAndSkippedStateCorruption = true
   100  			generalLog.Infof("restore checkpoint failed with err: %s, but we skip it", err)
   101  		} else {
   102  			return err
   103  		}
   104  	}
   105  
   106  	if sc.policyName != checkpoint.PolicyName && !sc.skipStateCorruption {
   107  		return fmt.Errorf("[network_plugin] configured policy %q differs from state checkpoint policy %q", sc.policyName, checkpoint.PolicyName)
   108  	}
   109  
   110  	generatedNetworkState, err := GenerateMachineStateFromPodEntries(conf, nics, checkpoint.PodEntries, reservedBandwidth)
   111  	if err != nil {
   112  		return fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
   113  	}
   114  
   115  	sc.cache.SetMachineState(generatedNetworkState)
   116  	sc.cache.SetPodEntries(checkpoint.PodEntries)
   117  
   118  	if !reflect.DeepEqual(generatedNetworkState, checkpoint.MachineState) {
   119  		generalLog.Warningf("machine state changed: "+
   120  			"generatedNetworkState: %s; checkpointMachineState: %s",
   121  			generatedNetworkState.String(), checkpoint.MachineState.String())
   122  
   123  		err = sc.storeState()
   124  		if err != nil {
   125  			return fmt.Errorf("storeState when machine state changed failed with error: %v", err)
   126  		}
   127  	}
   128  
   129  	if foundAndSkippedStateCorruption {
   130  		generalLog.Infof("found and skipped state corruption, we shoud store to rectify the checksum")
   131  
   132  		err = sc.storeState()
   133  		if err != nil {
   134  			return fmt.Errorf("storeState failed with error: %v", err)
   135  		}
   136  	}
   137  
   138  	generalLog.InfoS("state checkpoint: restored state from checkpoint")
   139  
   140  	return nil
   141  }
   142  
   143  func (sc *stateCheckpoint) storeState() error {
   144  	checkpoint := NewNetworkPluginCheckpoint()
   145  	checkpoint.PolicyName = sc.policyName
   146  	checkpoint.MachineState = sc.cache.GetMachineState()
   147  	checkpoint.PodEntries = sc.cache.GetPodEntries()
   148  
   149  	err := sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint)
   150  	if err != nil {
   151  		generalLog.ErrorS(err, "could not save checkpoint")
   152  		return err
   153  	}
   154  	return nil
   155  }
   156  
   157  func (sc *stateCheckpoint) GetReservedBandwidth() map[string]uint32 {
   158  	sc.RLock()
   159  	defer sc.RUnlock()
   160  
   161  	return sc.cache.GetReservedBandwidth()
   162  }
   163  
   164  func (sc *stateCheckpoint) GetMachineInfo() *info.MachineInfo {
   165  	sc.RLock()
   166  	defer sc.RUnlock()
   167  
   168  	return sc.cache.GetMachineInfo()
   169  }
   170  
   171  func (sc *stateCheckpoint) GetEnabledNICs() []machine.InterfaceInfo {
   172  	sc.RLock()
   173  	defer sc.RUnlock()
   174  
   175  	return sc.cache.GetEnabledNICs()
   176  }
   177  
   178  func (sc *stateCheckpoint) GetMachineState() NICMap {
   179  	sc.RLock()
   180  	defer sc.RUnlock()
   181  
   182  	return sc.cache.GetMachineState()
   183  }
   184  
   185  func (sc *stateCheckpoint) GetAllocationInfo(podUID, containerName string) *AllocationInfo {
   186  	sc.RLock()
   187  	defer sc.RUnlock()
   188  
   189  	return sc.cache.GetAllocationInfo(podUID, containerName)
   190  }
   191  
   192  func (sc *stateCheckpoint) GetPodEntries() PodEntries {
   193  	sc.RLock()
   194  	defer sc.RUnlock()
   195  
   196  	return sc.cache.GetPodEntries()
   197  }
   198  
   199  func (sc *stateCheckpoint) SetMachineState(nicMap NICMap) {
   200  	sc.Lock()
   201  	defer sc.Unlock()
   202  
   203  	sc.cache.SetMachineState(nicMap)
   204  	err := sc.storeState()
   205  	if err != nil {
   206  		generalLog.ErrorS(err, "store machineState to checkpoint error")
   207  	}
   208  }
   209  
   210  func (sc *stateCheckpoint) SetAllocationInfo(podUID, containerName string, allocationInfo *AllocationInfo) {
   211  	sc.Lock()
   212  	defer sc.Unlock()
   213  
   214  	sc.cache.SetAllocationInfo(podUID, containerName, allocationInfo)
   215  	err := sc.storeState()
   216  	if err != nil {
   217  		generalLog.ErrorS(err, "store allocationInfo to checkpoint error")
   218  	}
   219  }
   220  
   221  func (sc *stateCheckpoint) SetPodEntries(podEntries PodEntries) {
   222  	sc.Lock()
   223  	defer sc.Unlock()
   224  
   225  	sc.cache.SetPodEntries(podEntries)
   226  	err := sc.storeState()
   227  	if err != nil {
   228  		generalLog.ErrorS(err, "store pod entries to checkpoint error", "err")
   229  	}
   230  }
   231  
   232  func (sc *stateCheckpoint) Delete(podUID, containerName string) {
   233  	sc.Lock()
   234  	defer sc.Unlock()
   235  
   236  	sc.cache.Delete(podUID, containerName)
   237  	err := sc.storeState()
   238  	if err != nil {
   239  		generalLog.ErrorS(err, "store state after delete operation to checkpoint error")
   240  	}
   241  }
   242  
   243  func (sc *stateCheckpoint) ClearState() {
   244  	sc.Lock()
   245  	defer sc.Unlock()
   246  
   247  	sc.cache.ClearState()
   248  	err := sc.storeState()
   249  	if err != nil {
   250  		generalLog.ErrorS(err, "store state after clear operation to checkpoint error")
   251  	}
   252  }