github.com/kubewharf/katalyst-core@v0.5.3/pkg/metaserver/spd/cache.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package spd
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	"k8s.io/apimachinery/pkg/util/wait"
    26  	"k8s.io/klog/v2"
    27  	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
    28  
    29  	workloadapis "github.com/kubewharf/katalyst-api/pkg/apis/workload/v1alpha1"
    30  	"github.com/kubewharf/katalyst-core/pkg/metaserver/spd/checkpoint"
    31  	"github.com/kubewharf/katalyst-core/pkg/util"
    32  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    33  )
    34  
// spdInfo is the per-key cache entry: the cached spd itself plus the
// bookkeeping timestamps and counters used to schedule remote fetches
// and to garbage-collect entries nobody reads anymore.
type spdInfo struct {
	// lastFetchRemoteTime records the timestamp of the last attempt to fetch
	// the remote spd, not the actual fetch
	lastFetchRemoteTime time.Time

	// penaltyForFetchingRemoteTime is the extra delay applied before the next
	// remote fetch once the spd has been reported deleted; it grows on each
	// DeleteSPD call and is reset to zero by SetSPD
	penaltyForFetchingRemoteTime time.Duration

	// retryCount counts how many times DeleteSPD has increased the penalty
	// for this key; it is reset to zero by SetSPD
	retryCount int64

	// lastGetTime records the timestamp of the last time GetSPD was called
	// (with updateLastGetTime set) for this key, which is used to gc the
	// spd cache
	lastGetTime time.Time

	// spd is the cached target spd; it is nil until SetSPD or restore
	// populates it
	spd *workloadapis.ServiceProfileDescriptor
}
    53  
// Cache is the in-memory spd cache. Entries are keyed by spd key, are
// persisted through the checkpoint manager, and are protected by the
// embedded RWMutex.
type Cache struct {
	sync.RWMutex

	// skipCorruptionError is forwarded to checkpoint.LoadSPDs when the cache
	// is restored from disk
	skipCorruptionError bool
	// expiredTime is the idle duration (since lastGetTime) after which an
	// entry is cleared, the period of the clearing loop, and the penalty
	// applied once maxRetryCount is reached
	expiredTime time.Duration
	// cacheTTL is the base interval between remote fetches; it is jittered
	// with jitterFactor before use
	cacheTTL time.Duration
	// jitterFactor is the factor passed to wait.Jitter together with cacheTTL
	jitterFactor float64
	// maxRetryCount bounds how many times DeleteSPD increases the penalty
	// incrementally before falling back to expiredTime
	maxRetryCount int64

	// manager is the checkpoint manager used to load, write and delete spd
	// checkpoints
	manager checkpointmanager.CheckpointManager
	// spdInfo maps an spd key to its cache entry
	spdInfo map[string]*spdInfo
}
    67  
    68  func NewSPDCache(manager checkpointmanager.CheckpointManager, skipCorruptionError bool,
    69  	cacheTTL, expiredTime time.Duration, maxRetryCount int64, jitterFactor float64,
    70  ) (*Cache, error) {
    71  	cache := &Cache{
    72  		spdInfo:             map[string]*spdInfo{},
    73  		manager:             manager,
    74  		skipCorruptionError: skipCorruptionError,
    75  		expiredTime:         expiredTime,
    76  		cacheTTL:            cacheTTL,
    77  		jitterFactor:        jitterFactor,
    78  		maxRetryCount:       maxRetryCount,
    79  	}
    80  
    81  	err := cache.restore()
    82  	if err != nil {
    83  		klog.Errorf("restore spd from local disk failed, %v", err)
    84  		return nil, err
    85  	}
    86  
    87  	return cache, nil
    88  }
    89  
    90  // SetLastFetchRemoteTime set last fetch remote spd timestamp
    91  func (s *Cache) SetLastFetchRemoteTime(key string, t time.Time) {
    92  	s.Lock()
    93  	defer s.Unlock()
    94  
    95  	s.initSPDInfoWithoutLock(key)
    96  	s.spdInfo[key].lastFetchRemoteTime = t
    97  }
    98  
    99  // GetNextFetchRemoteTime get next fetch remote spd timestamp
   100  func (s *Cache) GetNextFetchRemoteTime(key string, now time.Time, skipBackoff bool) time.Time {
   101  	s.RLock()
   102  	defer s.RUnlock()
   103  
   104  	info, ok := s.spdInfo[key]
   105  	if ok && info != nil {
   106  		if !skipBackoff && info.penaltyForFetchingRemoteTime > 0 {
   107  			return info.lastFetchRemoteTime.Add(info.penaltyForFetchingRemoteTime)
   108  		}
   109  
   110  		nextFetchRemoteTime := info.lastFetchRemoteTime
   111  		// to avoid burst remote request when lastFetchRemoteTime is too old, add some random
   112  		if !info.lastFetchRemoteTime.IsZero() && info.lastFetchRemoteTime.Add(time.Duration((1+s.jitterFactor)*float64(s.cacheTTL))).Before(now) {
   113  			nextFetchRemoteTime = now.Add(-wait.Jitter(s.cacheTTL, s.jitterFactor))
   114  		}
   115  
   116  		nextFetchRemoteTime = nextFetchRemoteTime.Add(wait.Jitter(s.cacheTTL, s.jitterFactor))
   117  		// If no one tries to get this spd for a long time, a penalty from lastGetTime to lastFetchRemoteTime will be added,
   118  		// which will linearly increase the period of accessing the remote, thereby reducing the frequency of accessing the api-server
   119  		if !skipBackoff && info.lastFetchRemoteTime.After(info.lastGetTime) {
   120  			nextFetchRemoteTime = nextFetchRemoteTime.Add(info.lastFetchRemoteTime.Sub(info.lastGetTime))
   121  		}
   122  
   123  		return nextFetchRemoteTime
   124  	}
   125  
   126  	return time.Time{}
   127  }
   128  
   129  // ListAllSPDKeys list all spd key
   130  func (s *Cache) ListAllSPDKeys() []string {
   131  	s.RLock()
   132  	defer s.RUnlock()
   133  
   134  	spdKeys := make([]string, 0, len(s.spdInfo))
   135  	for key := range s.spdInfo {
   136  		spdKeys = append(spdKeys, key)
   137  	}
   138  
   139  	return spdKeys
   140  }
   141  
   142  // SetSPD set target spd to cache and checkpoint
   143  func (s *Cache) SetSPD(key string, spd *workloadapis.ServiceProfileDescriptor) error {
   144  	s.Lock()
   145  	defer s.Unlock()
   146  
   147  	// if current spd hash is empty, calculate and set it
   148  	if util.GetSPDHash(spd) == "" {
   149  		hash, err := util.CalculateSPDHash(spd)
   150  		if err != nil {
   151  			return err
   152  		}
   153  		util.SetSPDHash(spd, hash)
   154  	}
   155  
   156  	s.initSPDInfoWithoutLock(key)
   157  	util.SetLastFetchTime(spd, s.spdInfo[key].lastFetchRemoteTime)
   158  	err := checkpoint.WriteSPD(s.manager, spd)
   159  	if err != nil {
   160  		return err
   161  	}
   162  
   163  	s.spdInfo[key].spd = spd
   164  	s.spdInfo[key].penaltyForFetchingRemoteTime = 0
   165  	s.spdInfo[key].retryCount = 0
   166  	return nil
   167  }
   168  
   169  // DeleteSPD delete target spd by namespace/name key
   170  func (s *Cache) DeleteSPD(key string) error {
   171  	s.Lock()
   172  	defer s.Unlock()
   173  
   174  	info, ok := s.spdInfo[key]
   175  	if ok && info != nil {
   176  		// update the penalty of fetching remote spd if it was already deleted
   177  		if info.retryCount < s.maxRetryCount {
   178  			info.retryCount += 1
   179  			info.penaltyForFetchingRemoteTime += wait.Jitter(s.cacheTTL, s.jitterFactor)
   180  		} else {
   181  			info.penaltyForFetchingRemoteTime = s.expiredTime
   182  		}
   183  
   184  		err := checkpoint.DeleteSPD(s.manager, info.spd)
   185  		if err != nil {
   186  			return err
   187  		}
   188  	}
   189  
   190  	return nil
   191  }
   192  
   193  // GetSPD gets target spd by namespace/name key
   194  func (s *Cache) GetSPD(key string, updateLastGetTime bool) *workloadapis.ServiceProfileDescriptor {
   195  	s.Lock()
   196  	defer s.Unlock()
   197  
   198  	s.initSPDInfoWithoutLock(key)
   199  
   200  	if updateLastGetTime {
   201  		// update last get spd time
   202  		s.spdInfo[key].lastGetTime = time.Now()
   203  	}
   204  
   205  	info, ok := s.spdInfo[key]
   206  	if ok && info != nil {
   207  		return info.spd
   208  	}
   209  
   210  	return nil
   211  }
   212  
   213  // Run to clear local unused spd
   214  func (s *Cache) Run(ctx context.Context) {
   215  	// sleep with cacheTTL to wait the last get time update
   216  	// for each spd
   217  	time.Sleep(s.cacheTTL)
   218  	wait.UntilWithContext(ctx, s.clearUnusedSPDs, s.expiredTime)
   219  }
   220  
   221  // restore all spd from disk at startup
   222  func (s *Cache) restore() error {
   223  	s.Lock()
   224  	defer s.Unlock()
   225  
   226  	spdList, err := checkpoint.LoadSPDs(s.manager, s.skipCorruptionError)
   227  	if err != nil {
   228  		return fmt.Errorf("restore spd failed: %v", err)
   229  	}
   230  
   231  	for _, spd := range spdList {
   232  		if spd == nil {
   233  			continue
   234  		}
   235  		key := native.GenerateUniqObjectNameKey(spd)
   236  		s.initSPDInfoWithoutLock(key)
   237  		s.spdInfo[key].spd = spd
   238  		s.spdInfo[key].lastFetchRemoteTime = util.GetLastFetchTime(spd)
   239  		klog.Infof("restore spd cache %s: %+v", key, s.spdInfo[key])
   240  	}
   241  
   242  	return nil
   243  }
   244  
   245  // clearUnusedSPDs is to clear unused spd according to its lastGetSPDTime
   246  func (s *Cache) clearUnusedSPDs(_ context.Context) {
   247  	s.Lock()
   248  	defer s.Unlock()
   249  
   250  	now := time.Now()
   251  	for key, info := range s.spdInfo {
   252  		if info != nil && info.lastGetTime.Add(s.expiredTime).Before(now) {
   253  			err := checkpoint.DeleteSPD(s.manager, info.spd)
   254  			if err != nil {
   255  				klog.Errorf("clear unused spd %s failed: %v", key, err)
   256  				continue
   257  			}
   258  			delete(s.spdInfo, key)
   259  			klog.Infof("clear spd cache %s", key)
   260  		}
   261  	}
   262  }
   263  
   264  func (s *Cache) initSPDInfoWithoutLock(key string) {
   265  	info, ok := s.spdInfo[key]
   266  	if !ok || info == nil {
   267  		s.spdInfo[key] = &spdInfo{}
   268  	}
   269  }