github.com/kubewharf/katalyst-core@v0.5.3/pkg/metaserver/spd/cache.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package spd 18 19 import ( 20 "context" 21 "fmt" 22 "sync" 23 "time" 24 25 "k8s.io/apimachinery/pkg/util/wait" 26 "k8s.io/klog/v2" 27 "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" 28 29 workloadapis "github.com/kubewharf/katalyst-api/pkg/apis/workload/v1alpha1" 30 "github.com/kubewharf/katalyst-core/pkg/metaserver/spd/checkpoint" 31 "github.com/kubewharf/katalyst-core/pkg/util" 32 "github.com/kubewharf/katalyst-core/pkg/util/native" 33 ) 34 35 type spdInfo struct { 36 // lastFetchRemoteTime records the timestamp of the last attempt to fetch 37 // the remote spd, not the actual fetch 38 lastFetchRemoteTime time.Time 39 40 // penaltyForFetchingRemoteTime records the penalty of fetching remote spd if it was deleted 41 penaltyForFetchingRemoteTime time.Duration 42 43 // retryCount records the count of fetching remote deleted spd 44 retryCount int64 45 46 // lastGetTime records the timestamp of the last time GetSPD was called to 47 // get spd, which is used for gc spd cache 48 lastGetTime time.Time 49 50 // spd is target spd 51 spd *workloadapis.ServiceProfileDescriptor 52 } 53 54 // Cache is spd cache stores current 55 type Cache struct { 56 sync.RWMutex 57 58 skipCorruptionError bool 59 expiredTime time.Duration 60 cacheTTL time.Duration 61 jitterFactor float64 62 maxRetryCount int64 63 64 manager checkpointmanager.CheckpointManager 65 spdInfo map[string]*spdInfo 66 } 67 68 func NewSPDCache(manager checkpointmanager.CheckpointManager, skipCorruptionError bool, 69 cacheTTL, expiredTime time.Duration, maxRetryCount int64, jitterFactor float64, 70 ) (*Cache, error) { 71 cache := &Cache{ 72 spdInfo: map[string]*spdInfo{}, 73 manager: manager, 74 skipCorruptionError: skipCorruptionError, 75 expiredTime: expiredTime, 76 cacheTTL: cacheTTL, 77 jitterFactor: jitterFactor, 78 maxRetryCount: maxRetryCount, 79 } 80 81 err := cache.restore() 82 if err != nil { 83 klog.Errorf("restore spd from local disk failed, %v", err) 84 return nil, err 85 } 86 87 return cache, nil 88 } 89 90 // SetLastFetchRemoteTime set last fetch remote spd timestamp 91 func (s *Cache) SetLastFetchRemoteTime(key string, t time.Time) { 92 s.Lock() 93 defer s.Unlock() 94 95 s.initSPDInfoWithoutLock(key) 96 s.spdInfo[key].lastFetchRemoteTime = t 97 } 98 99 // GetNextFetchRemoteTime get next fetch remote spd timestamp 100 func (s *Cache) GetNextFetchRemoteTime(key string, now time.Time, skipBackoff bool) time.Time { 101 s.RLock() 102 defer s.RUnlock() 103 104 info, ok := s.spdInfo[key] 105 if ok && info != nil { 106 if !skipBackoff && info.penaltyForFetchingRemoteTime > 0 { 107 return info.lastFetchRemoteTime.Add(info.penaltyForFetchingRemoteTime) 108 } 109 110 nextFetchRemoteTime := info.lastFetchRemoteTime 111 // to avoid burst remote request when lastFetchRemoteTime is too old, add some random 112 if !info.lastFetchRemoteTime.IsZero() && info.lastFetchRemoteTime.Add(time.Duration((1+s.jitterFactor)*float64(s.cacheTTL))).Before(now) { 113 nextFetchRemoteTime = now.Add(-wait.Jitter(s.cacheTTL, s.jitterFactor)) 114 } 115 116 nextFetchRemoteTime = nextFetchRemoteTime.Add(wait.Jitter(s.cacheTTL, s.jitterFactor)) 117 // If no one tries to get this spd for a long time, a penalty from lastGetTime to lastFetchRemoteTime will be added, 118 // which will linearly increase the period of accessing the remote, thereby reducing the frequency of accessing the api-server 119 if !skipBackoff && info.lastFetchRemoteTime.After(info.lastGetTime) { 120 nextFetchRemoteTime = nextFetchRemoteTime.Add(info.lastFetchRemoteTime.Sub(info.lastGetTime)) 121 } 122 123 return nextFetchRemoteTime 124 } 125 126 return time.Time{} 127 } 128 129 // ListAllSPDKeys list all spd key 130 func (s *Cache) ListAllSPDKeys() []string { 131 s.RLock() 132 defer s.RUnlock() 133 134 spdKeys := make([]string, 0, len(s.spdInfo)) 135 for key := range s.spdInfo { 136 spdKeys = append(spdKeys, key) 137 } 138 139 return spdKeys 140 } 141 142 // SetSPD set target spd to cache and checkpoint 143 func (s *Cache) SetSPD(key string, spd *workloadapis.ServiceProfileDescriptor) error { 144 s.Lock() 145 defer s.Unlock() 146 147 // if current spd hash is empty, calculate and set it 148 if util.GetSPDHash(spd) == "" { 149 hash, err := util.CalculateSPDHash(spd) 150 if err != nil { 151 return err 152 } 153 util.SetSPDHash(spd, hash) 154 } 155 156 s.initSPDInfoWithoutLock(key) 157 util.SetLastFetchTime(spd, s.spdInfo[key].lastFetchRemoteTime) 158 err := checkpoint.WriteSPD(s.manager, spd) 159 if err != nil { 160 return err 161 } 162 163 s.spdInfo[key].spd = spd 164 s.spdInfo[key].penaltyForFetchingRemoteTime = 0 165 s.spdInfo[key].retryCount = 0 166 return nil 167 } 168 169 // DeleteSPD delete target spd by namespace/name key 170 func (s *Cache) DeleteSPD(key string) error { 171 s.Lock() 172 defer s.Unlock() 173 174 info, ok := s.spdInfo[key] 175 if ok && info != nil { 176 // update the penalty of fetching remote spd if it was already deleted 177 if info.retryCount < s.maxRetryCount { 178 info.retryCount += 1 179 info.penaltyForFetchingRemoteTime += wait.Jitter(s.cacheTTL, s.jitterFactor) 180 } else { 181 info.penaltyForFetchingRemoteTime = s.expiredTime 182 } 183 184 err := checkpoint.DeleteSPD(s.manager, info.spd) 185 if err != nil { 186 return err 187 } 188 } 189 190 return nil 191 } 192 193 // GetSPD gets target spd by namespace/name key 194 func (s *Cache) GetSPD(key string, updateLastGetTime bool) *workloadapis.ServiceProfileDescriptor { 195 s.Lock() 196 defer s.Unlock() 197 198 s.initSPDInfoWithoutLock(key) 199 200 if updateLastGetTime { 201 // update last get spd time 202 s.spdInfo[key].lastGetTime = time.Now() 203 } 204 205 info, ok := s.spdInfo[key] 206 if ok && info != nil { 207 return info.spd 208 } 209 210 return nil 211 } 212 213 // Run to clear local unused spd 214 func (s *Cache) Run(ctx context.Context) { 215 // sleep with cacheTTL to wait the last get time update 216 // for each spd 217 time.Sleep(s.cacheTTL) 218 wait.UntilWithContext(ctx, s.clearUnusedSPDs, s.expiredTime) 219 } 220 221 // restore all spd from disk at startup 222 func (s *Cache) restore() error { 223 s.Lock() 224 defer s.Unlock() 225 226 spdList, err := checkpoint.LoadSPDs(s.manager, s.skipCorruptionError) 227 if err != nil { 228 return fmt.Errorf("restore spd failed: %v", err) 229 } 230 231 for _, spd := range spdList { 232 if spd == nil { 233 continue 234 } 235 key := native.GenerateUniqObjectNameKey(spd) 236 s.initSPDInfoWithoutLock(key) 237 s.spdInfo[key].spd = spd 238 s.spdInfo[key].lastFetchRemoteTime = util.GetLastFetchTime(spd) 239 klog.Infof("restore spd cache %s: %+v", key, s.spdInfo[key]) 240 } 241 242 return nil 243 } 244 245 // clearUnusedSPDs is to clear unused spd according to its lastGetSPDTime 246 func (s *Cache) clearUnusedSPDs(_ context.Context) { 247 s.Lock() 248 defer s.Unlock() 249 250 now := time.Now() 251 for key, info := range s.spdInfo { 252 if info != nil && info.lastGetTime.Add(s.expiredTime).Before(now) { 253 err := checkpoint.DeleteSPD(s.manager, info.spd) 254 if err != nil { 255 klog.Errorf("clear unused spd %s failed: %v", key, err) 256 continue 257 } 258 delete(s.spdInfo, key) 259 klog.Infof("clear spd cache %s", key) 260 } 261 } 262 } 263 264 func (s *Cache) initSPDInfoWithoutLock(key string) { 265 info, ok := s.spdInfo[key] 266 if !ok || info == nil { 267 s.spdInfo[key] = &spdInfo{} 268 } 269 }