github.com/kubewharf/katalyst-core@v0.5.3/pkg/metaserver/spd/fetcher.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package spd 18 19 import ( 20 "context" 21 "fmt" 22 "sync" 23 "time" 24 25 "go.uber.org/atomic" 26 "k8s.io/apimachinery/pkg/api/errors" 27 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 "k8s.io/apimachinery/pkg/util/wait" 29 "k8s.io/client-go/tools/cache" 30 "k8s.io/klog/v2" 31 "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" 32 33 configapis "github.com/kubewharf/katalyst-api/pkg/apis/config/v1alpha1" 34 workloadapis "github.com/kubewharf/katalyst-api/pkg/apis/workload/v1alpha1" 35 "github.com/kubewharf/katalyst-core/pkg/client" 36 pkgconfig "github.com/kubewharf/katalyst-core/pkg/config" 37 "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/cnc" 38 "github.com/kubewharf/katalyst-core/pkg/metrics" 39 "github.com/kubewharf/katalyst-core/pkg/util" 40 "github.com/kubewharf/katalyst-core/pkg/util/general" 41 "github.com/kubewharf/katalyst-core/pkg/util/native" 42 ) 43 44 const ( 45 defaultClearUnusedSPDPeriod = 12 * time.Hour 46 defaultMaxRetryCount = 3 47 defaultJitterFactor = 1 48 ) 49 50 const ( 51 metricsNameGetCNCTargetConfigFailed = "spd_manager_get_cnc_target_failed" 52 metricsNameUpdateCacheFailed = "spd_manager_update_cache_failed" 53 metricsNameCacheNotFound = "spd_manager_cache_not_found" 54 metricsNameUpdateCacheSuccess = "spd_manager_update_cache_success" 55 metricsNameDeleteCache = "spd_manager_delete_cache" 56 ) 57 58 type GetPodSPDNameFunc func(_ metav1.ObjectMeta) (string, error) 59 60 type SPDFetcher interface { 61 // GetSPD get spd for given pod 62 GetSPD(ctx context.Context, podMeta metav1.ObjectMeta) (*workloadapis.ServiceProfileDescriptor, error) 63 64 // Run async loop to clear unused spd 65 Run(ctx context.Context) 66 } 67 68 type DummySPDFetcher struct { 69 SPD *workloadapis.ServiceProfileDescriptor 70 } 71 72 func (d DummySPDFetcher) GetSPD(_ context.Context, _ metav1.ObjectMeta) (*workloadapis.ServiceProfileDescriptor, error) { 73 return d.SPD, nil 74 } 75 76 func (d DummySPDFetcher) Run(_ context.Context) { 77 return 78 } 79 80 type spdFetcher struct { 81 started *atomic.Bool 82 mux sync.Mutex 83 84 client *client.GenericClientSet 85 emitter metrics.MetricEmitter 86 cncFetcher cnc.CNCFetcher 87 checkpointManager checkpointmanager.CheckpointManager 88 getPodSPDNameFunc GetPodSPDNameFunc 89 90 // spdCache is a cache of namespace/name to current target spd 91 spdCache *Cache 92 } 93 94 // NewSPDFetcher creates a spd manager to implement SPDFetcher 95 func NewSPDFetcher(clientSet *client.GenericClientSet, emitter metrics.MetricEmitter, 96 cncFetcher cnc.CNCFetcher, conf *pkgconfig.Configuration, 97 ) (SPDFetcher, error) { 98 checkpointManager, err := checkpointmanager.NewCheckpointManager(conf.CheckpointManagerDir) 99 if err != nil { 100 return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err) 101 } 102 103 m := &spdFetcher{ 104 started: atomic.NewBool(false), 105 client: clientSet, 106 emitter: emitter, 107 checkpointManager: checkpointManager, 108 cncFetcher: cncFetcher, 109 } 110 111 m.getPodSPDNameFunc = util.GetPodSPDName 112 m.spdCache, err = NewSPDCache(checkpointManager, conf.ServiceProfileSkipCorruptionError, conf.ServiceProfileCacheTTL, 113 defaultClearUnusedSPDPeriod, defaultMaxRetryCount, defaultJitterFactor) 114 if err != nil { 115 return nil, err 116 } 117 118 return m, nil 119 } 120 121 func (s *spdFetcher) GetSPD(ctx context.Context, podMeta metav1.ObjectMeta) (*workloadapis.ServiceProfileDescriptor, error) { 122 spdName, err := s.getPodSPDNameFunc(podMeta) 123 if err != nil { 124 general.Warningf("get spd for pod (%v/%v) err %v", podMeta.Namespace, podMeta.Name, err) 125 return nil, errors.NewNotFound(workloadapis.Resource(workloadapis.ResourceNameServiceProfileDescriptors), fmt.Sprintf("for pod(%v/%v)", podMeta.Namespace, podMeta.Name)) 126 } 127 128 return s.getSPDByNamespaceName(ctx, podMeta.GetNamespace(), spdName) 129 } 130 131 // SetGetPodSPDNameFunc set get spd name function to override default getPodSPDNameFunc before started 132 func (s *spdFetcher) SetGetPodSPDNameFunc(f GetPodSPDNameFunc) { 133 if s.started.Load() { 134 klog.Warningf("spd manager has already started, not allowed to set implementations") 135 return 136 } 137 138 s.getPodSPDNameFunc = f 139 } 140 141 func (s *spdFetcher) Run(ctx context.Context) { 142 if s.started.Swap(true) { 143 return 144 } 145 146 go s.spdCache.Run(ctx) 147 go wait.UntilWithContext(ctx, s.sync, 30*time.Second) 148 <-ctx.Done() 149 } 150 151 func (s *spdFetcher) getSPDByNamespaceName(_ context.Context, namespace, name string) (*workloadapis.ServiceProfileDescriptor, error) { 152 key := native.GenerateNamespaceNameKey(namespace, name) 153 baseTag := []metrics.MetricTag{ 154 {Key: "spdNamespace", Val: namespace}, 155 {Key: "spdName", Val: name}, 156 } 157 158 // get current spd from cache 159 currentSPD := s.spdCache.GetSPD(key, true) 160 if currentSPD != nil { 161 return currentSPD, nil 162 } 163 164 _ = s.emitter.StoreInt64(metricsNameCacheNotFound, 1, metrics.MetricTypeNameCount, baseTag...) 165 166 return nil, errors.NewNotFound(workloadapis.Resource(workloadapis.ResourceNameServiceProfileDescriptors), name) 167 } 168 169 // getSPDTargetConfig get spd target config from cnc 170 func (s *spdFetcher) getSPDTargetConfig(ctx context.Context, namespace, name string) (*configapis.TargetConfig, error) { 171 currentCNC, err := s.cncFetcher.GetCNC(ctx) 172 if err != nil { 173 return &configapis.TargetConfig{}, err 174 } 175 176 for _, target := range currentCNC.Status.ServiceProfileConfigList { 177 if target.ConfigNamespace == namespace && target.ConfigName == name { 178 return &target, nil 179 } 180 } 181 182 return nil, fmt.Errorf("get target spd %s/%s not found", namespace, name) 183 } 184 185 func (s *spdFetcher) sync(ctx context.Context) { 186 spdKeys := s.spdCache.ListAllSPDKeys() 187 for _, key := range spdKeys { 188 namespace, name, err := cache.SplitMetaNamespaceKey(key) 189 if err != nil { 190 continue 191 } 192 193 baseTag := []metrics.MetricTag{ 194 {Key: "spdNamespace", Val: namespace}, 195 {Key: "spdName", Val: name}, 196 } 197 198 // first get spd origin spd from local cache 199 originSPD := s.spdCache.GetSPD(key, false) 200 201 // get spd current target config from cnc to limit rate of get remote spd by comparing local spd 202 // hash with cnc target config hash, if cnc target config not found it will get remote spd directly 203 targetConfig, err := s.getSPDTargetConfig(ctx, namespace, name) 204 if err != nil { 205 klog.Warningf("[spd-manager] get spd targetConfig config failed: %v, use local cache instead", err) 206 targetConfig = &configapis.TargetConfig{ 207 ConfigNamespace: namespace, 208 ConfigName: name, 209 } 210 _ = s.emitter.StoreInt64(metricsNameGetCNCTargetConfigFailed, 1, metrics.MetricTypeNameCount, baseTag...) 211 } 212 213 // try to update spd cache from remote if cache spd hash is not equal to target config hash, 214 // the rate of getting remote spd will be limited by spd ServiceProfileCacheTTL 215 err = s.updateSPDCacheIfNeed(ctx, originSPD, targetConfig) 216 if err != nil { 217 klog.Errorf("[spd-manager] failed update spd cache from remote: %v, use local cache instead", err) 218 _ = s.emitter.StoreInt64(metricsNameUpdateCacheFailed, 1, metrics.MetricTypeNameCount, baseTag...) 219 } 220 } 221 } 222 223 // updateSPDCacheIfNeed checks if the previous spd has changed, and 224 // re-get from APIServer if the previous is out-of date. 225 func (s *spdFetcher) updateSPDCacheIfNeed(ctx context.Context, originSPD *workloadapis.ServiceProfileDescriptor, 226 targetConfig *configapis.TargetConfig, 227 ) error { 228 if originSPD == nil && targetConfig == nil { 229 return nil 230 } 231 232 now := time.Now() 233 if originSPD == nil || util.GetSPDHash(originSPD) != targetConfig.Hash { 234 key := native.GenerateNamespaceNameKey(targetConfig.ConfigNamespace, targetConfig.ConfigName) 235 // Skip the backoff delay if the configuration hash of the CNC target changes, ensuring the 236 // local SPD cache is always updated with the latest configuration. 237 if nextFetchRemoteTime := s.spdCache.GetNextFetchRemoteTime(key, now, targetConfig.Hash != ""); nextFetchRemoteTime.After(time.Now()) { 238 return nil 239 } else { 240 // first update the timestamp of the last attempt to fetch the remote spd to 241 // avoid frequent requests to the api-server in some bad situations 242 s.spdCache.SetLastFetchRemoteTime(key, now) 243 } 244 245 baseTag := []metrics.MetricTag{ 246 {Key: "spdNamespace", Val: targetConfig.ConfigNamespace}, 247 {Key: "spdName", Val: targetConfig.ConfigName}, 248 } 249 250 klog.Infof("[spd-manager] spd %s targetConfig hash is changed from %s to %s", key, util.GetSPDHash(originSPD), targetConfig.Hash) 251 spd, err := s.client.InternalClient.WorkloadV1alpha1().ServiceProfileDescriptors(targetConfig.ConfigNamespace). 252 Get(ctx, targetConfig.ConfigName, metav1.GetOptions{ResourceVersion: "0"}) 253 if err != nil && !errors.IsNotFound(err) { 254 return fmt.Errorf("get spd %s from remote failed: %v", key, err) 255 } else if err != nil { 256 _ = s.emitter.StoreInt64(metricsNameDeleteCache, 1, metrics.MetricTypeNameCount, baseTag...) 257 err = s.spdCache.DeleteSPD(key) 258 if err != nil { 259 return fmt.Errorf("delete spd %s from cache failed: %v", key, err) 260 } 261 262 klog.Infof("[spd-manager] spd %s cache has been deleted", key) 263 return nil 264 } 265 266 _ = s.emitter.StoreInt64(metricsNameUpdateCacheSuccess, 1, metrics.MetricTypeNameCount, baseTag...) 267 268 err = s.spdCache.SetSPD(key, spd) 269 if err != nil { 270 return err 271 } 272 klog.Infof("[spd-manager] spd %s cache has been updated to %v", key, spd) 273 } 274 275 return nil 276 }