github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/spd/cnc.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package spd 18 19 import ( 20 "context" 21 "fmt" 22 "sort" 23 "time" 24 25 v1 "k8s.io/api/core/v1" 26 apiequality "k8s.io/apimachinery/pkg/api/equality" 27 "k8s.io/apimachinery/pkg/api/errors" 28 "k8s.io/apimachinery/pkg/labels" 29 "k8s.io/apimachinery/pkg/runtime/schema" 30 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 31 "k8s.io/apimachinery/pkg/util/wait" 32 coreinformers "k8s.io/client-go/informers/core/v1" 33 corelisters "k8s.io/client-go/listers/core/v1" 34 "k8s.io/client-go/tools/cache" 35 "k8s.io/client-go/util/workqueue" 36 "k8s.io/klog/v2" 37 38 configapis "github.com/kubewharf/katalyst-api/pkg/apis/config/v1alpha1" 39 apiworkload "github.com/kubewharf/katalyst-api/pkg/apis/workload/v1alpha1" 40 configinformers "github.com/kubewharf/katalyst-api/pkg/client/informers/externalversions/config/v1alpha1" 41 "github.com/kubewharf/katalyst-api/pkg/client/informers/externalversions/workload/v1alpha1" 42 configlisters "github.com/kubewharf/katalyst-api/pkg/client/listers/config/v1alpha1" 43 apiListers "github.com/kubewharf/katalyst-api/pkg/client/listers/workload/v1alpha1" 44 "github.com/kubewharf/katalyst-core/pkg/client/control" 45 "github.com/kubewharf/katalyst-core/pkg/config/controller" 46 "github.com/kubewharf/katalyst-core/pkg/metrics" 47 "github.com/kubewharf/katalyst-core/pkg/util" 48 "github.com/kubewharf/katalyst-core/pkg/util/general" 49 "github.com/kubewharf/katalyst-core/pkg/util/native" 50 ) 51 52 const ( 53 metricsNameSyncCNCCacheCost = "sync_cnc_cache_cost" 54 metricsNameClearUnusedCNCCacheCost = "clear_unused_cnc_cache_cost" 55 56 cncWorkerCount = 1 57 ) 58 59 type cncCacheController struct { 60 ctx context.Context 61 conf *controller.SPDConfig 62 63 cncControl control.CNCControl 64 65 spdIndexer cache.Indexer 66 podIndexer cache.Indexer 67 68 podLister corelisters.PodLister 69 spdLister apiListers.ServiceProfileDescriptorLister 70 cncLister configlisters.CustomNodeConfigLister 71 workloadGVKLister map[schema.GroupVersionKind]cache.GenericLister 72 workloadLister map[schema.GroupVersionResource]cache.GenericLister 73 74 cncSyncQueue workqueue.RateLimitingInterface 75 76 metricsEmitter metrics.MetricEmitter 77 } 78 79 func newCNCCacheController(ctx context.Context, 80 podInformer coreinformers.PodInformer, 81 cncInformer configinformers.CustomNodeConfigInformer, 82 spdInformer v1alpha1.ServiceProfileDescriptorInformer, 83 workloadGVKLister map[schema.GroupVersionKind]cache.GenericLister, 84 workloadLister map[schema.GroupVersionResource]cache.GenericLister, 85 cncControl control.CNCControl, 86 metricsEmitter metrics.MetricEmitter, 87 conf *controller.SPDConfig, 88 ) (*cncCacheController, error) { 89 c := &cncCacheController{ 90 ctx: ctx, 91 conf: conf, 92 cncControl: cncControl, 93 spdIndexer: spdInformer.Informer().GetIndexer(), 94 podIndexer: podInformer.Informer().GetIndexer(), 95 podLister: podInformer.Lister(), 96 cncLister: cncInformer.Lister(), 97 spdLister: spdInformer.Lister(), 98 workloadGVKLister: workloadGVKLister, 99 workloadLister: workloadLister, 100 cncSyncQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "spd-cnc"), 101 metricsEmitter: metricsEmitter, 102 } 103 104 // if cnc cache is disabled all the event handler is not need, 105 // and it will clear all cnc spd config 106 if !c.conf.EnableCNCCache { 107 return c, nil 108 } 109 general.Infof("cnc cache is enable") 110 111 // build index: node ---> pod 112 err := native.AddNodeNameIndexerForPod(podInformer) 113 if err != nil { 114 return nil, fmt.Errorf("failed to add node name index for pod: %v", err) 115 } 116 117 podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 118 AddFunc: c.addPod, 119 UpdateFunc: c.updatePod, 120 }) 121 122 cncInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 123 AddFunc: c.addCNC, 124 UpdateFunc: c.updateCNC, 125 }) 126 127 spdInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 128 AddFunc: c.addSPD, 129 UpdateFunc: c.updateSPD, 130 }) 131 132 return c, nil 133 } 134 135 func (c *cncCacheController) Run() { 136 defer c.cncSyncQueue.ShutDown() 137 138 if c.conf.EnableCNCCache { 139 for i := 0; i < cncWorkerCount; i++ { 140 go wait.Until(c.cncWorker, time.Second, c.ctx.Done()) 141 } 142 } 143 144 go wait.Until(c.clearUnusedConfig, time.Hour*1, c.ctx.Done()) 145 146 <-c.ctx.Done() 147 } 148 149 func (c *cncCacheController) cncWorker() { 150 for c.processNextCNC() { 151 } 152 } 153 154 func (c *cncCacheController) processNextCNC() bool { 155 key, quit := c.cncSyncQueue.Get() 156 if quit { 157 return false 158 } 159 defer c.cncSyncQueue.Done(key) 160 161 err := c.syncCNC(key.(string)) 162 if err == nil { 163 c.cncSyncQueue.Forget(key) 164 return true 165 } 166 167 utilruntime.HandleError(fmt.Errorf("sync %q failed with %v", key, err)) 168 c.cncSyncQueue.AddRateLimited(key) 169 170 return true 171 } 172 173 func (c *cncCacheController) syncCNC(key string) error { 174 klog.V(5).Infof("[spd] syncing cnc [%v]", key) 175 begin := time.Now() 176 defer func() { 177 costs := time.Since(begin) 178 klog.V(5).Infof("[spd] finished sync cnc %q (%v)", key, costs) 179 _ = c.metricsEmitter.StoreInt64(metricsNameSyncCNCCacheCost, costs.Microseconds(), 180 metrics.MetricTypeNameRaw, metrics.MetricTag{Key: "name", Val: key}) 181 }() 182 183 cnc, err := c.cncLister.Get(key) 184 if err != nil { 185 general.Errorf("failed to get cnc [%v]", key) 186 if errors.IsNotFound(err) { 187 return nil 188 } 189 return err 190 } 191 192 spdMap, err := c.getSPDMapForCNC(cnc) 193 if err != nil { 194 return err 195 } 196 197 setCNC := func(cnc *configapis.CustomNodeConfig) { 198 for _, spd := range spdMap { 199 applySPDTargetConfigToCNC(cnc, spd) 200 } 201 202 sort.SliceStable(cnc.Status.ServiceProfileConfigList, func(i, j int) bool { 203 if cnc.Status.ServiceProfileConfigList[i].ConfigNamespace == cnc.Status.ServiceProfileConfigList[j].ConfigNamespace { 204 return cnc.Status.ServiceProfileConfigList[i].ConfigName < cnc.Status.ServiceProfileConfigList[j].ConfigName 205 } 206 return cnc.Status.ServiceProfileConfigList[i].ConfigNamespace < cnc.Status.ServiceProfileConfigList[j].ConfigNamespace 207 }) 208 } 209 210 _, err = c.patchCNC(cnc, setCNC) 211 if err != nil { 212 return err 213 } 214 215 return nil 216 } 217 218 func (c *cncCacheController) clearUnusedConfig() { 219 begin := time.Now() 220 defer func() { 221 costs := time.Since(begin) 222 general.Infof("finished (%v)", costs) 223 _ = c.metricsEmitter.StoreInt64(metricsNameClearUnusedCNCCacheCost, costs.Microseconds(), 224 metrics.MetricTypeNameRaw) 225 }() 226 227 cncList, err := c.cncLister.List(labels.Everything()) 228 if err != nil { 229 general.Errorf("clear unused config list all custom node config failed") 230 return 231 } 232 233 // func for clear cnc config if spd config not exists or cnc cache is disabled 234 setFunc := func(cnc *configapis.CustomNodeConfig) { 235 spdMap := make(map[string]*apiworkload.ServiceProfileDescriptor) 236 // if disable cnc cache, it will clear all cnc spd configs 237 if c.conf.EnableCNCCache { 238 spdMap, err = c.getSPDMapForCNC(cnc) 239 if err != nil { 240 general.Errorf("get spd map for cnc %s failed, %v", cnc.Name, err) 241 return 242 } 243 } 244 245 cnc.Status.ServiceProfileConfigList = util.RemoveUnusedTargetConfig(cnc.Status.ServiceProfileConfigList, 246 func(config configapis.TargetConfig) bool { 247 spdKey := native.GenerateNamespaceNameKey(config.ConfigNamespace, config.ConfigName) 248 if _, ok := spdMap[spdKey]; !ok { 249 return true 250 } 251 return false 252 }) 253 } 254 255 clearCNCConfigs := func(i int) { 256 cnc := cncList[i] 257 _, err = c.patchCNC(cnc, setFunc) 258 if err != nil { 259 general.Errorf("patch cnc %s failed", cnc.GetName()) 260 return 261 } 262 } 263 264 // parallelize to clear cnc configs 265 workqueue.ParallelizeUntil(c.ctx, 16, len(cncList), clearCNCConfigs) 266 } 267 268 func (c *cncCacheController) addPod(obj interface{}) { 269 pod, ok := obj.(*v1.Pod) 270 if !ok { 271 general.Errorf("cannot convert obj to *core.Pod") 272 return 273 } 274 275 c.enqueueCNCForPod(pod) 276 } 277 278 func (c *cncCacheController) updatePod(oldObj interface{}, newObj interface{}) { 279 oldPod, ok := oldObj.(*v1.Pod) 280 if !ok { 281 general.Errorf("cannot convert obj to *core.Pod") 282 return 283 } 284 285 newPod, ok := newObj.(*v1.Pod) 286 if !ok { 287 general.Errorf("cannot convert obj to *core.Pod") 288 return 289 } 290 291 if oldPod.Spec.NodeName == "" && newPod.Spec.NodeName != "" { 292 c.enqueueCNCForPod(newPod) 293 } 294 } 295 296 func (c *cncCacheController) addSPD(obj interface{}) { 297 spd, ok := obj.(*apiworkload.ServiceProfileDescriptor) 298 if !ok { 299 general.Errorf("cannot convert obj to *apiworkload.ServiceProfileDescriptor") 300 return 301 } 302 c.enqueueCNCForSPD(spd) 303 } 304 305 func (c *cncCacheController) updateSPD(oldObj, newObj interface{}) { 306 oldSPD, ok := oldObj.(*apiworkload.ServiceProfileDescriptor) 307 if !ok { 308 general.Errorf("cannot convert obj to *apiworkload.ServiceProfileDescriptor") 309 return 310 } 311 312 newSPD, ok := newObj.(*apiworkload.ServiceProfileDescriptor) 313 if !ok { 314 general.Errorf("cannot convert obj to *apiworkload.ServiceProfileDescriptor") 315 return 316 } 317 318 if util.GetSPDHash(oldSPD) != util.GetSPDHash(newSPD) { 319 c.enqueueCNCForSPD(newSPD) 320 } 321 } 322 323 func (c *cncCacheController) addCNC(obj interface{}) { 324 cnc, ok := obj.(*configapis.CustomNodeConfig) 325 if !ok { 326 general.Errorf("cannot convert obj to *configapis.CustomNodeConfig") 327 return 328 } 329 330 c.enqueueCNC(cnc) 331 } 332 333 func (c *cncCacheController) updateCNC(oldObj interface{}, newObj interface{}) { 334 oldCNC, ok := oldObj.(*configapis.CustomNodeConfig) 335 if !ok { 336 general.Errorf("cannot convert obj to *configapis.CustomNodeConfig") 337 return 338 } 339 340 newCNC, ok := newObj.(*configapis.CustomNodeConfig) 341 if !ok { 342 general.Errorf("cannot convert obj to *configapis.CustomNodeConfig") 343 return 344 } 345 346 if !apiequality.Semantic.DeepEqual(oldCNC.Status.ServiceProfileConfigList, 347 newCNC.Status.ServiceProfileConfigList) { 348 c.enqueueCNC(newCNC) 349 } 350 } 351 352 func (c *cncCacheController) enqueueCNCForSPD(spd *apiworkload.ServiceProfileDescriptor) { 353 if util.GetSPDHash(spd) == "" { 354 return 355 } 356 357 podList, err := util.GetPodListForSPD(spd, c.podIndexer, c.conf.SPDPodLabelIndexerKeys, 358 c.workloadLister, c.podLister) 359 if err != nil { 360 return 361 } 362 363 for _, pod := range podList { 364 if pod == nil { 365 continue 366 } 367 368 c.enqueueCNCForPod(pod) 369 } 370 } 371 372 func (c *cncCacheController) enqueueCNCForPod(pod *v1.Pod) { 373 if pod.Spec.NodeName == "" { 374 return 375 } 376 377 cnc, err := c.cncLister.Get(pod.Spec.NodeName) 378 if err != nil { 379 return 380 } 381 382 c.enqueueCNC(cnc) 383 } 384 385 func (c *cncCacheController) enqueueCNC(cnc *configapis.CustomNodeConfig) { 386 if cnc == nil { 387 general.Warningf("trying to enqueue a nil cnc") 388 return 389 } 390 391 c.cncSyncQueue.Add(cnc.Name) 392 } 393 394 func (c *cncCacheController) getSPDMapForCNC(cnc *configapis.CustomNodeConfig) (map[string]*apiworkload.ServiceProfileDescriptor, error) { 395 podList, err := native.GetPodsAssignedToNode(cnc.Name, c.podIndexer) 396 if err != nil { 397 return nil, err 398 } 399 400 spdMap := make(map[string]*apiworkload.ServiceProfileDescriptor) 401 for _, pod := range podList { 402 if native.PodIsTerminated(pod) { 403 continue 404 } 405 406 spd, err := util.GetSPDForPod(pod, c.spdIndexer, c.workloadGVKLister, c.spdLister, false) 407 if err != nil && !errors.IsNotFound(err) { 408 return nil, err 409 } 410 411 if spd == nil { 412 continue 413 } 414 415 spdKey := native.GenerateUniqObjectNameKey(spd) 416 spdMap[spdKey] = spd 417 } 418 419 return spdMap, nil 420 } 421 422 func (c *cncCacheController) patchCNC(cnc *configapis.CustomNodeConfig, setFunc func(*configapis.CustomNodeConfig)) (*configapis.CustomNodeConfig, error) { 423 cncCopy := cnc.DeepCopy() 424 setFunc(cncCopy) 425 if apiequality.Semantic.DeepEqual(cnc, cncCopy) { 426 return cnc, nil 427 } 428 429 general.Infof("cnc %s config changed need to patch", cnc.GetName()) 430 return c.cncControl.PatchCNCStatus(c.ctx, cnc.Name, cnc, cncCopy) 431 } 432 433 func applySPDTargetConfigToCNC(cnc *configapis.CustomNodeConfig, 434 spd *apiworkload.ServiceProfileDescriptor, 435 ) { 436 if cnc == nil || spd == nil { 437 return 438 } 439 440 idx := 0 441 serviceProfileConfigList := cnc.Status.ServiceProfileConfigList 442 // find target config 443 for ; idx < len(serviceProfileConfigList); idx++ { 444 if serviceProfileConfigList[idx].ConfigNamespace == spd.Namespace && 445 serviceProfileConfigList[idx].ConfigName == spd.Name { 446 break 447 } 448 } 449 450 targetConfig := configapis.TargetConfig{ 451 ConfigNamespace: spd.Namespace, 452 ConfigName: spd.Name, 453 Hash: util.GetSPDHash(spd), 454 } 455 456 // update target config if the spd config is already existed 457 if idx < len(serviceProfileConfigList) { 458 serviceProfileConfigList[idx] = targetConfig 459 } else { 460 serviceProfileConfigList = append(serviceProfileConfigList, targetConfig) 461 cnc.Status.ServiceProfileConfigList = serviceProfileConfigList 462 } 463 }