github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/spd/cnc.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package spd
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sort"
    23  	"time"
    24  
    25  	v1 "k8s.io/api/core/v1"
    26  	apiequality "k8s.io/apimachinery/pkg/api/equality"
    27  	"k8s.io/apimachinery/pkg/api/errors"
    28  	"k8s.io/apimachinery/pkg/labels"
    29  	"k8s.io/apimachinery/pkg/runtime/schema"
    30  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    31  	"k8s.io/apimachinery/pkg/util/wait"
    32  	coreinformers "k8s.io/client-go/informers/core/v1"
    33  	corelisters "k8s.io/client-go/listers/core/v1"
    34  	"k8s.io/client-go/tools/cache"
    35  	"k8s.io/client-go/util/workqueue"
    36  	"k8s.io/klog/v2"
    37  
    38  	configapis "github.com/kubewharf/katalyst-api/pkg/apis/config/v1alpha1"
    39  	apiworkload "github.com/kubewharf/katalyst-api/pkg/apis/workload/v1alpha1"
    40  	configinformers "github.com/kubewharf/katalyst-api/pkg/client/informers/externalversions/config/v1alpha1"
    41  	"github.com/kubewharf/katalyst-api/pkg/client/informers/externalversions/workload/v1alpha1"
    42  	configlisters "github.com/kubewharf/katalyst-api/pkg/client/listers/config/v1alpha1"
    43  	apiListers "github.com/kubewharf/katalyst-api/pkg/client/listers/workload/v1alpha1"
    44  	"github.com/kubewharf/katalyst-core/pkg/client/control"
    45  	"github.com/kubewharf/katalyst-core/pkg/config/controller"
    46  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    47  	"github.com/kubewharf/katalyst-core/pkg/util"
    48  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    49  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    50  )
    51  
    52  const (
    53  	metricsNameSyncCNCCacheCost        = "sync_cnc_cache_cost"
    54  	metricsNameClearUnusedCNCCacheCost = "clear_unused_cnc_cache_cost"
    55  
    56  	cncWorkerCount = 1
    57  )
    58  
    59  type cncCacheController struct {
    60  	ctx  context.Context
    61  	conf *controller.SPDConfig
    62  
    63  	cncControl control.CNCControl
    64  
    65  	spdIndexer cache.Indexer
    66  	podIndexer cache.Indexer
    67  
    68  	podLister         corelisters.PodLister
    69  	spdLister         apiListers.ServiceProfileDescriptorLister
    70  	cncLister         configlisters.CustomNodeConfigLister
    71  	workloadGVKLister map[schema.GroupVersionKind]cache.GenericLister
    72  	workloadLister    map[schema.GroupVersionResource]cache.GenericLister
    73  
    74  	cncSyncQueue workqueue.RateLimitingInterface
    75  
    76  	metricsEmitter metrics.MetricEmitter
    77  }
    78  
    79  func newCNCCacheController(ctx context.Context,
    80  	podInformer coreinformers.PodInformer,
    81  	cncInformer configinformers.CustomNodeConfigInformer,
    82  	spdInformer v1alpha1.ServiceProfileDescriptorInformer,
    83  	workloadGVKLister map[schema.GroupVersionKind]cache.GenericLister,
    84  	workloadLister map[schema.GroupVersionResource]cache.GenericLister,
    85  	cncControl control.CNCControl,
    86  	metricsEmitter metrics.MetricEmitter,
    87  	conf *controller.SPDConfig,
    88  ) (*cncCacheController, error) {
    89  	c := &cncCacheController{
    90  		ctx:               ctx,
    91  		conf:              conf,
    92  		cncControl:        cncControl,
    93  		spdIndexer:        spdInformer.Informer().GetIndexer(),
    94  		podIndexer:        podInformer.Informer().GetIndexer(),
    95  		podLister:         podInformer.Lister(),
    96  		cncLister:         cncInformer.Lister(),
    97  		spdLister:         spdInformer.Lister(),
    98  		workloadGVKLister: workloadGVKLister,
    99  		workloadLister:    workloadLister,
   100  		cncSyncQueue:      workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "spd-cnc"),
   101  		metricsEmitter:    metricsEmitter,
   102  	}
   103  
   104  	// if cnc cache is disabled all the event handler is not need,
   105  	// and it will clear all cnc spd config
   106  	if !c.conf.EnableCNCCache {
   107  		return c, nil
   108  	}
   109  	general.Infof("cnc cache is enable")
   110  
   111  	// build index: node ---> pod
   112  	err := native.AddNodeNameIndexerForPod(podInformer)
   113  	if err != nil {
   114  		return nil, fmt.Errorf("failed to add node name index for pod: %v", err)
   115  	}
   116  
   117  	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   118  		AddFunc:    c.addPod,
   119  		UpdateFunc: c.updatePod,
   120  	})
   121  
   122  	cncInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   123  		AddFunc:    c.addCNC,
   124  		UpdateFunc: c.updateCNC,
   125  	})
   126  
   127  	spdInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   128  		AddFunc:    c.addSPD,
   129  		UpdateFunc: c.updateSPD,
   130  	})
   131  
   132  	return c, nil
   133  }
   134  
   135  func (c *cncCacheController) Run() {
   136  	defer c.cncSyncQueue.ShutDown()
   137  
   138  	if c.conf.EnableCNCCache {
   139  		for i := 0; i < cncWorkerCount; i++ {
   140  			go wait.Until(c.cncWorker, time.Second, c.ctx.Done())
   141  		}
   142  	}
   143  
   144  	go wait.Until(c.clearUnusedConfig, time.Hour*1, c.ctx.Done())
   145  
   146  	<-c.ctx.Done()
   147  }
   148  
   149  func (c *cncCacheController) cncWorker() {
   150  	for c.processNextCNC() {
   151  	}
   152  }
   153  
   154  func (c *cncCacheController) processNextCNC() bool {
   155  	key, quit := c.cncSyncQueue.Get()
   156  	if quit {
   157  		return false
   158  	}
   159  	defer c.cncSyncQueue.Done(key)
   160  
   161  	err := c.syncCNC(key.(string))
   162  	if err == nil {
   163  		c.cncSyncQueue.Forget(key)
   164  		return true
   165  	}
   166  
   167  	utilruntime.HandleError(fmt.Errorf("sync %q failed with %v", key, err))
   168  	c.cncSyncQueue.AddRateLimited(key)
   169  
   170  	return true
   171  }
   172  
   173  func (c *cncCacheController) syncCNC(key string) error {
   174  	klog.V(5).Infof("[spd] syncing cnc [%v]", key)
   175  	begin := time.Now()
   176  	defer func() {
   177  		costs := time.Since(begin)
   178  		klog.V(5).Infof("[spd] finished sync cnc %q (%v)", key, costs)
   179  		_ = c.metricsEmitter.StoreInt64(metricsNameSyncCNCCacheCost, costs.Microseconds(),
   180  			metrics.MetricTypeNameRaw, metrics.MetricTag{Key: "name", Val: key})
   181  	}()
   182  
   183  	cnc, err := c.cncLister.Get(key)
   184  	if err != nil {
   185  		general.Errorf("failed to get cnc [%v]", key)
   186  		if errors.IsNotFound(err) {
   187  			return nil
   188  		}
   189  		return err
   190  	}
   191  
   192  	spdMap, err := c.getSPDMapForCNC(cnc)
   193  	if err != nil {
   194  		return err
   195  	}
   196  
   197  	setCNC := func(cnc *configapis.CustomNodeConfig) {
   198  		for _, spd := range spdMap {
   199  			applySPDTargetConfigToCNC(cnc, spd)
   200  		}
   201  
   202  		sort.SliceStable(cnc.Status.ServiceProfileConfigList, func(i, j int) bool {
   203  			if cnc.Status.ServiceProfileConfigList[i].ConfigNamespace == cnc.Status.ServiceProfileConfigList[j].ConfigNamespace {
   204  				return cnc.Status.ServiceProfileConfigList[i].ConfigName < cnc.Status.ServiceProfileConfigList[j].ConfigName
   205  			}
   206  			return cnc.Status.ServiceProfileConfigList[i].ConfigNamespace < cnc.Status.ServiceProfileConfigList[j].ConfigNamespace
   207  		})
   208  	}
   209  
   210  	_, err = c.patchCNC(cnc, setCNC)
   211  	if err != nil {
   212  		return err
   213  	}
   214  
   215  	return nil
   216  }
   217  
   218  func (c *cncCacheController) clearUnusedConfig() {
   219  	begin := time.Now()
   220  	defer func() {
   221  		costs := time.Since(begin)
   222  		general.Infof("finished (%v)", costs)
   223  		_ = c.metricsEmitter.StoreInt64(metricsNameClearUnusedCNCCacheCost, costs.Microseconds(),
   224  			metrics.MetricTypeNameRaw)
   225  	}()
   226  
   227  	cncList, err := c.cncLister.List(labels.Everything())
   228  	if err != nil {
   229  		general.Errorf("clear unused config list all custom node config failed")
   230  		return
   231  	}
   232  
   233  	// func for clear cnc config if spd config not exists or cnc cache is disabled
   234  	setFunc := func(cnc *configapis.CustomNodeConfig) {
   235  		spdMap := make(map[string]*apiworkload.ServiceProfileDescriptor)
   236  		// if disable cnc cache, it will clear all cnc spd configs
   237  		if c.conf.EnableCNCCache {
   238  			spdMap, err = c.getSPDMapForCNC(cnc)
   239  			if err != nil {
   240  				general.Errorf("get spd map for cnc %s failed, %v", cnc.Name, err)
   241  				return
   242  			}
   243  		}
   244  
   245  		cnc.Status.ServiceProfileConfigList = util.RemoveUnusedTargetConfig(cnc.Status.ServiceProfileConfigList,
   246  			func(config configapis.TargetConfig) bool {
   247  				spdKey := native.GenerateNamespaceNameKey(config.ConfigNamespace, config.ConfigName)
   248  				if _, ok := spdMap[spdKey]; !ok {
   249  					return true
   250  				}
   251  				return false
   252  			})
   253  	}
   254  
   255  	clearCNCConfigs := func(i int) {
   256  		cnc := cncList[i]
   257  		_, err = c.patchCNC(cnc, setFunc)
   258  		if err != nil {
   259  			general.Errorf("patch cnc %s failed", cnc.GetName())
   260  			return
   261  		}
   262  	}
   263  
   264  	// parallelize to clear cnc configs
   265  	workqueue.ParallelizeUntil(c.ctx, 16, len(cncList), clearCNCConfigs)
   266  }
   267  
   268  func (c *cncCacheController) addPod(obj interface{}) {
   269  	pod, ok := obj.(*v1.Pod)
   270  	if !ok {
   271  		general.Errorf("cannot convert obj to *core.Pod")
   272  		return
   273  	}
   274  
   275  	c.enqueueCNCForPod(pod)
   276  }
   277  
   278  func (c *cncCacheController) updatePod(oldObj interface{}, newObj interface{}) {
   279  	oldPod, ok := oldObj.(*v1.Pod)
   280  	if !ok {
   281  		general.Errorf("cannot convert obj to *core.Pod")
   282  		return
   283  	}
   284  
   285  	newPod, ok := newObj.(*v1.Pod)
   286  	if !ok {
   287  		general.Errorf("cannot convert obj to *core.Pod")
   288  		return
   289  	}
   290  
   291  	if oldPod.Spec.NodeName == "" && newPod.Spec.NodeName != "" {
   292  		c.enqueueCNCForPod(newPod)
   293  	}
   294  }
   295  
   296  func (c *cncCacheController) addSPD(obj interface{}) {
   297  	spd, ok := obj.(*apiworkload.ServiceProfileDescriptor)
   298  	if !ok {
   299  		general.Errorf("cannot convert obj to *apiworkload.ServiceProfileDescriptor")
   300  		return
   301  	}
   302  	c.enqueueCNCForSPD(spd)
   303  }
   304  
   305  func (c *cncCacheController) updateSPD(oldObj, newObj interface{}) {
   306  	oldSPD, ok := oldObj.(*apiworkload.ServiceProfileDescriptor)
   307  	if !ok {
   308  		general.Errorf("cannot convert obj to *apiworkload.ServiceProfileDescriptor")
   309  		return
   310  	}
   311  
   312  	newSPD, ok := newObj.(*apiworkload.ServiceProfileDescriptor)
   313  	if !ok {
   314  		general.Errorf("cannot convert obj to *apiworkload.ServiceProfileDescriptor")
   315  		return
   316  	}
   317  
   318  	if util.GetSPDHash(oldSPD) != util.GetSPDHash(newSPD) {
   319  		c.enqueueCNCForSPD(newSPD)
   320  	}
   321  }
   322  
   323  func (c *cncCacheController) addCNC(obj interface{}) {
   324  	cnc, ok := obj.(*configapis.CustomNodeConfig)
   325  	if !ok {
   326  		general.Errorf("cannot convert obj to *configapis.CustomNodeConfig")
   327  		return
   328  	}
   329  
   330  	c.enqueueCNC(cnc)
   331  }
   332  
   333  func (c *cncCacheController) updateCNC(oldObj interface{}, newObj interface{}) {
   334  	oldCNC, ok := oldObj.(*configapis.CustomNodeConfig)
   335  	if !ok {
   336  		general.Errorf("cannot convert obj to *configapis.CustomNodeConfig")
   337  		return
   338  	}
   339  
   340  	newCNC, ok := newObj.(*configapis.CustomNodeConfig)
   341  	if !ok {
   342  		general.Errorf("cannot convert obj to *configapis.CustomNodeConfig")
   343  		return
   344  	}
   345  
   346  	if !apiequality.Semantic.DeepEqual(oldCNC.Status.ServiceProfileConfigList,
   347  		newCNC.Status.ServiceProfileConfigList) {
   348  		c.enqueueCNC(newCNC)
   349  	}
   350  }
   351  
   352  func (c *cncCacheController) enqueueCNCForSPD(spd *apiworkload.ServiceProfileDescriptor) {
   353  	if util.GetSPDHash(spd) == "" {
   354  		return
   355  	}
   356  
   357  	podList, err := util.GetPodListForSPD(spd, c.podIndexer, c.conf.SPDPodLabelIndexerKeys,
   358  		c.workloadLister, c.podLister)
   359  	if err != nil {
   360  		return
   361  	}
   362  
   363  	for _, pod := range podList {
   364  		if pod == nil {
   365  			continue
   366  		}
   367  
   368  		c.enqueueCNCForPod(pod)
   369  	}
   370  }
   371  
   372  func (c *cncCacheController) enqueueCNCForPod(pod *v1.Pod) {
   373  	if pod.Spec.NodeName == "" {
   374  		return
   375  	}
   376  
   377  	cnc, err := c.cncLister.Get(pod.Spec.NodeName)
   378  	if err != nil {
   379  		return
   380  	}
   381  
   382  	c.enqueueCNC(cnc)
   383  }
   384  
   385  func (c *cncCacheController) enqueueCNC(cnc *configapis.CustomNodeConfig) {
   386  	if cnc == nil {
   387  		general.Warningf("trying to enqueue a nil cnc")
   388  		return
   389  	}
   390  
   391  	c.cncSyncQueue.Add(cnc.Name)
   392  }
   393  
   394  func (c *cncCacheController) getSPDMapForCNC(cnc *configapis.CustomNodeConfig) (map[string]*apiworkload.ServiceProfileDescriptor, error) {
   395  	podList, err := native.GetPodsAssignedToNode(cnc.Name, c.podIndexer)
   396  	if err != nil {
   397  		return nil, err
   398  	}
   399  
   400  	spdMap := make(map[string]*apiworkload.ServiceProfileDescriptor)
   401  	for _, pod := range podList {
   402  		if native.PodIsTerminated(pod) {
   403  			continue
   404  		}
   405  
   406  		spd, err := util.GetSPDForPod(pod, c.spdIndexer, c.workloadGVKLister, c.spdLister, false)
   407  		if err != nil && !errors.IsNotFound(err) {
   408  			return nil, err
   409  		}
   410  
   411  		if spd == nil {
   412  			continue
   413  		}
   414  
   415  		spdKey := native.GenerateUniqObjectNameKey(spd)
   416  		spdMap[spdKey] = spd
   417  	}
   418  
   419  	return spdMap, nil
   420  }
   421  
   422  func (c *cncCacheController) patchCNC(cnc *configapis.CustomNodeConfig, setFunc func(*configapis.CustomNodeConfig)) (*configapis.CustomNodeConfig, error) {
   423  	cncCopy := cnc.DeepCopy()
   424  	setFunc(cncCopy)
   425  	if apiequality.Semantic.DeepEqual(cnc, cncCopy) {
   426  		return cnc, nil
   427  	}
   428  
   429  	general.Infof("cnc %s config changed need to patch", cnc.GetName())
   430  	return c.cncControl.PatchCNCStatus(c.ctx, cnc.Name, cnc, cncCopy)
   431  }
   432  
   433  func applySPDTargetConfigToCNC(cnc *configapis.CustomNodeConfig,
   434  	spd *apiworkload.ServiceProfileDescriptor,
   435  ) {
   436  	if cnc == nil || spd == nil {
   437  		return
   438  	}
   439  
   440  	idx := 0
   441  	serviceProfileConfigList := cnc.Status.ServiceProfileConfigList
   442  	// find target config
   443  	for ; idx < len(serviceProfileConfigList); idx++ {
   444  		if serviceProfileConfigList[idx].ConfigNamespace == spd.Namespace &&
   445  			serviceProfileConfigList[idx].ConfigName == spd.Name {
   446  			break
   447  		}
   448  	}
   449  
   450  	targetConfig := configapis.TargetConfig{
   451  		ConfigNamespace: spd.Namespace,
   452  		ConfigName:      spd.Name,
   453  		Hash:            util.GetSPDHash(spd),
   454  	}
   455  
   456  	// update target config if the spd config is already existed
   457  	if idx < len(serviceProfileConfigList) {
   458  		serviceProfileConfigList[idx] = targetConfig
   459  	} else {
   460  		serviceProfileConfigList = append(serviceProfileConfigList, targetConfig)
   461  		cnc.Status.ServiceProfileConfigList = serviceProfileConfigList
   462  	}
   463  }