github.com/kubewharf/katalyst-core@v0.5.3/pkg/controller/lifecycle/cnr.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package lifecycle
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"time"
    23  
    24  	corev1 "k8s.io/api/core/v1"
    25  	apiequality "k8s.io/apimachinery/pkg/api/equality"
    26  	"k8s.io/apimachinery/pkg/api/errors"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  	"k8s.io/apimachinery/pkg/labels"
    29  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    30  	"k8s.io/apimachinery/pkg/util/wait"
    31  	coreinformers "k8s.io/client-go/informers/core/v1"
    32  	corelisters "k8s.io/client-go/listers/core/v1"
    33  	"k8s.io/client-go/tools/cache"
    34  	"k8s.io/client-go/util/workqueue"
    35  	"k8s.io/klog/v2"
    36  
    37  	apis "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1"
    38  	informers "github.com/kubewharf/katalyst-api/pkg/client/informers/externalversions/node/v1alpha1"
    39  	listers "github.com/kubewharf/katalyst-api/pkg/client/listers/node/v1alpha1"
    40  	"github.com/kubewharf/katalyst-core/pkg/client"
    41  	"github.com/kubewharf/katalyst-core/pkg/client/control"
    42  	"github.com/kubewharf/katalyst-core/pkg/config/controller"
    43  	"github.com/kubewharf/katalyst-core/pkg/config/generic"
    44  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    45  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    46  )
    47  
    48  const (
    49  	cnrLifecycleControllerName = "cnr-lifecycle"
    50  	cnrLifeCycleWorkerCount    = 1
    51  )
    52  
    53  const (
    54  	clearCNRPeriod = 30 * time.Second
    55  )
    56  
    57  type CNRLifecycle struct {
    58  	ctx context.Context
    59  
    60  	client     *client.GenericClientSet
    61  	cnrControl control.CNRControl
    62  
    63  	nodeListerSynced cache.InformerSynced
    64  	nodeLister       corelisters.NodeLister
    65  	cnrListerSynced  cache.InformerSynced
    66  	cnrLister        listers.CustomNodeResourceLister
    67  
    68  	// queue for node
    69  	syncQueue workqueue.RateLimitingInterface
    70  
    71  	// metricsEmitter for emit metrics
    72  	metricsEmitter metrics.MetricEmitter
    73  }
    74  
    75  func NewCNRLifecycle(ctx context.Context,
    76  	genericConf *generic.GenericConfiguration,
    77  	_ *controller.GenericControllerConfiguration,
    78  	_ *controller.CNRLifecycleConfig,
    79  	client *client.GenericClientSet,
    80  	nodeInformer coreinformers.NodeInformer,
    81  	cnrInformer informers.CustomNodeResourceInformer,
    82  	metricsEmitter metrics.MetricEmitter,
    83  ) (*CNRLifecycle, error) {
    84  	cnrLifecycle := &CNRLifecycle{
    85  		ctx:    ctx,
    86  		client: client,
    87  		syncQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(),
    88  			cnrLifecycleControllerName),
    89  	}
    90  
    91  	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
    92  		AddFunc:    cnrLifecycle.addNodeEventHandle,
    93  		UpdateFunc: cnrLifecycle.updateNodeEventHandle,
    94  	})
    95  	cnrLifecycle.nodeListerSynced = nodeInformer.Informer().HasSynced
    96  	cnrLifecycle.nodeLister = nodeInformer.Lister()
    97  
    98  	cnrInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
    99  		AddFunc:    cnrLifecycle.addCNREventHandle,
   100  		UpdateFunc: cnrLifecycle.updateCNREventHandle,
   101  		DeleteFunc: cnrLifecycle.deleteCNREventHandle,
   102  	})
   103  	cnrLifecycle.cnrLister = cnrInformer.Lister()
   104  	cnrLifecycle.cnrListerSynced = cnrInformer.Informer().HasSynced
   105  
   106  	if metricsEmitter == nil {
   107  		cnrLifecycle.metricsEmitter = metrics.DummyMetrics{}
   108  	} else {
   109  		cnrLifecycle.metricsEmitter = metricsEmitter.WithTags(cnrLifecycleControllerName)
   110  	}
   111  
   112  	cnrLifecycle.cnrControl = control.DummyCNRControl{}
   113  	if !genericConf.DryRun {
   114  		cnrLifecycle.cnrControl = control.NewCNRControlImpl(client.InternalClient)
   115  	}
   116  
   117  	return cnrLifecycle, nil
   118  }
   119  
   120  func (cl *CNRLifecycle) Run() {
   121  	defer utilruntime.HandleCrash()
   122  	defer cl.syncQueue.ShutDown()
   123  
   124  	defer klog.Infof("Shutting down %s controller", cnrLifecycleControllerName)
   125  
   126  	if !cache.WaitForCacheSync(cl.ctx.Done(), cl.nodeListerSynced, cl.cnrListerSynced) {
   127  		utilruntime.HandleError(fmt.Errorf("unable to sync caches for %s controller", cnrLifecycleControllerName))
   128  		return
   129  	}
   130  	klog.Infof("Caches are synced for %s controller", cnrLifecycleControllerName)
   131  	klog.Infof("start %d workers for %s controller", cnrLifeCycleWorkerCount, cnrLifecycleControllerName)
   132  
   133  	go wait.Until(cl.clearUnexpectedCNR, clearCNRPeriod, cl.ctx.Done())
   134  	for i := 0; i < cnrLifeCycleWorkerCount; i++ {
   135  		go wait.Until(cl.worker, time.Second, cl.ctx.Done())
   136  	}
   137  
   138  	<-cl.ctx.Done()
   139  }
   140  
   141  func (cl *CNRLifecycle) addNodeEventHandle(obj interface{}) {
   142  	n, ok := obj.(*corev1.Node)
   143  	if !ok {
   144  		klog.Errorf("cannot convert obj to *corev1.Node: %v", obj)
   145  		return
   146  	}
   147  	klog.V(4).Infof("notice addition of Node %s", n.Name)
   148  	cl.enqueueWorkItem(n)
   149  }
   150  
   151  func (cl *CNRLifecycle) updateNodeEventHandle(old, cur interface{}) {
   152  	oldNode, ok := old.(*corev1.Node)
   153  	if !ok {
   154  		klog.Errorf("cannot convert oldObj to *corev1.Node: %v", old)
   155  		return
   156  	}
   157  
   158  	curNode, ok := cur.(*corev1.Node)
   159  	if !ok {
   160  		klog.Errorf("cannot convert curObj to *corev1.Node: %v", cur)
   161  		return
   162  	}
   163  
   164  	if curNode.Labels == nil {
   165  		return
   166  	}
   167  
   168  	if !general.CheckMapEqual(oldNode.Labels, curNode.Labels) {
   169  		cl.enqueueWorkItem(curNode)
   170  	}
   171  }
   172  
   173  func (cl *CNRLifecycle) addCNREventHandle(obj interface{}) {
   174  	c, ok := obj.(*apis.CustomNodeResource)
   175  	if !ok {
   176  		klog.Errorf("cannot convert obj to *apis.CNR: %v", obj)
   177  		return
   178  	}
   179  	klog.V(4).Infof("notice addition of cnr %s", c.Name)
   180  
   181  	cl.enqueueWorkItem(obj)
   182  }
   183  
   184  func (cl *CNRLifecycle) updateCNREventHandle(_, new interface{}) {
   185  	c, ok := new.(*apis.CustomNodeResource)
   186  	if !ok {
   187  		klog.Errorf("cannot convert newObj to *apis.CNR: %v", c)
   188  		return
   189  	}
   190  	klog.V(4).Infof("notice addition of cnr %s", c.Name)
   191  
   192  	cl.enqueueWorkItem(new)
   193  }
   194  
   195  func (cl *CNRLifecycle) deleteCNREventHandle(obj interface{}) {
   196  	c, ok := obj.(*apis.CustomNodeResource)
   197  	if !ok {
   198  		klog.Errorf("cannot convert oldObj to *apis.CNR: %v", c)
   199  		return
   200  	}
   201  	klog.V(4).Infof("notice addition of cnr %s", c.Name)
   202  
   203  	cl.enqueueWorkItem(obj)
   204  }
   205  
   206  func (cl *CNRLifecycle) worker() {
   207  	for cl.processNextWorkItem() {
   208  	}
   209  }
   210  
   211  // processNextWorkItem dequeues items, processes them, and marks them done.
   212  // It enforces that the sync is never invoked concurrently with the same key.
   213  func (cl *CNRLifecycle) processNextWorkItem() bool {
   214  	key, quit := cl.syncQueue.Get()
   215  	if quit {
   216  		return false
   217  	}
   218  	defer cl.syncQueue.Done(key)
   219  
   220  	err := cl.sync(key.(string))
   221  	if err == nil {
   222  		cl.syncQueue.Forget(key)
   223  		return true
   224  	}
   225  
   226  	utilruntime.HandleError(fmt.Errorf("sync %q failed with %v", key, err))
   227  	cl.syncQueue.AddRateLimited(key)
   228  
   229  	return true
   230  }
   231  
   232  // enqueueWorkItem enqueues the given node in the work queue.
   233  func (cl *CNRLifecycle) enqueueWorkItem(obj interface{}) {
   234  	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
   235  	if err != nil {
   236  		utilruntime.HandleError(fmt.Errorf("Cound't get key for object %+v: %v", obj, err))
   237  		return
   238  	}
   239  	cl.syncQueue.Add(key)
   240  }
   241  
   242  // sync syncs the given node.
   243  func (cl *CNRLifecycle) sync(key string) error {
   244  	_, name, err := cache.SplitMetaNamespaceKey(key)
   245  	if err != nil {
   246  		return err
   247  	}
   248  	node, err := cl.nodeLister.Get(name)
   249  	if errors.IsNotFound(err) {
   250  		klog.Infof("node has been deleted %v", key)
   251  		return nil
   252  	}
   253  	if err != nil {
   254  		return err
   255  	}
   256  
   257  	err = cl.updateOrCreateCNR(node)
   258  	if err != nil {
   259  		return err
   260  	}
   261  
   262  	return nil
   263  }
   264  
   265  // clearUnexpectedCNR is used to clear unexpected cnr
   266  // for instance, orphaned cnr due to unexpected node deletion options or manually creation
   267  func (cl *CNRLifecycle) clearUnexpectedCNR() {
   268  	targetCNRSelector := labels.Everything()
   269  	cnrs, err := cl.cnrLister.List(targetCNRSelector)
   270  	if err != nil {
   271  		klog.Errorf("failed to list all cnr")
   272  		return
   273  	}
   274  
   275  	for _, cnr := range cnrs {
   276  		_, err := cl.nodeLister.Get(cnr.Name)
   277  		if errors.IsNotFound(err) {
   278  			// double check if this node is deleted
   279  			_, nErr := cl.client.KubeClient.CoreV1().Nodes().Get(cl.ctx, cnr.Name, metav1.GetOptions{ResourceVersion: "0"})
   280  			if !errors.IsNotFound(nErr) {
   281  				continue
   282  			}
   283  
   284  			if dErr := cl.cnrControl.DeleteCNR(cl.ctx, cnr.Name); dErr != nil {
   285  				klog.Errorf("delete unexpected cnr %s failed: %v", cnr.Name, dErr)
   286  			}
   287  			continue
   288  		} else if err != nil {
   289  			klog.Errorf("get node for CNR %v failed in clear: %v", cnr.Name, err)
   290  			continue
   291  		}
   292  	}
   293  }
   294  
   295  func (cl *CNRLifecycle) updateOrCreateCNR(node *corev1.Node) error {
   296  	cnr, err := cl.cnrLister.Get(node.Name)
   297  	if err != nil && !errors.IsNotFound(err) {
   298  		return fmt.Errorf("failed to get cnr from lister %s: %v", node.Name, err)
   299  	}
   300  	if errors.IsNotFound(err) {
   301  		cnr = &apis.CustomNodeResource{
   302  			ObjectMeta: metav1.ObjectMeta{
   303  				Name:   node.Name,
   304  				Labels: node.Labels,
   305  			},
   306  		}
   307  
   308  		setCNROwnerReference(cnr, node)
   309  		_, err = cl.cnrControl.CreateCNR(cl.ctx, cnr)
   310  		if err != nil && !errors.IsAlreadyExists(err) {
   311  			return fmt.Errorf("failed to create cnr %s: %v", cnr.Name, err)
   312  		}
   313  		if errors.IsAlreadyExists(err) {
   314  			cnr, err = cl.client.InternalClient.NodeV1alpha1().CustomNodeResources().Get(cl.ctx, node.Name, metav1.GetOptions{ResourceVersion: "0"})
   315  			if err != nil {
   316  				return fmt.Errorf("failed to get cnr from apiserver %s: %v", node.Name, err)
   317  			}
   318  		}
   319  	}
   320  
   321  	newCNR := cnr.DeepCopy()
   322  	newCNR.Labels = general.MergeMap(newCNR.Labels, node.Labels)
   323  	setCNROwnerReference(newCNR, node)
   324  	if apiequality.Semantic.DeepEqual(newCNR, cnr) {
   325  		return nil
   326  	}
   327  
   328  	_, err = cl.cnrControl.PatchCNRSpecAndMetadata(cl.ctx, cnr.Name, cnr, newCNR)
   329  	return err
   330  }
   331  
   332  func setCNROwnerReference(cnr *apis.CustomNodeResource, node *corev1.Node) {
   333  	if cnr == nil || node == nil {
   334  		return
   335  	}
   336  
   337  	blocker := true
   338  	cnr.OwnerReferences = []metav1.OwnerReference{
   339  		{
   340  			APIVersion:         "v1",
   341  			Kind:               "Node",
   342  			Name:               node.Name,
   343  			UID:                node.GetUID(),
   344  			Controller:         &blocker,
   345  			BlockOwnerDeletion: &blocker,
   346  		},
   347  	}
   348  }