github.com/cilium/cilium@v1.16.2/operator/watchers/node_taint.go

// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package watchers

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"sync"

	"github.com/sirupsen/logrus"
	corev1 "k8s.io/api/core/v1"
	k8sErrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	k8sTypes "k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"

	"github.com/cilium/cilium/operator/option"
	"github.com/cilium/cilium/pkg/controller"
	"github.com/cilium/cilium/pkg/k8s"
	k8sClient "github.com/cilium/cilium/pkg/k8s/client"
	"github.com/cilium/cilium/pkg/k8s/informer"
	slim_corev1 "github.com/cilium/cilium/pkg/k8s/slim/k8s/api/core/v1"
	slim_metav1 "github.com/cilium/cilium/pkg/k8s/slim/k8s/apis/meta/v1"
	k8sUtils "github.com/cilium/cilium/pkg/k8s/utils"
	"github.com/cilium/cilium/pkg/logging/logfields"
	pkgOption "github.com/cilium/cilium/pkg/option"
)

const (
	hostnameIndexer = "hostname-indexer"

	// ciliumNodeConditionReason is the reason used by Cilium for the node
	// condition it sets once networking has been set up on the node.
	ciliumNodeConditionReason = "CiliumIsUp"
)

var (
	// ciliumPodsStore contains all Cilium pods running in the cluster
	ciliumPodsStore = cache.NewIndexer(cache.DeletionHandlingMetaNamespaceKeyFunc, ciliumIndexers)

	// ciliumIndexers will index Cilium pods by namespace/name and hostname.
	ciliumIndexers = cache.Indexers{
		cache.NamespaceIndex: cache.MetaNamespaceIndexFunc,
		hostnameIndexer:      hostNameIndexFunc,
	}

	errNoPod = errors.New("object is not a *slim_corev1.Pod")

	queueKeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc

	ctrlMgr = controller.NewManager()

	mno markNodeOptions

	markK8sNodeControllerGroup = controller.NewGroup("mark-k8s-node-taints-conditions")
)

func checkTaintForNextNodeItem(c kubernetes.Interface, nodeGetter slimNodeGetter, workQueue workqueue.RateLimitingInterface) bool {
	// Get the next 'key' from the queue.
	key, quit := workQueue.Get()
	if quit {
		return false
	}
	// Done marks the item as done processing; if it has been marked dirty
	// again while it was being processed, it will be re-added to the queue
	// for re-processing.
	defer workQueue.Done(key)

	success := checkAndMarkNode(c, nodeGetter, key.(string), mno)
	if !success {
		workQueue.Forget(key)
		return true
	}

	// Forget tells the queue's rate limiter that this key has been handled,
	// clearing any backoff state that may have accumulated for it.
	workQueue.Forget(key)
	return true
}

// checkAndMarkNode checks whether a Cilium pod is scheduled and running on the
// node and updates the node's taints / conditions accordingly.
func checkAndMarkNode(c kubernetes.Interface, nodeGetter slimNodeGetter, nodeName string, options markNodeOptions) bool {
	node, err := nodeGetter.GetK8sSlimNode(nodeName)
	if node == nil || err != nil {
		return false
	}

	// should we remove the taint?
	scheduled, running := nodeHasCiliumPod(node.GetName())
	if running {
		if (options.RemoveNodeTaint && hasAgentNotReadyTaint(node)) ||
			(options.SetCiliumIsUpCondition && !HasCiliumIsUpCondition(node)) {
			log.WithFields(logrus.Fields{
				logfields.NodeName: node.GetName(),
			}).Info("Cilium pod running for node; marking accordingly")

			markNode(c, nodeGetter, node.GetName(), options, true)
		}
	} else if scheduled { // Taint nodes where the pod is scheduled but not running
		if options.SetNodeTaint && !hasAgentNotReadyTaint(node) {
			log.WithFields(logrus.Fields{
				logfields.NodeName: node.GetName(),
			}).Info("Cilium pod scheduled but not running for node; setting taint")
			markNode(c, nodeGetter, node.GetName(), options, false)
		}
	}
	return true
}
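
// In short: a ready Cilium pod on the node clears the agent-not-ready taint
// and/or sets the CiliumIsUp condition, while a pod that is scheduled but not
// (yet) ready re-applies the taint, in each case only if the corresponding
// option is enabled.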

// ciliumPodsWatcher starts up a pod watcher to handle pod events.
func ciliumPodsWatcher(wg *sync.WaitGroup, clientset k8sClient.Clientset, stopCh <-chan struct{}) {
	ciliumQueue := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cilium-pod-queue")

	ciliumPodInformer := informer.NewInformerWithStore(
		k8sUtils.ListerWatcherWithModifier(
			k8sUtils.ListerWatcherFromTyped[*slim_corev1.PodList](
				clientset.Slim().CoreV1().Pods(option.Config.CiliumK8sNamespace),
			),
			func(options *metav1.ListOptions) {
				options.LabelSelector = option.Config.CiliumPodLabels
			}),
		&slim_corev1.Pod{},
		0,
		cache.ResourceEventHandlerFuncs{
			AddFunc: func(obj interface{}) {
				key, _ := queueKeyFunc(obj)
				ciliumQueue.Add(key)
			},
			UpdateFunc: func(_, newObj interface{}) {
				key, _ := queueKeyFunc(newObj)
				ciliumQueue.Add(key)
			},
		},
		transformToCiliumPod,
		ciliumPodsStore,
	)

	nodeGetter := &nodeGetter{}

	wg.Add(1)
	go func() {
		defer wg.Done()
		// Do not use the k8sClient provided by the nodesInit function since we
		// need a k8s client that can update node structures and not simply
		// watch for node events.
		for processNextCiliumPodItem(clientset, nodeGetter, ciliumQueue) {
		}
	}()

	wg.Add(1)
	go func() {
		defer wg.Done()
		defer ciliumQueue.ShutDown()

		ciliumPodInformer.Run(stopCh)
	}()
}
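
// The informer stores stripped-down pod objects in ciliumPodsStore and only
// pushes their namespace/name keys onto the queue; processNextCiliumPodItem
// then looks each pod up in the store again, so the queue itself never holds
// pod objects.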

func processNextCiliumPodItem(c kubernetes.Interface, nodeGetter slimNodeGetter, workQueue workqueue.RateLimitingInterface) bool {
	// Get the next 'key' from the queue.
	key, quit := workQueue.Get()
	if quit {
		return false
	}
	// Done marks the item as done processing; if it has been marked dirty
	// again while it was being processed, it will be re-added to the queue
	// for re-processing.
	defer workQueue.Done(key)

	podInterface, exists, err := ciliumPodsStore.GetByKey(key.(string))
	if err != nil && !k8sErrors.IsNotFound(err) {
		return true
	}
	if !exists || podInterface == nil {
		workQueue.Forget(key)
		return true
	}

	pod := podInterface.(*slim_corev1.Pod)
	nodeName := pod.Spec.NodeName

	success := checkAndMarkNode(c, nodeGetter, nodeName, mno)
	if !success {
		workQueue.Forget(key)
		return true
	}

	// Forget tells the queue's rate limiter that this key has been handled,
	// clearing any backoff state that may have accumulated for it.
	workQueue.Forget(key)
	return true
}

// nodeHasCiliumPod determines whether the node has a Cilium agent pod
// scheduled on it, and whether that pod is running and ready.
func nodeHasCiliumPod(nodeName string) (scheduled bool, ready bool) {
	ciliumPodsInNode, err := ciliumPodsStore.ByIndex(hostnameIndexer, nodeName)
	if err != nil {
		return false, false
	}
	if len(ciliumPodsInNode) == 0 {
		return false, false
	}
	for _, ciliumPodInterface := range ciliumPodsInNode {
		ciliumPod := ciliumPodInterface.(*slim_corev1.Pod)
		if ciliumPod.DeletionTimestamp != nil { // even if the pod is running, it will be down shortly
			continue
		}
		if k8sUtils.GetLatestPodReadiness(ciliumPod.Status) == slim_corev1.ConditionTrue {
			return true, true
		}
	}
	return true, false
}
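
// A terminating pod (DeletionTimestamp set) is deliberately skipped above and
// therefore counts as scheduled but not ready, so the taint is not removed on
// the strength of an agent pod that is about to go away.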

// hasAgentNotReadyTaint returns true if the given node has the Cilium Agent
// Not Ready Node Taint.
func hasAgentNotReadyTaint(k8sNode *slim_corev1.Node) bool {
	for _, taint := range k8sNode.Spec.Taints {
		if taint.Key == pkgOption.Config.AgentNotReadyNodeTaintValue() {
			return true
		}
	}
	return false
}
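
// The taint key compared here comes from AgentNotReadyNodeTaintValue(); by
// default this is typically "node.cilium.io/agent-not-ready". Illustratively,
// such a taint on a Node object looks like (assuming the default key):
//
//	spec:
//	  taints:
//	  - key: node.cilium.io/agent-not-ready
//	    effect: NoSchedule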

// hostNameIndexFunc indexes pods by node name.
func hostNameIndexFunc(obj interface{}) ([]string, error) {
	switch t := obj.(type) {
	case *slim_corev1.Pod:
		return []string{t.Spec.NodeName}, nil
	}
	return nil, fmt.Errorf("%w - found %T", errNoPod, obj)
}

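// transformToCiliumPod strips an incoming pod object down to the few fields
// this watcher reads (name, namespace, resource version, spec.nodeName and
// status conditions) before it is stored, keeping the indexer's memory
// footprint small; the original object is then zeroed so it can be garbage
// collected.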
func transformToCiliumPod(obj interface{}) (interface{}, error) {
	switch concreteObj := obj.(type) {
	case *slim_corev1.Pod:
		p := &slim_corev1.Pod{
			TypeMeta: concreteObj.TypeMeta,
			ObjectMeta: slim_metav1.ObjectMeta{
				Name:            concreteObj.Name,
				Namespace:       concreteObj.Namespace,
				ResourceVersion: concreteObj.ResourceVersion,
			},
			Spec: slim_corev1.PodSpec{
				NodeName: concreteObj.Spec.NodeName,
			},
			Status: slim_corev1.PodStatus{
				Conditions: concreteObj.Status.Conditions,
			},
		}
		*concreteObj = slim_corev1.Pod{}
		return p, nil
	case cache.DeletedFinalStateUnknown:
		pod, ok := concreteObj.Obj.(*slim_corev1.Pod)
		if !ok {
			return nil, fmt.Errorf("unknown object type %T", concreteObj.Obj)
		}
		dfsu := cache.DeletedFinalStateUnknown{
			Key: concreteObj.Key,
			Obj: &slim_corev1.Pod{
				TypeMeta: pod.TypeMeta,
				ObjectMeta: slim_metav1.ObjectMeta{
					Name:            pod.Name,
					Namespace:       pod.Namespace,
					ResourceVersion: pod.ResourceVersion,
				},
				Spec: slim_corev1.PodSpec{
					NodeName: pod.Spec.NodeName,
				},
				Status: slim_corev1.PodStatus{
					Conditions: pod.Status.Conditions,
				},
			},
		}
		// Small GC optimization
		*pod = slim_corev1.Pod{}
		return dfsu, nil
	default:
		return nil, fmt.Errorf("unknown object type %T", concreteObj)
	}
}

// setNodeNetworkUnavailableFalse sets the Kubernetes NodeNetworkUnavailable
// condition to false, as Cilium is managing the network connectivity.
// https://kubernetes.io/docs/concepts/architecture/nodes/#condition
// Some clusters (notably GCP) come up with the NodeNetworkUnavailable
// condition set, and the network provider is expected to clear it once
// networking is ready.
func setNodeNetworkUnavailableFalse(ctx context.Context, c kubernetes.Interface, nodeGetter slimNodeGetter, nodeName string) error {
	n, err := nodeGetter.GetK8sSlimNode(nodeName)
	if err != nil {
		return err
	}

	if HasCiliumIsUpCondition(n) {
		return nil
	}

	now := metav1.Now()
	condition := corev1.NodeCondition{
		Type:               corev1.NodeNetworkUnavailable,
		Status:             corev1.ConditionFalse,
		Reason:             ciliumNodeConditionReason,
		Message:            "Cilium is running on this node",
		LastTransitionTime: now,
		LastHeartbeatTime:  now,
	}
	raw, err := json.Marshal(&[]corev1.NodeCondition{condition})
	if err != nil {
		return err
	}
	patch := []byte(fmt.Sprintf(`{"status":{"conditions":%s}}`, raw))
	_, err = c.CoreV1().Nodes().PatchStatus(ctx, nodeName, patch)
	if err != nil {
		log.WithField(logfields.NodeName, nodeName).WithError(err).Info("Failed to patch node while setting condition")
	}
	return err
}
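
// For illustration, the resulting status patch looks roughly like this
// (timestamps elided):
//
//	{"status":{"conditions":[{"type":"NetworkUnavailable","status":"False",
//	 "reason":"CiliumIsUp","message":"Cilium is running on this node", ...}]}}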

// HasCiliumIsUpCondition returns true if the given k8s node has the Cilium
// node condition set.
func HasCiliumIsUpCondition(n *slim_corev1.Node) bool {
	for _, condition := range n.Status.Conditions {
		if condition.Type == slim_corev1.NodeNetworkUnavailable &&
			condition.Status == slim_corev1.ConditionFalse &&
			condition.Reason == ciliumNodeConditionReason {
			return true
		}
	}
	return false
}

// removeNodeTaint removes the AgentNotReadyNodeTaint, allowing pods to be
// scheduled once Cilium is set up. Mostly used on cloud providers to prevent
// existing CNI plugins from managing pods.
func removeNodeTaint(ctx context.Context, c kubernetes.Interface, nodeGetter slimNodeGetter, nodeName string) error {
	k8sNode, err := nodeGetter.GetK8sSlimNode(nodeName)
	if err != nil {
		return err
	}

	var taintFound bool

	var taints []slim_corev1.Taint
	for _, taint := range k8sNode.Spec.Taints {
		if taint.Key != pkgOption.Config.AgentNotReadyNodeTaintValue() {
			taints = append(taints, taint)
		} else {
			taintFound = true
		}
	}

	// No Cilium taint found
	if !taintFound {
		log.WithFields(logrus.Fields{
			logfields.NodeName: nodeName,
			"taint":            pkgOption.Config.AgentNotReadyNodeTaintValue(),
		}).Debug("Taint not found in node")
		return nil
	}
	log.WithFields(logrus.Fields{
		logfields.NodeName: nodeName,
		"taint":            pkgOption.Config.AgentNotReadyNodeTaintValue(),
	}).Debug("Removing Node Taint")

	createStatusAndNodePatch := []k8s.JSONPatch{
		{
			OP:    "test",
			Path:  "/spec/taints",
			Value: k8sNode.Spec.Taints,
		},
		{
			OP:    "replace",
			Path:  "/spec/taints",
			Value: taints,
		},
	}

	patch, err := json.Marshal(createStatusAndNodePatch)
	if err != nil {
		return err
	}

	_, err = c.CoreV1().Nodes().Patch(ctx, nodeName, k8sTypes.JSONPatchType, patch, metav1.PatchOptions{})
	if err != nil {
		log.WithField(logfields.NodeName, nodeName).WithError(err).Info("Failed to patch node while removing taint")
	}
	return err
}
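
// The "test" operation in the patch above makes it conditional: if the node's
// taints changed between reading them from the store and issuing the Patch
// call, the API server rejects the whole patch rather than overwriting a
// concurrent update, and the error propagates back to the controller created
// in markNode.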

// setNodeTaint sets the AgentNotReady taint on a node
func setNodeTaint(ctx context.Context, c kubernetes.Interface, nodeGetter slimNodeGetter, nodeName string) error {
	k8sNode, err := nodeGetter.GetK8sSlimNode(nodeName)
	if err != nil {
		return err
	}

	taintFound := false

	taints := append([]slim_corev1.Taint{}, k8sNode.Spec.Taints...)
	for _, taint := range k8sNode.Spec.Taints {
		if taint.Key == pkgOption.Config.AgentNotReadyNodeTaintValue() {
			taintFound = true
			break
		}
	}

	if taintFound {
		log.WithFields(logrus.Fields{
			logfields.NodeName: nodeName,
			"taint":            pkgOption.Config.AgentNotReadyNodeTaintValue(),
		}).Debug("Taint already set in node; skipping")
		return nil
	}
	log.WithFields(logrus.Fields{
		logfields.NodeName: nodeName,
		"taint":            pkgOption.Config.AgentNotReadyNodeTaintValue(),
	}).Debug("Setting Node Taint")

	taints = append(taints, slim_corev1.Taint{
		Key:    pkgOption.Config.AgentNotReadyNodeTaintValue(), // the function says value, but it's really a key
		Value:  "",
		Effect: slim_corev1.TaintEffectNoSchedule,
	})

	createStatusAndNodePatch := []k8s.JSONPatch{
		{
			OP:    "test",
			Path:  "/spec/taints",
			Value: k8sNode.Spec.Taints,
		},
		{
			OP:    "replace",
			Path:  "/spec/taints",
			Value: taints,
		},
	}

	patch, err := json.Marshal(createStatusAndNodePatch)
	if err != nil {
		return err
	}

	_, err = c.CoreV1().Nodes().Patch(ctx, nodeName, k8sTypes.JSONPatchType, patch, metav1.PatchOptions{})
	if err != nil {
		log.WithField(logfields.NodeName, nodeName).WithError(err).Info("Failed to patch node while adding taint")
	}
	return err
}
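
// For illustration, for a node that already carries one unrelated taint, the
// marshalled JSON patch looks roughly like the following (assuming the
// default agent-not-ready key; field details simplified):
//
//	[
//	 {"op":"test","path":"/spec/taints","value":[{"key":"existing","effect":"NoSchedule"}]},
//	 {"op":"replace","path":"/spec/taints","value":[
//	  {"key":"existing","effect":"NoSchedule"},
//	  {"key":"node.cilium.io/agent-not-ready","effect":"NoSchedule"}]}
//	]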

type markNodeOptions struct {
	RemoveNodeTaint        bool
	SetNodeTaint           bool
	SetCiliumIsUpCondition bool
}
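
// These markNodeOptions fields are populated in HandleNodeTolerationAndTaints
// from the operator options RemoveCiliumNodeTaints, SetCiliumNodeTaints and
// SetCiliumIsUpCondition.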

// markNode marks the Kubernetes node according to the options it has been
// passed and whether the Cilium pod on that node is running.
func markNode(c kubernetes.Interface, nodeGetter slimNodeGetter, nodeName string, options markNodeOptions, running bool) {
	ctrlName := fmt.Sprintf("mark-k8s-node-%s-taints-conditions", nodeName)

	ctrlMgr.UpdateController(ctrlName,
		controller.ControllerParams{
			Group: markK8sNodeControllerGroup,
			DoFunc: func(ctx context.Context) error {
				if running && options.RemoveNodeTaint {
					err := removeNodeTaint(ctx, c, nodeGetter, nodeName)
					if err != nil {
						return err
					}
				}
				if running && options.SetCiliumIsUpCondition {
					err := setNodeNetworkUnavailableFalse(ctx, c, nodeGetter, nodeName)
					if err != nil {
						return err
					}
				}
				if !running && options.SetNodeTaint {
					err := setNodeTaint(ctx, c, nodeGetter, nodeName)
					if err != nil {
						return err
					}
				}

				return nil
			},
		})
}
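
// Each node gets its own controller name ("mark-k8s-node-<name>-taints-conditions"),
// so a failing patch for one node can be retried by the controller machinery
// without blocking the queues that feed markNode for other nodes.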

// HandleNodeTolerationAndTaints starts the watchers that manage the
// agent-not-ready node taint and the CiliumIsUp node condition according to
// the operator configuration.
func HandleNodeTolerationAndTaints(wg *sync.WaitGroup, clientset k8sClient.Clientset, stopCh <-chan struct{}) {
	mno = markNodeOptions{
		RemoveNodeTaint:        option.Config.RemoveCiliumNodeTaints,
		SetNodeTaint:           option.Config.SetCiliumNodeTaints,
		SetCiliumIsUpCondition: option.Config.SetCiliumIsUpCondition,
	}
	nodesInit(wg, clientset.Slim(), stopCh)

	wg.Add(1)
	go func() {
		defer wg.Done()
		// Do not use the k8sClient provided by the nodesInit function since we
		// need a k8s client that can update node structures and not simply
		// watch for node events.
		for checkTaintForNextNodeItem(clientset, &nodeGetter{}, nodeQueue) {
		}
	}()

	ciliumPodsWatcher(wg, clientset, stopCh)
}