github.com/cilium/cilium@v1.16.2/operator/watchers/node_taint.go

// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package watchers

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"sync"

	"github.com/sirupsen/logrus"
	corev1 "k8s.io/api/core/v1"
	k8sErrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	k8sTypes "k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"

	"github.com/cilium/cilium/operator/option"
	"github.com/cilium/cilium/pkg/controller"
	"github.com/cilium/cilium/pkg/k8s"
	k8sClient "github.com/cilium/cilium/pkg/k8s/client"
	"github.com/cilium/cilium/pkg/k8s/informer"
	slim_corev1 "github.com/cilium/cilium/pkg/k8s/slim/k8s/api/core/v1"
	slim_metav1 "github.com/cilium/cilium/pkg/k8s/slim/k8s/apis/meta/v1"
	k8sUtils "github.com/cilium/cilium/pkg/k8s/utils"
	"github.com/cilium/cilium/pkg/logging/logfields"
	pkgOption "github.com/cilium/cilium/pkg/option"
)

const (
	hostnameIndexer = "hostname-indexer"

	// ciliumNodeConditionReason is the condition reason used by Cilium to
	// mark that networking has been set up on the node.
	ciliumNodeConditionReason = "CiliumIsUp"
)

var (
	// ciliumPodsStore contains all Cilium pods running in the cluster
	ciliumPodsStore = cache.NewIndexer(cache.DeletionHandlingMetaNamespaceKeyFunc, ciliumIndexers)

	// ciliumIndexers will index Cilium pods by namespace/name and hostname.
	ciliumIndexers = cache.Indexers{
		cache.NamespaceIndex: cache.MetaNamespaceIndexFunc,
		hostnameIndexer:      hostNameIndexFunc,
	}

	errNoPod = errors.New("object is not a *slim_corev1.Pod")

	queueKeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc

	ctrlMgr = controller.NewManager()

	mno markNodeOptions

	markK8sNodeControllerGroup = controller.NewGroup("mark-k8s-node-taints-conditions")
)

func checkTaintForNextNodeItem(c kubernetes.Interface, nodeGetter slimNodeGetter, workQueue workqueue.RateLimitingInterface) bool {
	// Get the next 'key' from the queue.
	key, quit := workQueue.Get()
	if quit {
		return false
	}
	// Done marks the item as done processing; if it was marked dirty again
	// while it was being processed, it will be re-added to the queue for
	// re-processing.
	defer workQueue.Done(key)

	success := checkAndMarkNode(c, nodeGetter, key.(string), mno)
	if !success {
		workQueue.Forget(key)
		return true
	}

	// If the event was processed correctly then forget it from the queue,
	// which clears the rate limiter's backoff tracking for this key.
	workQueue.Forget(key)
	return true
}
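
// A minimal sketch (not part of the upstream file) of the client-go workqueue
// contract that checkTaintForNextNodeItem above and processNextCiliumPodItem
// below both follow: Get blocks until an item is available or the queue is
// shut down, Done must be paired with every Get, and Forget clears the rate
// limiter's retry backoff for the key. The helper name drainWorkQueue and the
// handle callback are illustrative assumptions, not identifiers used elsewhere
// in Cilium.
func drainWorkQueue(queue workqueue.RateLimitingInterface, handle func(key string) bool) {
	for {
		key, quit := queue.Get()
		if quit {
			return // the queue was shut down
		}
		if handle(key.(string)) {
			queue.Forget(key) // stop tracking retries for this key
		}
		queue.Done(key) // allow re-queued copies of this key to be delivered
	}
}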

// checkAndMarkNode checks whether the node has a Cilium pod in running state
// and, if so, sets the taints / conditions of the node accordingly.
func checkAndMarkNode(c kubernetes.Interface, nodeGetter slimNodeGetter, nodeName string, options markNodeOptions) bool {
	node, err := nodeGetter.GetK8sSlimNode(nodeName)
	if node == nil || err != nil {
		return false
	}

	// should we remove the taint?
	scheduled, running := nodeHasCiliumPod(node.GetName())
	if running {
		if (options.RemoveNodeTaint && hasAgentNotReadyTaint(node)) ||
			(options.SetCiliumIsUpCondition && !HasCiliumIsUpCondition(node)) {
			log.WithFields(logrus.Fields{
				logfields.NodeName: node.GetName(),
			}).Info("Cilium pod running for node; marking accordingly")

			markNode(c, nodeGetter, node.GetName(), options, true)
		}
	} else if scheduled { // Taint nodes where the pod is scheduled but not running
		if options.SetNodeTaint && !hasAgentNotReadyTaint(node) {
			log.WithFields(logrus.Fields{
				logfields.NodeName: node.GetName(),
			}).Info("Cilium pod scheduled but not running for node; setting taint")
			markNode(c, nodeGetter, node.GetName(), options, false)
		}
	}
	return true
}

// ciliumPodsWatcher starts up a pod watcher to handle pod events.
func ciliumPodsWatcher(wg *sync.WaitGroup, clientset k8sClient.Clientset, stopCh <-chan struct{}) {
	ciliumQueue := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cilium-pod-queue")

	ciliumPodInformer := informer.NewInformerWithStore(
		k8sUtils.ListerWatcherWithModifier(
			k8sUtils.ListerWatcherFromTyped[*slim_corev1.PodList](
				clientset.Slim().CoreV1().Pods(option.Config.CiliumK8sNamespace),
			),
			func(options *metav1.ListOptions) {
				options.LabelSelector = option.Config.CiliumPodLabels
			}),
		&slim_corev1.Pod{},
		0,
		cache.ResourceEventHandlerFuncs{
			AddFunc: func(obj interface{}) {
				key, _ := queueKeyFunc(obj)
				ciliumQueue.Add(key)
			},
			UpdateFunc: func(_, newObj interface{}) {
				key, _ := queueKeyFunc(newObj)
				ciliumQueue.Add(key)
			},
		},
		transformToCiliumPod,
		ciliumPodsStore,
	)

	nodeGetter := &nodeGetter{}

	wg.Add(1)
	go func() {
		defer wg.Done()
		// Do not use the k8sClient provided by the nodesInit function since we
		// need a k8s client that can update node structures and not simply
		// watch for node events.
		for processNextCiliumPodItem(clientset, nodeGetter, ciliumQueue) {
		}
	}()

	wg.Add(1)
	go func() {
		defer wg.Done()
		defer ciliumQueue.ShutDown()

		ciliumPodInformer.Run(stopCh)
	}()
}

func processNextCiliumPodItem(c kubernetes.Interface, nodeGetter slimNodeGetter, workQueue workqueue.RateLimitingInterface) bool {
	// Get the next 'key' from the queue.
	key, quit := workQueue.Get()
	if quit {
		return false
	}
	// Done marks the item as done processing; if it was marked dirty again
	// while it was being processed, it will be re-added to the queue for
	// re-processing.
	defer workQueue.Done(key)

	podInterface, exists, err := ciliumPodsStore.GetByKey(key.(string))
	if err != nil && !k8sErrors.IsNotFound(err) {
		return true
	}
	if !exists || podInterface == nil {
		workQueue.Forget(key)
		return true
	}

	pod := podInterface.(*slim_corev1.Pod)
	nodeName := pod.Spec.NodeName

	success := checkAndMarkNode(c, nodeGetter, nodeName, mno)
	if !success {
		workQueue.Forget(key)
		return true
	}

	// If the event was processed correctly then forget it from the queue,
	// which clears the rate limiter's backoff tracking for this key.
	workQueue.Forget(key)
	return true
}
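
// An illustrative helper (not part of the upstream file) showing how the
// hostname indexer registered in ciliumIndexers is queried: ByIndex returns
// every Cilium pod whose Spec.NodeName matches the given node. This is the
// same lookup nodeHasCiliumPod below performs; the helper name
// lookupCiliumPodsOnNode is an assumption for the sketch.
func lookupCiliumPodsOnNode(nodeName string) ([]*slim_corev1.Pod, error) {
	objs, err := ciliumPodsStore.ByIndex(hostnameIndexer, nodeName)
	if err != nil {
		return nil, err
	}
	pods := make([]*slim_corev1.Pod, 0, len(objs))
	for _, obj := range objs {
		pod, ok := obj.(*slim_corev1.Pod)
		if !ok {
			continue
		}
		pods = append(pods, pod)
	}
	return pods, nil
}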

// nodeHasCiliumPod determines if the node has a Cilium agent pod scheduled
// on it, and if that pod is running and ready.
func nodeHasCiliumPod(nodeName string) (scheduled bool, ready bool) {
	ciliumPodsInNode, err := ciliumPodsStore.ByIndex(hostnameIndexer, nodeName)
	if err != nil {
		return false, false
	}
	if len(ciliumPodsInNode) == 0 {
		return false, false
	}
	for _, ciliumPodInterface := range ciliumPodsInNode {
		ciliumPod := ciliumPodInterface.(*slim_corev1.Pod)
		if ciliumPod.DeletionTimestamp != nil { // even if the pod is running, it will be down shortly
			continue
		}
		if k8sUtils.GetLatestPodReadiness(ciliumPod.Status) == slim_corev1.ConditionTrue {
			return true, true
		}
	}
	return true, false
}

// hasAgentNotReadyTaint returns true if the given node has the Cilium Agent
// Not Ready Node Taint.
func hasAgentNotReadyTaint(k8sNode *slim_corev1.Node) bool {
	for _, taint := range k8sNode.Spec.Taints {
		if taint.Key == pkgOption.Config.AgentNotReadyNodeTaintValue() {
			return true
		}
	}
	return false
}

// hostNameIndexFunc indexes pods by node name.
func hostNameIndexFunc(obj interface{}) ([]string, error) {
	switch t := obj.(type) {
	case *slim_corev1.Pod:
		return []string{t.Spec.NodeName}, nil
	}
	return nil, fmt.Errorf("%w - found %T", errNoPod, obj)
}

// transformToCiliumPod strips the pod down to the fields this watcher needs:
// name, namespace, resource version, node name and status conditions.
func transformToCiliumPod(obj interface{}) (interface{}, error) {
	switch concreteObj := obj.(type) {
	case *slim_corev1.Pod:
		p := &slim_corev1.Pod{
			TypeMeta: concreteObj.TypeMeta,
			ObjectMeta: slim_metav1.ObjectMeta{
				Name:            concreteObj.Name,
				Namespace:       concreteObj.Namespace,
				ResourceVersion: concreteObj.ResourceVersion,
			},
			Spec: slim_corev1.PodSpec{
				NodeName: concreteObj.Spec.NodeName,
			},
			Status: slim_corev1.PodStatus{
				Conditions: concreteObj.Status.Conditions,
			},
		}
		*concreteObj = slim_corev1.Pod{}
		return p, nil
	case cache.DeletedFinalStateUnknown:
		pod, ok := concreteObj.Obj.(*slim_corev1.Pod)
		if !ok {
			return nil, fmt.Errorf("unknown object type %T", concreteObj.Obj)
		}
		dfsu := cache.DeletedFinalStateUnknown{
			Key: concreteObj.Key,
			Obj: &slim_corev1.Pod{
				TypeMeta: pod.TypeMeta,
				ObjectMeta: slim_metav1.ObjectMeta{
					Name:            pod.Name,
					Namespace:       pod.Namespace,
					ResourceVersion: pod.ResourceVersion,
				},
				Spec: slim_corev1.PodSpec{
					NodeName: pod.Spec.NodeName,
				},
				Status: slim_corev1.PodStatus{
					Conditions: pod.Status.Conditions,
				},
			},
		}
		// Small GC optimization
		*pod = slim_corev1.Pod{}
		return dfsu, nil
	default:
		return nil, fmt.Errorf("unknown object type %T", concreteObj)
	}
}
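
// A usage sketch (not part of the upstream file) for transformToCiliumPod
// above: fields such as labels, containers and most of the status are dropped
// before the pod is stored, so ciliumPodsStore only holds what the taint logic
// needs. The literal values below are illustrative.
func exampleTransformCiliumPod() (*slim_corev1.Pod, error) {
	full := &slim_corev1.Pod{
		ObjectMeta: slim_metav1.ObjectMeta{
			Name:      "cilium-abcde",
			Namespace: "kube-system",
			Labels:    map[string]string{"k8s-app": "cilium"}, // dropped by the transform
		},
		Spec: slim_corev1.PodSpec{NodeName: "worker-1"},
	}
	trimmed, err := transformToCiliumPod(full)
	if err != nil {
		return nil, err
	}
	// Note: the transform also zeroes *full as a small GC optimization.
	return trimmed.(*slim_corev1.Pod), nil
}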

// setNodeNetworkUnavailableFalse sets the Kubernetes NodeNetworkUnavailable
// condition to false, as Cilium is managing the network connectivity.
// https://kubernetes.io/docs/concepts/architecture/nodes/#condition
// This is needed because some clusters (notably GCP) come up with a
// NodeNetworkUnavailable condition set, and the network provider is expected
// to clear it.
func setNodeNetworkUnavailableFalse(ctx context.Context, c kubernetes.Interface, nodeGetter slimNodeGetter, nodeName string) error {
	n, err := nodeGetter.GetK8sSlimNode(nodeName)
	if err != nil {
		return err
	}

	if HasCiliumIsUpCondition(n) {
		return nil
	}

	now := metav1.Now()
	condition := corev1.NodeCondition{
		Type:               corev1.NodeNetworkUnavailable,
		Status:             corev1.ConditionFalse,
		Reason:             ciliumNodeConditionReason,
		Message:            "Cilium is running on this node",
		LastTransitionTime: now,
		LastHeartbeatTime:  now,
	}
	raw, err := json.Marshal(&[]corev1.NodeCondition{condition})
	if err != nil {
		return err
	}
	patch := []byte(fmt.Sprintf(`{"status":{"conditions":%s}}`, raw))
	_, err = c.CoreV1().Nodes().PatchStatus(ctx, nodeName, patch)
	if err != nil {
		log.WithField(logfields.NodeName, nodeName).WithError(err).Info("Failed to patch node while setting condition")
	}
	return err
}

// HasCiliumIsUpCondition returns true if the given k8s node has the Cilium
// node condition set.
func HasCiliumIsUpCondition(n *slim_corev1.Node) bool {
	for _, condition := range n.Status.Conditions {
		if condition.Type == slim_corev1.NodeNetworkUnavailable &&
			condition.Status == slim_corev1.ConditionFalse &&
			condition.Reason == ciliumNodeConditionReason {
			return true
		}
	}
	return false
}

// removeNodeTaint removes the AgentNotReadyNodeTaint, allowing pods to be
// scheduled once Cilium is set up. This is mostly used on cloud providers to
// prevent existing CNI plugins from managing pods.
func removeNodeTaint(ctx context.Context, c kubernetes.Interface, nodeGetter slimNodeGetter, nodeName string) error {
	k8sNode, err := nodeGetter.GetK8sSlimNode(nodeName)
	if err != nil {
		return err
	}

	var taintFound bool

	var taints []slim_corev1.Taint
	for _, taint := range k8sNode.Spec.Taints {
		if taint.Key != pkgOption.Config.AgentNotReadyNodeTaintValue() {
			taints = append(taints, taint)
		} else {
			taintFound = true
		}
	}

	// No Cilium taint found
	if !taintFound {
		log.WithFields(logrus.Fields{
			logfields.NodeName: nodeName,
			"taint":            pkgOption.Config.AgentNotReadyNodeTaintValue(),
		}).Debug("Taint not found in node")
		return nil
	}
	log.WithFields(logrus.Fields{
		logfields.NodeName: nodeName,
		"taint":            pkgOption.Config.AgentNotReadyNodeTaintValue(),
	}).Debug("Removing Node Taint")

	createStatusAndNodePatch := []k8s.JSONPatch{
		{
			OP:    "test",
			Path:  "/spec/taints",
			Value: k8sNode.Spec.Taints,
		},
		{
			OP:    "replace",
			Path:  "/spec/taints",
			Value: taints,
		},
	}

	patch, err := json.Marshal(createStatusAndNodePatch)
	if err != nil {
		return err
	}

	_, err = c.CoreV1().Nodes().Patch(ctx, nodeName, k8sTypes.JSONPatchType, patch, metav1.PatchOptions{})
	if err != nil {
		log.WithField(logfields.NodeName, nodeName).WithError(err).Info("Failed to patch node while removing taint")
	}
	return err
}
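
// For illustration only (not part of the upstream file): with the default
// agent-not-ready taint key ("node.cilium.io/agent-not-ready") and a node that
// has no other taints, the JSON patch sent by setNodeTaint below looks roughly
// like the constant here; removeNodeTaint above sends the mirror image. The
// "test" operation makes the patch fail, and the controller retry, if the
// taint list changed between read and write.
const exampleSetTaintJSONPatch = `[
  {"op": "test", "path": "/spec/taints", "value": null},
  {"op": "replace", "path": "/spec/taints",
   "value": [{"key": "node.cilium.io/agent-not-ready", "effect": "NoSchedule"}]}
]`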

// setNodeTaint sets the AgentNotReady taint on a node.
func setNodeTaint(ctx context.Context, c kubernetes.Interface, nodeGetter slimNodeGetter, nodeName string) error {
	k8sNode, err := nodeGetter.GetK8sSlimNode(nodeName)
	if err != nil {
		return err
	}

	taintFound := false

	taints := append([]slim_corev1.Taint{}, k8sNode.Spec.Taints...)
	for _, taint := range k8sNode.Spec.Taints {
		if taint.Key == pkgOption.Config.AgentNotReadyNodeTaintValue() {
			taintFound = true
			break
		}
	}

	if taintFound {
		log.WithFields(logrus.Fields{
			logfields.NodeName: nodeName,
			"taint":            pkgOption.Config.AgentNotReadyNodeTaintValue(),
		}).Debug("Taint already set in node; skipping")
		return nil
	}
	log.WithFields(logrus.Fields{
		logfields.NodeName: nodeName,
		"taint":            pkgOption.Config.AgentNotReadyNodeTaintValue(),
	}).Debug("Setting Node Taint")

	taints = append(taints, slim_corev1.Taint{
		Key:    pkgOption.Config.AgentNotReadyNodeTaintValue(), // the function says value, but it's really a key
		Value:  "",
		Effect: slim_corev1.TaintEffectNoSchedule,
	})

	createStatusAndNodePatch := []k8s.JSONPatch{
		{
			OP:    "test",
			Path:  "/spec/taints",
			Value: k8sNode.Spec.Taints,
		},
		{
			OP:    "replace",
			Path:  "/spec/taints",
			Value: taints,
		},
	}

	patch, err := json.Marshal(createStatusAndNodePatch)
	if err != nil {
		return err
	}

	_, err = c.CoreV1().Nodes().Patch(ctx, nodeName, k8sTypes.JSONPatchType, patch, metav1.PatchOptions{})
	if err != nil {
		log.WithField(logfields.NodeName, nodeName).WithError(err).Info("Failed to patch node while adding taint")
	}
	return err
}

type markNodeOptions struct {
	RemoveNodeTaint        bool
	SetNodeTaint           bool
	SetCiliumIsUpCondition bool
}

// markNode marks the Kubernetes node according to the options it is passed.
func markNode(c kubernetes.Interface, nodeGetter slimNodeGetter, nodeName string, options markNodeOptions, running bool) {
	ctrlName := fmt.Sprintf("mark-k8s-node-%s-taints-conditions", nodeName)

	ctrlMgr.UpdateController(ctrlName,
		controller.ControllerParams{
			Group: markK8sNodeControllerGroup,
			DoFunc: func(ctx context.Context) error {
				if running && options.RemoveNodeTaint {
					err := removeNodeTaint(ctx, c, nodeGetter, nodeName)
					if err != nil {
						return err
					}
				}
				if running && options.SetCiliumIsUpCondition {
					err := setNodeNetworkUnavailableFalse(ctx, c, nodeGetter, nodeName)
					if err != nil {
						return err
					}
				}
				if !running && options.SetNodeTaint {
					err := setNodeTaint(ctx, c, nodeGetter, nodeName)
					if err != nil {
						return err
					}
				}

				return nil
			},
		})
}

// HandleNodeTolerationAndTaints starts the node and Cilium pod watchers that
// remove or set the agent-not-ready taint and set the CiliumIsUp node
// condition, depending on the configured options.
func HandleNodeTolerationAndTaints(wg *sync.WaitGroup, clientset k8sClient.Clientset, stopCh <-chan struct{}) {
	mno = markNodeOptions{
		RemoveNodeTaint:        option.Config.RemoveCiliumNodeTaints,
		SetNodeTaint:           option.Config.SetCiliumNodeTaints,
		SetCiliumIsUpCondition: option.Config.SetCiliumIsUpCondition,
	}
	nodesInit(wg, clientset.Slim(), stopCh)

	wg.Add(1)
	go func() {
		defer wg.Done()
		// Do not use the k8sClient provided by the nodesInit function since we
		// need a k8s client that can update node structures and not simply
		// watch for node events.
		for checkTaintForNextNodeItem(clientset, &nodeGetter{}, nodeQueue) {
		}
	}()

	ciliumPodsWatcher(wg, clientset, stopCh)
}
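
// A minimal sketch (not part of the upstream file) of how this watcher is
// driven, assuming an initialized operator Clientset; the real wiring lives in
// the operator's startup code, and exampleStartNodeTaintHandling is an
// illustrative name.
func exampleStartNodeTaintHandling(clientset k8sClient.Clientset) (stop func()) {
	var wg sync.WaitGroup
	stopCh := make(chan struct{})
	// Behaviour is controlled by option.Config.RemoveCiliumNodeTaints,
	// option.Config.SetCiliumNodeTaints and option.Config.SetCiliumIsUpCondition.
	HandleNodeTolerationAndTaints(&wg, clientset, stopCh)
	return func() {
		close(stopCh)
		wg.Wait()
	}
}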