k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/daemon/daemon_controller.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package daemon

import (
	"context"
	"fmt"
	"reflect"
	"sort"
	"sync"
	"time"

	apps "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	appsinformers "k8s.io/client-go/informers/apps/v1"
	coreinformers "k8s.io/client-go/informers/core/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	unversionedapps "k8s.io/client-go/kubernetes/typed/apps/v1"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	appslisters "k8s.io/client-go/listers/apps/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/flowcontrol"
	"k8s.io/client-go/util/workqueue"
	v1helper "k8s.io/component-helpers/scheduling/corev1"
	"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
	"k8s.io/klog/v2"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/controller"
	"k8s.io/kubernetes/pkg/controller/daemon/util"
)

const (
	// BurstReplicas is a rate limiter for booting pods on a lot of nodes.
	// The value of 250 is chosen because values that are too high can cause registry DoS issues.
	BurstReplicas = 250

	// StatusUpdateRetries limits the number of retries if sending a status update to the API server fails.
	StatusUpdateRetries = 1

	// BackoffGCInterval is the time that has to pass before the next iteration of backoff GC is run.
	BackoffGCInterval = 1 * time.Minute
)

// Reasons for DaemonSet events
const (
	// SelectingAllReason is added to an event when a DaemonSet selects all Pods.
	SelectingAllReason = "SelectingAll"
	// FailedPlacementReason is added to an event when a DaemonSet can't schedule a Pod to a specified node.
	FailedPlacementReason = "FailedPlacement"
	// FailedDaemonPodReason is added to an event when the status of a Pod of a DaemonSet is 'Failed'.
	FailedDaemonPodReason = "FailedDaemonPod"
	// SucceededDaemonPodReason is added to an event when the status of a Pod of a DaemonSet is 'Succeeded'.
	SucceededDaemonPodReason = "SucceededDaemonPod"
)

// controllerKind contains the schema.GroupVersionKind for this controller type.
var controllerKind = apps.SchemeGroupVersion.WithKind("DaemonSet")

// DaemonSetsController is responsible for synchronizing DaemonSet objects stored
// in the system with actual running pods.
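//
// Each sync adopts or orphans pods via ControllerRef, creates or deletes daemon
// pods per node (manage), applies the configured update strategy, and then writes
// the DaemonSet status back to the API server. This is a summary of the sync flow
// implemented below.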
type DaemonSetsController struct {
	kubeClient clientset.Interface

	eventBroadcaster record.EventBroadcaster
	eventRecorder    record.EventRecorder

	podControl controller.PodControlInterface
	crControl  controller.ControllerRevisionControlInterface

	// A dsc is temporarily suspended after creating/deleting these many replicas.
	// It resumes normal action after observing the watch events for them.
	burstReplicas int

	// To allow injection of syncDaemonSet for testing.
	syncHandler func(ctx context.Context, dsKey string) error
	// used for unit testing
	enqueueDaemonSet func(ds *apps.DaemonSet)
	// A TTLCache of pod creates/deletes each ds expects to see
	expectations controller.ControllerExpectationsInterface
	// dsLister can list/get daemonsets from the shared informer's store
	dsLister appslisters.DaemonSetLister
	// dsStoreSynced returns true if the daemonset store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	dsStoreSynced cache.InformerSynced
	// historyLister can list/get ControllerRevisions from the shared informer's store
	historyLister appslisters.ControllerRevisionLister
	// historyStoreSynced returns true if the history store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	historyStoreSynced cache.InformerSynced
	// podLister can list/get pods from the shared informer's store
	podLister corelisters.PodLister
	// podStoreSynced returns true if the pod store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	podStoreSynced cache.InformerSynced
	// nodeLister can list/get nodes from the shared informer's store
	nodeLister corelisters.NodeLister
	// nodeStoreSynced returns true if the node store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	nodeStoreSynced cache.InformerSynced

	// DaemonSet keys that need to be synced.
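	// The queue deduplicates keys, so a DaemonSet that is enqueued many times while a
	// worker is busy is still synced only once, and failed syncs are retried with
	// rate-limited backoff via AddRateLimited.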
	queue workqueue.TypedRateLimitingInterface[string]

	failedPodsBackoff *flowcontrol.Backoff
}

// NewDaemonSetsController creates a new DaemonSetsController
func NewDaemonSetsController(
	ctx context.Context,
	daemonSetInformer appsinformers.DaemonSetInformer,
	historyInformer appsinformers.ControllerRevisionInformer,
	podInformer coreinformers.PodInformer,
	nodeInformer coreinformers.NodeInformer,
	kubeClient clientset.Interface,
	failedPodsBackoff *flowcontrol.Backoff,
) (*DaemonSetsController, error) {
	eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx))
	logger := klog.FromContext(ctx)
	dsc := &DaemonSetsController{
		kubeClient:       kubeClient,
		eventBroadcaster: eventBroadcaster,
		eventRecorder:    eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "daemonset-controller"}),
		podControl: controller.RealPodControl{
			KubeClient: kubeClient,
			Recorder:   eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "daemonset-controller"}),
		},
		crControl: controller.RealControllerRevisionControl{
			KubeClient: kubeClient,
		},
		burstReplicas: BurstReplicas,
		expectations:  controller.NewControllerExpectations(),
		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
			workqueue.DefaultTypedControllerRateLimiter[string](),
			workqueue.TypedRateLimitingQueueConfig[string]{
				Name: "daemonset",
			},
		),
	}

	daemonSetInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			dsc.addDaemonset(logger, obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			dsc.updateDaemonset(logger, oldObj, newObj)
		},
		DeleteFunc: func(obj interface{}) {
			dsc.deleteDaemonset(logger, obj)
		},
	})
	dsc.dsLister = daemonSetInformer.Lister()
	dsc.dsStoreSynced = daemonSetInformer.Informer().HasSynced

	historyInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			dsc.addHistory(logger, obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			dsc.updateHistory(logger, oldObj, newObj)
		},
		DeleteFunc: func(obj interface{}) {
			dsc.deleteHistory(logger, obj)
		},
	})
	dsc.historyLister = historyInformer.Lister()
	dsc.historyStoreSynced = historyInformer.Informer().HasSynced

	// Watch for creation/deletion of pods. The reason we watch is that we don't want a daemon set to create/delete
	// more pods until all the effects (expectations) of a daemon set's create/delete have been observed.
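	// In short: before mutating pods, the sync loop records how many creates/deletes it
	// expects (SetExpectations); the pod event handlers mark them observed
	// (CreationObserved/DeletionObserved); and manage/rollingUpdate only run again once
	// SatisfiedExpectations reports that all of them have arrived or timed out.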
	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			dsc.addPod(logger, obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			dsc.updatePod(logger, oldObj, newObj)
		},
		DeleteFunc: func(obj interface{}) {
			dsc.deletePod(logger, obj)
		},
	})
	dsc.podLister = podInformer.Lister()
	dsc.podStoreSynced = podInformer.Informer().HasSynced

	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			dsc.addNode(logger, obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			dsc.updateNode(logger, oldObj, newObj)
		},
	},
	)
	dsc.nodeStoreSynced = nodeInformer.Informer().HasSynced
	dsc.nodeLister = nodeInformer.Lister()

	dsc.syncHandler = dsc.syncDaemonSet
	dsc.enqueueDaemonSet = dsc.enqueue

	dsc.failedPodsBackoff = failedPodsBackoff

	return dsc, nil
}

func (dsc *DaemonSetsController) addDaemonset(logger klog.Logger, obj interface{}) {
	ds := obj.(*apps.DaemonSet)
	logger.V(4).Info("Adding daemon set", "daemonset", klog.KObj(ds))
	dsc.enqueueDaemonSet(ds)
}

func (dsc *DaemonSetsController) updateDaemonset(logger klog.Logger, old, cur interface{}) {
	oldDS := old.(*apps.DaemonSet)
	curDS := cur.(*apps.DaemonSet)

	// TODO: make a KEP and fix informers to always call the delete event handler on re-create
	if curDS.UID != oldDS.UID {
		key, err := controller.KeyFunc(oldDS)
		if err != nil {
			utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", oldDS, err))
			return
		}
		dsc.deleteDaemonset(logger, cache.DeletedFinalStateUnknown{
			Key: key,
			Obj: oldDS,
		})
	}

	logger.V(4).Info("Updating daemon set", "daemonset", klog.KObj(oldDS))
	dsc.enqueueDaemonSet(curDS)
}

func (dsc *DaemonSetsController) deleteDaemonset(logger klog.Logger, obj interface{}) {
	ds, ok := obj.(*apps.DaemonSet)
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %#v", obj))
			return
		}
		ds, ok = tombstone.Obj.(*apps.DaemonSet)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a DaemonSet %#v", obj))
			return
		}
	}
	logger.V(4).Info("Deleting daemon set", "daemonset", klog.KObj(ds))

	key, err := controller.KeyFunc(ds)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", ds, err))
		return
	}

	// Delete expectations for the DaemonSet so if we create a new one with the same name it starts clean
	dsc.expectations.DeleteExpectations(logger, key)

	dsc.queue.Add(key)
}

// Run begins watching and syncing daemon sets.
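//
// A minimal usage sketch (illustrative only; f and client stand for an already
// constructed SharedInformerFactory and clientset, and the real wiring lives in
// kube-controller-manager):
//
//	dsc, _ := NewDaemonSetsController(ctx,
//		f.Apps().V1().DaemonSets(), f.Apps().V1().ControllerRevisions(),
//		f.Core().V1().Pods(), f.Core().V1().Nodes(),
//		client, flowcontrol.NewBackOff(1*time.Second, 15*time.Minute))
//	f.Start(ctx.Done())
//	go dsc.Run(ctx, 2)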
func (dsc *DaemonSetsController) Run(ctx context.Context, workers int) {
	defer utilruntime.HandleCrash()

	dsc.eventBroadcaster.StartStructuredLogging(3)
	dsc.eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: dsc.kubeClient.CoreV1().Events("")})
	defer dsc.eventBroadcaster.Shutdown()

	defer dsc.queue.ShutDown()

	logger := klog.FromContext(ctx)
	logger.Info("Starting daemon sets controller")
	defer logger.Info("Shutting down daemon sets controller")

	if !cache.WaitForNamedCacheSync("daemon sets", ctx.Done(), dsc.podStoreSynced, dsc.nodeStoreSynced, dsc.historyStoreSynced, dsc.dsStoreSynced) {
		return
	}

	for i := 0; i < workers; i++ {
		go wait.UntilWithContext(ctx, dsc.runWorker, time.Second)
	}

	go wait.Until(dsc.failedPodsBackoff.GC, BackoffGCInterval, ctx.Done())

	<-ctx.Done()
}

func (dsc *DaemonSetsController) runWorker(ctx context.Context) {
	for dsc.processNextWorkItem(ctx) {
	}
}

// processNextWorkItem deals with one key off the queue. It returns false when it's time to quit.
func (dsc *DaemonSetsController) processNextWorkItem(ctx context.Context) bool {
	dsKey, quit := dsc.queue.Get()
	if quit {
		return false
	}
	defer dsc.queue.Done(dsKey)

	err := dsc.syncHandler(ctx, dsKey)
	if err == nil {
		dsc.queue.Forget(dsKey)
		return true
	}

	utilruntime.HandleError(fmt.Errorf("%v failed with: %v", dsKey, err))
	dsc.queue.AddRateLimited(dsKey)

	return true
}

func (dsc *DaemonSetsController) enqueue(ds *apps.DaemonSet) {
	key, err := controller.KeyFunc(ds)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", ds, err))
		return
	}

	// TODO: Handle overlapping controllers better. See comment in ReplicationManager.
	dsc.queue.Add(key)
}

func (dsc *DaemonSetsController) enqueueDaemonSetAfter(obj interface{}, after time.Duration) {
	key, err := controller.KeyFunc(obj)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err))
		return
	}

	// TODO: Handle overlapping controllers better. See comment in ReplicationManager.
	dsc.queue.AddAfter(key, after)
}

// getDaemonSetsForPod returns a list of DaemonSets that potentially match the pod.
func (dsc *DaemonSetsController) getDaemonSetsForPod(pod *v1.Pod) []*apps.DaemonSet {
	sets, err := dsc.dsLister.GetPodDaemonSets(pod)
	if err != nil {
		return nil
	}
	if len(sets) > 1 {
		// ControllerRef will ensure we don't do anything crazy, but more than one
		// item in this list nevertheless constitutes user error.
		utilruntime.HandleError(fmt.Errorf("user error! more than one daemon is selecting pods with labels: %+v", pod.Labels))
	}
	return sets
}

// getDaemonSetsForHistory returns a list of DaemonSets that potentially
// match a ControllerRevision.
func (dsc *DaemonSetsController) getDaemonSetsForHistory(logger klog.Logger, history *apps.ControllerRevision) []*apps.DaemonSet {
	daemonSets, err := dsc.dsLister.GetHistoryDaemonSets(history)
	if err != nil || len(daemonSets) == 0 {
		return nil
	}
	if len(daemonSets) > 1 {
		// ControllerRef will ensure we don't do anything crazy, but more than one
		// item in this list nevertheless constitutes user error.
		logger.V(4).Info("Found more than one DaemonSet selecting the ControllerRevision. This is potentially a user error",
			"controllerRevision", klog.KObj(history), "labels", history.Labels)
	}
	return daemonSets
}

// addHistory enqueues the DaemonSet that manages a ControllerRevision when the ControllerRevision is created
// or when the controller manager is restarted.
func (dsc *DaemonSetsController) addHistory(logger klog.Logger, obj interface{}) {
	history := obj.(*apps.ControllerRevision)
	if history.DeletionTimestamp != nil {
		// On a restart of the controller manager, it's possible for an object to
		// show up in a state that is already pending deletion.
		dsc.deleteHistory(logger, history)
		return
	}

	// If it has a ControllerRef, that's all that matters.
	if controllerRef := metav1.GetControllerOf(history); controllerRef != nil {
		ds := dsc.resolveControllerRef(history.Namespace, controllerRef)
		if ds == nil {
			return
		}
		logger.V(4).Info("Observed a ControllerRevision", "controllerRevision", klog.KObj(history))
		return
	}

	// Otherwise, it's an orphan. Get a list of all matching DaemonSets and sync
	// them to see if anyone wants to adopt it.
	daemonSets := dsc.getDaemonSetsForHistory(logger, history)
	if len(daemonSets) == 0 {
		return
	}
	logger.V(4).Info("Orphan ControllerRevision added", "controllerRevision", klog.KObj(history))
	for _, ds := range daemonSets {
		dsc.enqueueDaemonSet(ds)
	}
}

// updateHistory figures out what DaemonSet(s) manage a ControllerRevision when the ControllerRevision
// is updated and wakes them up. If anything of the ControllerRevision has changed, we need to awaken
// both the old and new DaemonSets.
func (dsc *DaemonSetsController) updateHistory(logger klog.Logger, old, cur interface{}) {
	curHistory := cur.(*apps.ControllerRevision)
	oldHistory := old.(*apps.ControllerRevision)
	if curHistory.ResourceVersion == oldHistory.ResourceVersion {
		// Periodic resync will send update events for all known ControllerRevisions.
		return
	}

	curControllerRef := metav1.GetControllerOf(curHistory)
	oldControllerRef := metav1.GetControllerOf(oldHistory)
	controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
	if controllerRefChanged && oldControllerRef != nil {
		// The ControllerRef was changed. Sync the old controller, if any.
		if ds := dsc.resolveControllerRef(oldHistory.Namespace, oldControllerRef); ds != nil {
			dsc.enqueueDaemonSet(ds)
		}
	}

	// If it has a ControllerRef, that's all that matters.
	if curControllerRef != nil {
		ds := dsc.resolveControllerRef(curHistory.Namespace, curControllerRef)
		if ds == nil {
			return
		}
		logger.V(4).Info("Observed an update to a ControllerRevision", "controllerRevision", klog.KObj(curHistory))
		dsc.enqueueDaemonSet(ds)
		return
	}

	// Otherwise, it's an orphan. If anything changed, sync matching controllers
	// to see if anyone wants to adopt it now.
	labelChanged := !reflect.DeepEqual(curHistory.Labels, oldHistory.Labels)
	if labelChanged || controllerRefChanged {
		daemonSets := dsc.getDaemonSetsForHistory(logger, curHistory)
		if len(daemonSets) == 0 {
			return
		}
		logger.V(4).Info("Orphan ControllerRevision updated", "controllerRevision", klog.KObj(curHistory))
		for _, ds := range daemonSets {
			dsc.enqueueDaemonSet(ds)
		}
	}
}

// deleteHistory enqueues the DaemonSet that manages a ControllerRevision when
// the ControllerRevision is deleted. obj could be an *app.ControllerRevision, or
// a DeletionFinalStateUnknown marker item.
func (dsc *DaemonSetsController) deleteHistory(logger klog.Logger, obj interface{}) {
	history, ok := obj.(*apps.ControllerRevision)

	// When a delete is dropped, the relist will notice a ControllerRevision in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the ControllerRevision
	// changed labels the new DaemonSet will not be woken up till the periodic resync.
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("Couldn't get object from tombstone %#v", obj))
			return
		}
		history, ok = tombstone.Obj.(*apps.ControllerRevision)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("Tombstone contained object that is not a ControllerRevision %#v", obj))
			return
		}
	}

	controllerRef := metav1.GetControllerOf(history)
	if controllerRef == nil {
		// No controller should care about orphans being deleted.
		return
	}
	ds := dsc.resolveControllerRef(history.Namespace, controllerRef)
	if ds == nil {
		return
	}
	logger.V(4).Info("ControllerRevision deleted", "controllerRevision", klog.KObj(history))
	dsc.enqueueDaemonSet(ds)
}

func (dsc *DaemonSetsController) addPod(logger klog.Logger, obj interface{}) {
	pod := obj.(*v1.Pod)

	if pod.DeletionTimestamp != nil {
		// on a restart of the controller manager, it's possible a new pod shows up in a state that
		// is already pending deletion. Prevent the pod from being a creation observation.
		dsc.deletePod(logger, pod)
		return
	}

	// If it has a ControllerRef, that's all that matters.
	if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil {
		ds := dsc.resolveControllerRef(pod.Namespace, controllerRef)
		if ds == nil {
			return
		}
		dsKey, err := controller.KeyFunc(ds)
		if err != nil {
			return
		}
		logger.V(4).Info("Pod added", "pod", klog.KObj(pod))
		dsc.expectations.CreationObserved(logger, dsKey)
		dsc.enqueueDaemonSet(ds)
		return
	}

	// Otherwise, it's an orphan. Get a list of all matching DaemonSets and sync
	// them to see if anyone wants to adopt it.
	// DO NOT observe creation because no controller should be waiting for an
	// orphan.
	dss := dsc.getDaemonSetsForPod(pod)
	if len(dss) == 0 {
		return
	}
	logger.V(4).Info("Orphan Pod added", "pod", klog.KObj(pod))
	for _, ds := range dss {
		dsc.enqueueDaemonSet(ds)
	}
}

// When a pod is updated, figure out what sets manage it and wake them
// up. If the labels of the pod have changed we need to awaken both the old
// and new set. old and cur must be *v1.Pod types.
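// A readiness transition additionally schedules a delayed resync so that
// minReadySeconds-based availability is re-evaluated once the pod has been
// ready long enough (see the MinReadySeconds handling below).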
func (dsc *DaemonSetsController) updatePod(logger klog.Logger, old, cur interface{}) {
	curPod := cur.(*v1.Pod)
	oldPod := old.(*v1.Pod)
	if curPod.ResourceVersion == oldPod.ResourceVersion {
		// Periodic resync will send update events for all known pods.
		// Two different versions of the same pod will always have different RVs.
		return
	}

	if curPod.DeletionTimestamp != nil {
		// when a pod is deleted gracefully its deletion timestamp is first modified to reflect a grace period,
		// and after such time has passed, the kubelet actually deletes it from the store. We receive an update
		// for modification of the deletion timestamp and expect a ds to create more replicas asap, not wait
		// until the kubelet actually deletes the pod.
		dsc.deletePod(logger, curPod)
		return
	}

	curControllerRef := metav1.GetControllerOf(curPod)
	oldControllerRef := metav1.GetControllerOf(oldPod)
	controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
	if controllerRefChanged && oldControllerRef != nil {
		// The ControllerRef was changed. Sync the old controller, if any.
		if ds := dsc.resolveControllerRef(oldPod.Namespace, oldControllerRef); ds != nil {
			dsc.enqueueDaemonSet(ds)
		}
	}

	// If it has a ControllerRef, that's all that matters.
	if curControllerRef != nil {
		ds := dsc.resolveControllerRef(curPod.Namespace, curControllerRef)
		if ds == nil {
			return
		}
		logger.V(4).Info("Pod updated", "pod", klog.KObj(curPod))
		dsc.enqueueDaemonSet(ds)
		changedToReady := !podutil.IsPodReady(oldPod) && podutil.IsPodReady(curPod)
		// See https://github.com/kubernetes/kubernetes/pull/38076 for more details
		if changedToReady && ds.Spec.MinReadySeconds > 0 {
			// Add a second to avoid milliseconds skew in AddAfter.
			// See https://github.com/kubernetes/kubernetes/issues/39785#issuecomment-279959133 for more info.
			dsc.enqueueDaemonSetAfter(ds, (time.Duration(ds.Spec.MinReadySeconds)*time.Second)+time.Second)
		}
		return
	}

	// Otherwise, it's an orphan. If anything changed, sync matching controllers
	// to see if anyone wants to adopt it now.
	dss := dsc.getDaemonSetsForPod(curPod)
	if len(dss) == 0 {
		return
	}
	logger.V(4).Info("Orphan Pod updated", "pod", klog.KObj(curPod))
	labelChanged := !reflect.DeepEqual(curPod.Labels, oldPod.Labels)
	if labelChanged || controllerRefChanged {
		for _, ds := range dss {
			dsc.enqueueDaemonSet(ds)
		}
	}
}

func (dsc *DaemonSetsController) deletePod(logger klog.Logger, obj interface{}) {
	pod, ok := obj.(*v1.Pod)
	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the pod
	// changed labels the new daemonset will not be woken up till the periodic
	// resync.
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %#v", obj))
			return
		}
		pod, ok = tombstone.Obj.(*v1.Pod)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %#v", obj))
			return
		}
	}

	controllerRef := metav1.GetControllerOf(pod)
	if controllerRef == nil {
		// No controller should care about orphans being deleted.
		return
	}
	ds := dsc.resolveControllerRef(pod.Namespace, controllerRef)
	if ds == nil {
		return
	}
	dsKey, err := controller.KeyFunc(ds)
	if err != nil {
		return
	}
	logger.V(4).Info("Pod deleted", "pod", klog.KObj(pod))
	dsc.expectations.DeletionObserved(logger, dsKey)
	dsc.enqueueDaemonSet(ds)
}

func (dsc *DaemonSetsController) addNode(logger klog.Logger, obj interface{}) {
	// TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the added node (unless it has other work to do, too).
	dsList, err := dsc.dsLister.List(labels.Everything())
	if err != nil {
		logger.V(4).Info("Error enqueueing daemon sets", "err", err)
		return
	}
	node := obj.(*v1.Node)
	for _, ds := range dsList {
		if shouldRun, _ := NodeShouldRunDaemonPod(node, ds); shouldRun {
			dsc.enqueueDaemonSet(ds)
		}
	}
}

// shouldIgnoreNodeUpdate returns true if Node labels and taints have not changed, otherwise returns false.
// If other calling functions need to use other properties of Node, shouldIgnoreNodeUpdate needs to be updated.
func shouldIgnoreNodeUpdate(oldNode, curNode v1.Node) bool {
	return apiequality.Semantic.DeepEqual(oldNode.Labels, curNode.Labels) &&
		apiequality.Semantic.DeepEqual(oldNode.Spec.Taints, curNode.Spec.Taints)
}

func (dsc *DaemonSetsController) updateNode(logger klog.Logger, old, cur interface{}) {
	oldNode := old.(*v1.Node)
	curNode := cur.(*v1.Node)
	if shouldIgnoreNodeUpdate(*oldNode, *curNode) {
		return
	}

	dsList, err := dsc.dsLister.List(labels.Everything())
	if err != nil {
		logger.V(4).Info("Error listing daemon sets", "err", err)
		return
	}
	// TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the updated node (unless it has other work to do, too).
	for _, ds := range dsList {
		// If NodeShouldRunDaemonPod needs to use properties of the node other than Labels and Taints
		// (which are mutable), shouldIgnoreNodeUpdate needs to be updated as well.
		oldShouldRun, oldShouldContinueRunning := NodeShouldRunDaemonPod(oldNode, ds)
		currentShouldRun, currentShouldContinueRunning := NodeShouldRunDaemonPod(curNode, ds)
		if (oldShouldRun != currentShouldRun) || (oldShouldContinueRunning != currentShouldContinueRunning) {
			dsc.enqueueDaemonSet(ds)
		}
	}
}

// getDaemonPods returns daemon pods owned by the given ds.
// This also reconciles ControllerRef by adopting/orphaning.
// Note that returned Pods are pointers to objects in the cache.
// If you want to modify one, you need to deep-copy it first.
func (dsc *DaemonSetsController) getDaemonPods(ctx context.Context, ds *apps.DaemonSet) ([]*v1.Pod, error) {
	selector, err := metav1.LabelSelectorAsSelector(ds.Spec.Selector)
	if err != nil {
		return nil, err
	}

	// List all pods to include those that don't match the selector anymore but
	// have a ControllerRef pointing to this controller.
	pods, err := dsc.podLister.Pods(ds.Namespace).List(labels.Everything())
	if err != nil {
		return nil, err
	}
	// If any adoptions are attempted, we should first recheck for deletion with
	// an uncached quorum read sometime after listing Pods (see #42639).
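	// RecheckDeletionTimestamp wraps this fresh GET as the canAdopt check, so we never
	// adopt pods on behalf of a DaemonSet that is being deleted, or that was deleted and
	// recreated under the same name (the UID check below catches the recreate case).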
	dsNotDeleted := controller.RecheckDeletionTimestamp(func(ctx context.Context) (metav1.Object, error) {
		fresh, err := dsc.kubeClient.AppsV1().DaemonSets(ds.Namespace).Get(ctx, ds.Name, metav1.GetOptions{})
		if err != nil {
			return nil, err
		}
		if fresh.UID != ds.UID {
			return nil, fmt.Errorf("original DaemonSet %v/%v is gone: got uid %v, wanted %v", ds.Namespace, ds.Name, fresh.UID, ds.UID)
		}
		return fresh, nil
	})

	// Use ControllerRefManager to adopt/orphan as needed.
	cm := controller.NewPodControllerRefManager(dsc.podControl, ds, selector, controllerKind, dsNotDeleted)
	return cm.ClaimPods(ctx, pods)
}

// getNodesToDaemonPods returns a map from nodes to daemon pods (corresponding to ds) created for the nodes.
// This also reconciles ControllerRef by adopting/orphaning.
// Note that returned Pods are pointers to objects in the cache.
// If you want to modify one, you need to deep-copy it first.
func (dsc *DaemonSetsController) getNodesToDaemonPods(ctx context.Context, ds *apps.DaemonSet, includeDeletedTerminal bool) (map[string][]*v1.Pod, error) {
	claimedPods, err := dsc.getDaemonPods(ctx, ds)
	if err != nil {
		return nil, err
	}
	// Group Pods by Node name.
	nodeToDaemonPods := make(map[string][]*v1.Pod)
	logger := klog.FromContext(ctx)
	for _, pod := range claimedPods {
		if !includeDeletedTerminal && podutil.IsPodTerminal(pod) && pod.DeletionTimestamp != nil {
			// This Pod has a finalizer or is already scheduled for deletion from the
			// store by the kubelet or the Pod GC. The DS controller doesn't have
			// anything else to do with it.
			continue
		}
		nodeName, err := util.GetTargetNodeName(pod)
		if err != nil {
			logger.V(4).Info("Failed to get target node name of Pod in DaemonSet",
				"pod", klog.KObj(pod), "daemonset", klog.KObj(ds))
			continue
		}

		nodeToDaemonPods[nodeName] = append(nodeToDaemonPods[nodeName], pod)
	}

	return nodeToDaemonPods, nil
}

// resolveControllerRef returns the controller referenced by a ControllerRef,
// or nil if the ControllerRef could not be resolved to a matching controller
// of the correct Kind.
func (dsc *DaemonSetsController) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *apps.DaemonSet {
	// We can't look up by UID, so look up by Name and then verify UID.
	// Don't even try to look up by Name if it's the wrong Kind.
	if controllerRef.Kind != controllerKind.Kind {
		return nil
	}
	ds, err := dsc.dsLister.DaemonSets(namespace).Get(controllerRef.Name)
	if err != nil {
		return nil
	}
	if ds.UID != controllerRef.UID {
		// The controller we found with this Name is not the same one that the
		// ControllerRef points to.
		return nil
	}
	return ds
}

// podsShouldBeOnNode figures out the DaemonSet pods to be created and deleted on the given node:
//   - nodesNeedingDaemonPods: the nodes on which a daemon pod needs to be started
//   - podsToDelete: the pods that need to be deleted on the node
func (dsc *DaemonSetsController) podsShouldBeOnNode(
	logger klog.Logger,
	node *v1.Node,
	nodeToDaemonPods map[string][]*v1.Pod,
	ds *apps.DaemonSet,
	hash string,
) (nodesNeedingDaemonPods, podsToDelete []string) {

	shouldRun, shouldContinueRunning := NodeShouldRunDaemonPod(node, ds)
	daemonPods, exists := nodeToDaemonPods[node.Name]

	switch {
	case shouldRun && !exists:
		// If daemon pod is supposed to be running on node, but isn't, create daemon pod.
		nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name)
	case shouldContinueRunning:
		// If a daemon pod failed, delete it.
		// If no daemon pod is left on this node, a new one will be created in the next sync loop.
		var daemonPodsRunning []*v1.Pod
		for _, pod := range daemonPods {
			if pod.DeletionTimestamp != nil {
				continue
			}
			if pod.Status.Phase == v1.PodFailed {
				// This is a critical place where DS is often fighting with kubelet that rejects pods.
				// We need to avoid hot looping and backoff.
				backoffKey := failedPodsBackoffKey(ds, node.Name)

				now := dsc.failedPodsBackoff.Clock.Now()
				inBackoff := dsc.failedPodsBackoff.IsInBackOffSinceUpdate(backoffKey, now)
				if inBackoff {
					delay := dsc.failedPodsBackoff.Get(backoffKey)
					logger.V(4).Info("Deleting failed pod on node has been limited by backoff",
						"pod", klog.KObj(pod), "node", klog.KObj(node), "currentDelay", delay)
					dsc.enqueueDaemonSetAfter(ds, delay)
					continue
				}

				dsc.failedPodsBackoff.Next(backoffKey, now)

				msg := fmt.Sprintf("Found failed daemon pod %s/%s on node %s, will try to kill it", pod.Namespace, pod.Name, node.Name)
				logger.V(2).Info("Found failed daemon pod on node, will try to kill it", "pod", klog.KObj(pod), "node", klog.KObj(node))
				// Emit an event so that it's discoverable to users.
				dsc.eventRecorder.Eventf(ds, v1.EventTypeWarning, FailedDaemonPodReason, msg)
				podsToDelete = append(podsToDelete, pod.Name)
			} else if pod.Status.Phase == v1.PodSucceeded {
				msg := fmt.Sprintf("Found succeeded daemon pod %s/%s on node %s, will try to delete it", pod.Namespace, pod.Name, node.Name)
				logger.V(2).Info("Found succeeded daemon pod on node, will try to delete it", "pod", klog.KObj(pod), "node", klog.KObj(node))
				// Emit an event so that it's discoverable to users.
				dsc.eventRecorder.Eventf(ds, v1.EventTypeNormal, SucceededDaemonPodReason, msg)
				podsToDelete = append(podsToDelete, pod.Name)
			} else {
				daemonPodsRunning = append(daemonPodsRunning, pod)
			}
		}

		// When surge is not enabled, if there is more than 1 running pod on a node, delete all but the oldest.
		if !util.AllowsSurge(ds) {
			if len(daemonPodsRunning) <= 1 {
				// There are no excess pods to be pruned, and no pods to create.
				break
			}

			sort.Sort(podByCreationTimestampAndPhase(daemonPodsRunning))
			for i := 1; i < len(daemonPodsRunning); i++ {
				podsToDelete = append(podsToDelete, daemonPodsRunning[i].Name)
			}
			break
		}

		if len(daemonPodsRunning) <= 1 {
			// There are no excess pods to be pruned.
			if len(daemonPodsRunning) == 0 && shouldRun {
				// We are surging so we need to have at least one non-deleted pod on the node.
				nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name)
			}
			break
		}

		// When surge is enabled, we allow 2 pods if and only if the oldest pod matching the current hash state
		// is not ready AND the oldest pod that doesn't match the current hash state is ready. All other pods are
		// deleted. If neither pod is ready, only the one matching the current hash revision is kept.
		var oldestNewPod, oldestOldPod *v1.Pod
		sort.Sort(podByCreationTimestampAndPhase(daemonPodsRunning))
		for _, pod := range daemonPodsRunning {
			if pod.Labels[apps.ControllerRevisionHashLabelKey] == hash {
				if oldestNewPod == nil {
					oldestNewPod = pod
					continue
				}
			} else {
				if oldestOldPod == nil {
					oldestOldPod = pod
					continue
				}
			}
			podsToDelete = append(podsToDelete, pod.Name)
		}
		if oldestNewPod != nil && oldestOldPod != nil {
			switch {
			case !podutil.IsPodReady(oldestOldPod):
				logger.V(5).Info("Pod from daemonset is no longer ready and will be replaced with newer pod", "oldPod", klog.KObj(oldestOldPod), "daemonset", klog.KObj(ds), "newPod", klog.KObj(oldestNewPod))
				podsToDelete = append(podsToDelete, oldestOldPod.Name)
			case podutil.IsPodAvailable(oldestNewPod, ds.Spec.MinReadySeconds, metav1.Time{Time: dsc.failedPodsBackoff.Clock.Now()}):
				logger.V(5).Info("Pod from daemonset is now ready and will replace older pod", "newPod", klog.KObj(oldestNewPod), "daemonset", klog.KObj(ds), "oldPod", klog.KObj(oldestOldPod))
				podsToDelete = append(podsToDelete, oldestOldPod.Name)
			}
		}

	case !shouldContinueRunning && exists:
		// If daemon pod isn't supposed to run on node, but it is, delete all daemon pods on node.
		for _, pod := range daemonPods {
			if pod.DeletionTimestamp != nil {
				continue
			}
			podsToDelete = append(podsToDelete, pod.Name)
		}
	}

	return nodesNeedingDaemonPods, podsToDelete
}

func (dsc *DaemonSetsController) updateDaemonSet(ctx context.Context, ds *apps.DaemonSet, nodeList []*v1.Node, hash, key string, old []*apps.ControllerRevision) error {
	err := dsc.manage(ctx, ds, nodeList, hash)
	if err != nil {
		return err
	}

	// Process rolling updates if we're ready.
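	// For the OnDelete strategy there is nothing to do here: pods are only replaced
	// after the user deletes them, so that case in the switch below is intentionally
	// empty. Only RollingUpdate actively deletes old-hash pods via rollingUpdate.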
	if dsc.expectations.SatisfiedExpectations(klog.FromContext(ctx), key) {
		switch ds.Spec.UpdateStrategy.Type {
		case apps.OnDeleteDaemonSetStrategyType:
		case apps.RollingUpdateDaemonSetStrategyType:
			err = dsc.rollingUpdate(ctx, ds, nodeList, hash)
		}
		if err != nil {
			return err
		}
	}

	err = dsc.cleanupHistory(ctx, ds, old)
	if err != nil {
		return fmt.Errorf("failed to clean up revisions of DaemonSet: %w", err)
	}

	return nil
}

// manage manages the scheduling and running of Pods of ds on nodes.
// After figuring out which nodes should run a Pod of ds but are not yet running one and
// which nodes should not run a Pod of ds but currently are running one, it calls function
// syncNodes with a list of pods to remove and a list of nodes to run a Pod of ds.
func (dsc *DaemonSetsController) manage(ctx context.Context, ds *apps.DaemonSet, nodeList []*v1.Node, hash string) error {
	// Find out the pods which are created for the nodes by DaemonSet.
	nodeToDaemonPods, err := dsc.getNodesToDaemonPods(ctx, ds, false)
	if err != nil {
		return fmt.Errorf("couldn't get node to daemon pod mapping for daemon set %q: %v", ds.Name, err)
	}

	// For each node, if the node is running the daemon pod but isn't supposed to, kill the daemon
	// pod. If the node is supposed to run the daemon pod, but isn't, create the daemon pod on the node.
	logger := klog.FromContext(ctx)
	var nodesNeedingDaemonPods, podsToDelete []string
	for _, node := range nodeList {
		nodesNeedingDaemonPodsOnNode, podsToDeleteOnNode := dsc.podsShouldBeOnNode(
			logger, node, nodeToDaemonPods, ds, hash)

		nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, nodesNeedingDaemonPodsOnNode...)
		podsToDelete = append(podsToDelete, podsToDeleteOnNode...)
	}

	// Remove unscheduled pods assigned to nodes that no longer exist when daemonset pods are scheduled by the scheduler.
	// If a node doesn't exist, its pods are never scheduled and can't be deleted by PodGCController.
	podsToDelete = append(podsToDelete, getUnscheduledPodsWithoutNode(nodeList, nodeToDaemonPods)...)

	// Label new pods using the hash label value of the current history when creating them
	if err = dsc.syncNodes(ctx, ds, podsToDelete, nodesNeedingDaemonPods, hash); err != nil {
		return err
	}

	return nil
}

// syncNodes deletes the given pods and creates new daemon set pods on the given nodes;
// it returns an aggregate of any errors encountered.
func (dsc *DaemonSetsController) syncNodes(ctx context.Context, ds *apps.DaemonSet, podsToDelete, nodesNeedingDaemonPods []string, hash string) error {
	// We need to set expectations before creating/deleting pods to avoid race conditions.
	logger := klog.FromContext(ctx)
	dsKey, err := controller.KeyFunc(ds)
	if err != nil {
		return fmt.Errorf("couldn't get key for object %#v: %v", ds, err)
	}

	createDiff := len(nodesNeedingDaemonPods)
	deleteDiff := len(podsToDelete)

	if createDiff > dsc.burstReplicas {
		createDiff = dsc.burstReplicas
	}
	if deleteDiff > dsc.burstReplicas {
		deleteDiff = dsc.burstReplicas
	}

	dsc.expectations.SetExpectations(logger, dsKey, createDiff, deleteDiff)

	// Error channel to communicate back failures; make the buffer big enough to avoid any blocking.
	errCh := make(chan error, createDiff+deleteDiff)

	logger.V(4).Info("Nodes needing daemon pods for daemon set, creating", "daemonset", klog.KObj(ds), "needCount", nodesNeedingDaemonPods, "createCount", createDiff)
	createWait := sync.WaitGroup{}
	// If the returned error is not nil we have a parse error.
	// The controller handles this via the hash.
	generation, err := util.GetTemplateGeneration(ds)
	if err != nil {
		generation = nil
	}
	template := util.CreatePodTemplate(ds.Spec.Template, generation, hash)
	// Batch the pod creates. Batch sizes start at SlowStartInitialBatchSize
	// and double with each successful iteration in a kind of "slow start".
	// This handles attempts to start large numbers of pods that would
	// likely all fail with the same error. For example a project with a
	// low quota that attempts to create a large number of pods will be
	// prevented from spamming the API service with the pod create requests
	// after one of its pods fails. Conveniently, this also prevents the
	// event spam that those failures would generate.
	batchSize := min(createDiff, controller.SlowStartInitialBatchSize)
	for pos := 0; createDiff > pos; batchSize, pos = min(2*batchSize, createDiff-(pos+batchSize)), pos+batchSize {
		errorCount := len(errCh)
		createWait.Add(batchSize)
		for i := pos; i < pos+batchSize; i++ {
			go func(ix int) {
				defer createWait.Done()

				podTemplate := template.DeepCopy()
				// The pod's NodeAffinity will be updated to make sure the Pod is bound
				// to the target node by default scheduler. It is safe to do so because there
				// should be no conflicting node affinity with the target node.
				podTemplate.Spec.Affinity = util.ReplaceDaemonSetPodNodeNameNodeAffinity(
					podTemplate.Spec.Affinity, nodesNeedingDaemonPods[ix])

				err := dsc.podControl.CreatePods(ctx, ds.Namespace, podTemplate,
					ds, metav1.NewControllerRef(ds, controllerKind))

				if err != nil {
					if apierrors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
						// If the namespace is being torn down, we can safely ignore
						// this error since all subsequent creations will fail.
						return
					}
				}
				if err != nil {
					logger.V(2).Info("Failed creation, decrementing expectations for daemon set", "daemonset", klog.KObj(ds))
					dsc.expectations.CreationObserved(logger, dsKey)
					errCh <- err
					utilruntime.HandleError(err)
				}
			}(i)
		}
		createWait.Wait()
		// Any skipped pods that we never attempted to start shouldn't be expected.
		skippedPods := createDiff - (batchSize + pos)
		if errorCount < len(errCh) && skippedPods > 0 {
			logger.V(2).Info("Slow-start failure. Skipping creation of pods, decrementing expectations for daemon set", "skippedPods", skippedPods, "daemonset", klog.KObj(ds))
			dsc.expectations.LowerExpectations(logger, dsKey, skippedPods, 0)
			// The skipped pods will be retried later. The next controller resync will
			// retry the slow start process.
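			// For example, with createDiff == 10 and an initial batch size of 1, the
			// batches are 1, 2, 4 and then the remaining 3 pods, assuming every batch
			// succeeds; the first failing batch stops the ramp-up here.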
			break
		}
	}

	logger.V(4).Info("Pods to delete for daemon set, deleting", "daemonset", klog.KObj(ds), "toDeleteCount", podsToDelete, "deleteCount", deleteDiff)
	deleteWait := sync.WaitGroup{}
	deleteWait.Add(deleteDiff)
	for i := 0; i < deleteDiff; i++ {
		go func(ix int) {
			defer deleteWait.Done()
			if err := dsc.podControl.DeletePod(ctx, ds.Namespace, podsToDelete[ix], ds); err != nil {
				dsc.expectations.DeletionObserved(logger, dsKey)
				if !apierrors.IsNotFound(err) {
					logger.V(2).Info("Failed deletion, decremented expectations for daemon set", "daemonset", klog.KObj(ds))
					errCh <- err
					utilruntime.HandleError(err)
				}
			}
		}(i)
	}
	deleteWait.Wait()

	// Collect errors if any for proper reporting/retry logic in the controller.
	errors := []error{}
	close(errCh)
	for err := range errCh {
		errors = append(errors, err)
	}
	return utilerrors.NewAggregate(errors)
}

func storeDaemonSetStatus(
	ctx context.Context,
	dsClient unversionedapps.DaemonSetInterface,
	ds *apps.DaemonSet,
	desiredNumberScheduled,
	currentNumberScheduled,
	numberMisscheduled,
	numberReady,
	updatedNumberScheduled,
	numberAvailable,
	numberUnavailable int,
	updateObservedGen bool) error {
	if int(ds.Status.DesiredNumberScheduled) == desiredNumberScheduled &&
		int(ds.Status.CurrentNumberScheduled) == currentNumberScheduled &&
		int(ds.Status.NumberMisscheduled) == numberMisscheduled &&
		int(ds.Status.NumberReady) == numberReady &&
		int(ds.Status.UpdatedNumberScheduled) == updatedNumberScheduled &&
		int(ds.Status.NumberAvailable) == numberAvailable &&
		int(ds.Status.NumberUnavailable) == numberUnavailable &&
		ds.Status.ObservedGeneration >= ds.Generation {
		return nil
	}

	toUpdate := ds.DeepCopy()

	var updateErr, getErr error
	for i := 0; ; i++ {
		if updateObservedGen {
			toUpdate.Status.ObservedGeneration = ds.Generation
		}
		toUpdate.Status.DesiredNumberScheduled = int32(desiredNumberScheduled)
		toUpdate.Status.CurrentNumberScheduled = int32(currentNumberScheduled)
		toUpdate.Status.NumberMisscheduled = int32(numberMisscheduled)
		toUpdate.Status.NumberReady = int32(numberReady)
		toUpdate.Status.UpdatedNumberScheduled = int32(updatedNumberScheduled)
		toUpdate.Status.NumberAvailable = int32(numberAvailable)
		toUpdate.Status.NumberUnavailable = int32(numberUnavailable)

		if _, updateErr = dsClient.UpdateStatus(ctx, toUpdate, metav1.UpdateOptions{}); updateErr == nil {
			return nil
		}

		// Stop retrying if we exceed StatusUpdateRetries - the DaemonSet will be requeued with a rate limit.
		if i >= StatusUpdateRetries {
			break
		}
		// Update the set with the latest resource version for the next poll.
		if toUpdate, getErr = dsClient.Get(ctx, ds.Name, metav1.GetOptions{}); getErr != nil {
			// If the GET fails we can't trust the DaemonSet status anymore. This error
			// is bound to be more interesting than the update failure.
			return getErr
		}
	}
	return updateErr
}

func (dsc *DaemonSetsController) updateDaemonSetStatus(ctx context.Context, ds *apps.DaemonSet, nodeList []*v1.Node, hash string, updateObservedGen bool) error {
	logger := klog.FromContext(ctx)
	logger.V(4).Info("Updating daemon set status")
	nodeToDaemonPods, err := dsc.getNodesToDaemonPods(ctx, ds, false)
	if err != nil {
		return fmt.Errorf("couldn't get node to daemon pod mapping for daemon set %q: %v", ds.Name, err)
	}

	var desiredNumberScheduled, currentNumberScheduled, numberMisscheduled, numberReady, updatedNumberScheduled, numberAvailable int
	now := dsc.failedPodsBackoff.Clock.Now()
	for _, node := range nodeList {
		shouldRun, _ := NodeShouldRunDaemonPod(node, ds)
		scheduled := len(nodeToDaemonPods[node.Name]) > 0

		if shouldRun {
			desiredNumberScheduled++
			if !scheduled {
				continue
			}

			currentNumberScheduled++
			// Sort the daemon pods by creation time, so that the oldest is first.
			daemonPods := nodeToDaemonPods[node.Name]
			sort.Sort(podByCreationTimestampAndPhase(daemonPods))
			pod := daemonPods[0]
			if podutil.IsPodReady(pod) {
				numberReady++
				if podutil.IsPodAvailable(pod, ds.Spec.MinReadySeconds, metav1.Time{Time: now}) {
					numberAvailable++
				}
			}
			// If the returned error is not nil we have a parse error.
			// The controller handles this via the hash.
			generation, err := util.GetTemplateGeneration(ds)
			if err != nil {
				generation = nil
			}
			if util.IsPodUpdated(pod, hash, generation) {
				updatedNumberScheduled++
			}
		} else {
			if scheduled {
				numberMisscheduled++
			}
		}
	}
	numberUnavailable := desiredNumberScheduled - numberAvailable

	err = storeDaemonSetStatus(ctx, dsc.kubeClient.AppsV1().DaemonSets(ds.Namespace), ds, desiredNumberScheduled, currentNumberScheduled, numberMisscheduled, numberReady, updatedNumberScheduled, numberAvailable, numberUnavailable, updateObservedGen)
	if err != nil {
		return fmt.Errorf("error storing status for daemon set %#v: %w", ds, err)
	}

	// Resync the DaemonSet after MinReadySeconds as a last line of defense to guard against clock-skew.
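	// For example, with MinReadySeconds: 30, a pod that just became ready counts toward
	// numberReady but not yet toward numberAvailable, so we requeue after 30s to recompute
	// availability even if no further pod events arrive.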
	if ds.Spec.MinReadySeconds > 0 && numberReady != numberAvailable {
		dsc.enqueueDaemonSetAfter(ds, time.Duration(ds.Spec.MinReadySeconds)*time.Second)
	}
	return nil
}

func (dsc *DaemonSetsController) syncDaemonSet(ctx context.Context, key string) error {
	logger := klog.FromContext(ctx)
	startTime := dsc.failedPodsBackoff.Clock.Now()

	defer func() {
		logger.V(4).Info("Finished syncing daemon set", "daemonset", key, "time", dsc.failedPodsBackoff.Clock.Now().Sub(startTime))
	}()

	namespace, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		return err
	}
	ds, err := dsc.dsLister.DaemonSets(namespace).Get(name)
	if apierrors.IsNotFound(err) {
		logger.V(3).Info("Daemon set has been deleted", "daemonset", key)
		dsc.expectations.DeleteExpectations(logger, key)
		return nil
	}
	if err != nil {
		return fmt.Errorf("unable to retrieve ds %v from store: %v", key, err)
	}

	nodeList, err := dsc.nodeLister.List(labels.Everything())
	if err != nil {
		return fmt.Errorf("couldn't get list of nodes when syncing daemon set %#v: %v", ds, err)
	}

	everything := metav1.LabelSelector{}
	if reflect.DeepEqual(ds.Spec.Selector, &everything) {
		dsc.eventRecorder.Eventf(ds, v1.EventTypeWarning, SelectingAllReason, "This daemon set is selecting all pods. A non-empty selector is required.")
		return nil
	}

	// Don't process a daemon set until all its creations and deletions have been processed.
	// For example if daemon set foo asked for 3 new daemon pods in the previous call to manage,
	// then we do not want to call manage on foo until the daemon pods have been created.
	dsKey, err := controller.KeyFunc(ds)
	if err != nil {
		return fmt.Errorf("couldn't get key for object %#v: %v", ds, err)
	}

	// If the DaemonSet is being deleted (either by foreground deletion or
	// orphan deletion), we cannot be sure if the DaemonSet history objects
	// it owned still exist -- those history objects can either be deleted
	// or orphaned. Garbage collector doesn't guarantee that it will delete
	// DaemonSet pods before deleting DaemonSet history objects, because
	// DaemonSet history doesn't own DaemonSet pods. We cannot reliably
	// calculate the status of a DaemonSet being deleted. Therefore, return
	// here without updating status for the DaemonSet being deleted.
	if ds.DeletionTimestamp != nil {
		return nil
	}

	// Construct histories of the DaemonSet, and get the hash of current history
	cur, old, err := dsc.constructHistory(ctx, ds)
	if err != nil {
		return fmt.Errorf("failed to construct revisions of DaemonSet: %v", err)
	}
	hash := cur.Labels[apps.DefaultDaemonSetUniqueLabelKey]

	if !dsc.expectations.SatisfiedExpectations(logger, dsKey) {
		// Only update status. Don't raise observedGeneration since controller didn't process object of that generation.
		return dsc.updateDaemonSetStatus(ctx, ds, nodeList, hash, false)
	}

	err = dsc.updateDaemonSet(ctx, ds, nodeList, hash, dsKey, old)
	statusErr := dsc.updateDaemonSetStatus(ctx, ds, nodeList, hash, true)
	switch {
	case err != nil && statusErr != nil:
		// If there was an error, and we failed to update status,
		// log it and return the original error.
		logger.Error(statusErr, "Failed to update status", "daemonSet", klog.KObj(ds))
		return err
	case err != nil:
		return err
	case statusErr != nil:
		return statusErr
	}

	return nil
}

// NodeShouldRunDaemonPod checks a set of preconditions against a (node, daemonset) pair and returns a
// summary. Returned booleans are:
//   - shouldRun:
//     Returns true when a daemonset should run on the node if a daemonset pod is not already
//     running on that node.
//   - shouldContinueRunning:
//     Returns true when a daemonset should continue running on a node if a daemonset pod is already
//     running on that node.
func NodeShouldRunDaemonPod(node *v1.Node, ds *apps.DaemonSet) (bool, bool) {
	pod := NewPod(ds, node.Name)

	// If the daemon set specifies a node name, check that it matches with node.Name.
	if !(ds.Spec.Template.Spec.NodeName == "" || ds.Spec.Template.Spec.NodeName == node.Name) {
		return false, false
	}

	taints := node.Spec.Taints
	fitsNodeName, fitsNodeAffinity, fitsTaints := predicates(pod, node, taints)
	if !fitsNodeName || !fitsNodeAffinity {
		return false, false
	}

	if !fitsTaints {
		// Scheduled daemon pods should continue running if they tolerate the NoExecute taints.
		_, hasUntoleratedTaint := v1helper.FindMatchingUntoleratedTaint(taints, pod.Spec.Tolerations, func(t *v1.Taint) bool {
			return t.Effect == v1.TaintEffectNoExecute
		})
		return false, !hasUntoleratedTaint
	}

	return true, true
}

// predicates checks if a DaemonSet's pod can run on a node.
func predicates(pod *v1.Pod, node *v1.Node, taints []v1.Taint) (fitsNodeName, fitsNodeAffinity, fitsTaints bool) {
	fitsNodeName = len(pod.Spec.NodeName) == 0 || pod.Spec.NodeName == node.Name
	// Ignore parsing errors for backwards compatibility.
	fitsNodeAffinity, _ = nodeaffinity.GetRequiredNodeAffinity(pod).Match(node)
	_, hasUntoleratedTaint := v1helper.FindMatchingUntoleratedTaint(taints, pod.Spec.Tolerations, func(t *v1.Taint) bool {
		return t.Effect == v1.TaintEffectNoExecute || t.Effect == v1.TaintEffectNoSchedule
	})
	fitsTaints = !hasUntoleratedTaint
	return
}

// NewPod creates a new daemon pod for the given DaemonSet, targeted at nodeName.
func NewPod(ds *apps.DaemonSet, nodeName string) *v1.Pod {
	newPod := &v1.Pod{Spec: ds.Spec.Template.Spec, ObjectMeta: ds.Spec.Template.ObjectMeta}
	newPod.Namespace = ds.Namespace
	newPod.Spec.NodeName = nodeName

	// Add default tolerations for DaemonSet pods.
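	// These typically include tolerations for node.kubernetes.io/not-ready and
	// unreachable (NoExecute) as well as several node-pressure NoSchedule taints,
	// so daemon pods keep running on nodes that other workloads would be evicted
	// from; see util.AddOrUpdateDaemonPodTolerations for the exact list.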
	util.AddOrUpdateDaemonPodTolerations(&newPod.Spec)

	return newPod
}

type podByCreationTimestampAndPhase []*v1.Pod

func (o podByCreationTimestampAndPhase) Len() int      { return len(o) }
func (o podByCreationTimestampAndPhase) Swap(i, j int) { o[i], o[j] = o[j], o[i] }

func (o podByCreationTimestampAndPhase) Less(i, j int) bool {
	// Scheduled Pod first
	if len(o[i].Spec.NodeName) != 0 && len(o[j].Spec.NodeName) == 0 {
		return true
	}

	if len(o[i].Spec.NodeName) == 0 && len(o[j].Spec.NodeName) != 0 {
		return false
	}

	if o[i].CreationTimestamp.Equal(&o[j].CreationTimestamp) {
		return o[i].Name < o[j].Name
	}
	return o[i].CreationTimestamp.Before(&o[j].CreationTimestamp)
}

func failedPodsBackoffKey(ds *apps.DaemonSet, nodeName string) string {
	return fmt.Sprintf("%s/%d/%s", ds.UID, ds.Status.ObservedGeneration, nodeName)
}

// getUnscheduledPodsWithoutNode returns a list of unscheduled pods assigned to nodes that no longer exist.
// Returned pods can't be deleted by PodGCController so they should be deleted by DaemonSetController.
func getUnscheduledPodsWithoutNode(runningNodesList []*v1.Node, nodeToDaemonPods map[string][]*v1.Pod) []string {
	var results []string
	isNodeRunning := make(map[string]bool, len(runningNodesList))
	for _, node := range runningNodesList {
		isNodeRunning[node.Name] = true
	}

	for n, pods := range nodeToDaemonPods {
		if isNodeRunning[n] {
			continue
		}
		for _, pod := range pods {
			if len(pod.Spec.NodeName) == 0 {
				results = append(results, pod.Name)
			}
		}
	}

	return results
}