k8s.io/kubernetes@v1.29.3/pkg/controller/daemon/daemon_controller.go (about) 1 /* 2 Copyright 2015 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package daemon 18 19 import ( 20 "context" 21 "fmt" 22 "reflect" 23 "sort" 24 "sync" 25 "time" 26 27 "k8s.io/klog/v2" 28 29 apps "k8s.io/api/apps/v1" 30 v1 "k8s.io/api/core/v1" 31 apiequality "k8s.io/apimachinery/pkg/api/equality" 32 apierrors "k8s.io/apimachinery/pkg/api/errors" 33 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 34 "k8s.io/apimachinery/pkg/labels" 35 utilerrors "k8s.io/apimachinery/pkg/util/errors" 36 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 37 "k8s.io/apimachinery/pkg/util/wait" 38 appsinformers "k8s.io/client-go/informers/apps/v1" 39 coreinformers "k8s.io/client-go/informers/core/v1" 40 clientset "k8s.io/client-go/kubernetes" 41 "k8s.io/client-go/kubernetes/scheme" 42 unversionedapps "k8s.io/client-go/kubernetes/typed/apps/v1" 43 v1core "k8s.io/client-go/kubernetes/typed/core/v1" 44 appslisters "k8s.io/client-go/listers/apps/v1" 45 corelisters "k8s.io/client-go/listers/core/v1" 46 "k8s.io/client-go/tools/cache" 47 "k8s.io/client-go/tools/record" 48 "k8s.io/client-go/util/flowcontrol" 49 "k8s.io/client-go/util/workqueue" 50 v1helper "k8s.io/component-helpers/scheduling/corev1" 51 "k8s.io/component-helpers/scheduling/corev1/nodeaffinity" 52 podutil "k8s.io/kubernetes/pkg/api/v1/pod" 53 "k8s.io/kubernetes/pkg/controller" 54 "k8s.io/kubernetes/pkg/controller/daemon/util" 55 "k8s.io/utils/integer" 56 ) 57 58 const ( 59 // BurstReplicas is a rate limiter for booting pods on a lot of pods. 60 // The value of 250 is chosen b/c values that are too high can cause registry DoS issues. 61 BurstReplicas = 250 62 63 // StatusUpdateRetries limits the number of retries if sending a status update to API server fails. 64 StatusUpdateRetries = 1 65 66 // BackoffGCInterval is the time that has to pass before next iteration of backoff GC is run 67 BackoffGCInterval = 1 * time.Minute 68 ) 69 70 // Reasons for DaemonSet events 71 const ( 72 // SelectingAllReason is added to an event when a DaemonSet selects all Pods. 73 SelectingAllReason = "SelectingAll" 74 // FailedPlacementReason is added to an event when a DaemonSet can't schedule a Pod to a specified node. 75 FailedPlacementReason = "FailedPlacement" 76 // FailedDaemonPodReason is added to an event when the status of a Pod of a DaemonSet is 'Failed'. 77 FailedDaemonPodReason = "FailedDaemonPod" 78 // SucceededDaemonPodReason is added to an event when the status of a Pod of a DaemonSet is 'Succeeded'. 79 SucceededDaemonPodReason = "SucceededDaemonPod" 80 ) 81 82 // controllerKind contains the schema.GroupVersionKind for this controller type. 83 var controllerKind = apps.SchemeGroupVersion.WithKind("DaemonSet") 84 85 // DaemonSetsController is responsible for synchronizing DaemonSet objects stored 86 // in the system with actual running pods. 87 type DaemonSetsController struct { 88 kubeClient clientset.Interface 89 90 eventBroadcaster record.EventBroadcaster 91 eventRecorder record.EventRecorder 92 93 podControl controller.PodControlInterface 94 crControl controller.ControllerRevisionControlInterface 95 96 // An dsc is temporarily suspended after creating/deleting these many replicas. 97 // It resumes normal action after observing the watch events for them. 98 burstReplicas int 99 100 // To allow injection of syncDaemonSet for testing. 101 syncHandler func(ctx context.Context, dsKey string) error 102 // used for unit testing 103 enqueueDaemonSet func(ds *apps.DaemonSet) 104 // A TTLCache of pod creates/deletes each ds expects to see 105 expectations controller.ControllerExpectationsInterface 106 // dsLister can list/get daemonsets from the shared informer's store 107 dsLister appslisters.DaemonSetLister 108 // dsStoreSynced returns true if the daemonset store has been synced at least once. 109 // Added as a member to the struct to allow injection for testing. 110 dsStoreSynced cache.InformerSynced 111 // historyLister get list/get history from the shared informers's store 112 historyLister appslisters.ControllerRevisionLister 113 // historyStoreSynced returns true if the history store has been synced at least once. 114 // Added as a member to the struct to allow injection for testing. 115 historyStoreSynced cache.InformerSynced 116 // podLister get list/get pods from the shared informers's store 117 podLister corelisters.PodLister 118 // podStoreSynced returns true if the pod store has been synced at least once. 119 // Added as a member to the struct to allow injection for testing. 120 podStoreSynced cache.InformerSynced 121 // nodeLister can list/get nodes from the shared informer's store 122 nodeLister corelisters.NodeLister 123 // nodeStoreSynced returns true if the node store has been synced at least once. 124 // Added as a member to the struct to allow injection for testing. 125 nodeStoreSynced cache.InformerSynced 126 127 // DaemonSet keys that need to be synced. 128 queue workqueue.RateLimitingInterface 129 130 failedPodsBackoff *flowcontrol.Backoff 131 } 132 133 // NewDaemonSetsController creates a new DaemonSetsController 134 func NewDaemonSetsController( 135 ctx context.Context, 136 daemonSetInformer appsinformers.DaemonSetInformer, 137 historyInformer appsinformers.ControllerRevisionInformer, 138 podInformer coreinformers.PodInformer, 139 nodeInformer coreinformers.NodeInformer, 140 kubeClient clientset.Interface, 141 failedPodsBackoff *flowcontrol.Backoff, 142 ) (*DaemonSetsController, error) { 143 eventBroadcaster := record.NewBroadcaster() 144 logger := klog.FromContext(ctx) 145 dsc := &DaemonSetsController{ 146 kubeClient: kubeClient, 147 eventBroadcaster: eventBroadcaster, 148 eventRecorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "daemonset-controller"}), 149 podControl: controller.RealPodControl{ 150 KubeClient: kubeClient, 151 Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "daemonset-controller"}), 152 }, 153 crControl: controller.RealControllerRevisionControl{ 154 KubeClient: kubeClient, 155 }, 156 burstReplicas: BurstReplicas, 157 expectations: controller.NewControllerExpectations(), 158 queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "daemonset"), 159 } 160 161 daemonSetInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 162 AddFunc: func(obj interface{}) { 163 dsc.addDaemonset(logger, obj) 164 }, 165 UpdateFunc: func(oldObj, newObj interface{}) { 166 dsc.updateDaemonset(logger, oldObj, newObj) 167 }, 168 DeleteFunc: func(obj interface{}) { 169 dsc.deleteDaemonset(logger, obj) 170 }, 171 }) 172 dsc.dsLister = daemonSetInformer.Lister() 173 dsc.dsStoreSynced = daemonSetInformer.Informer().HasSynced 174 175 historyInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 176 AddFunc: func(obj interface{}) { 177 dsc.addHistory(logger, obj) 178 }, 179 UpdateFunc: func(oldObj, newObj interface{}) { 180 dsc.updateHistory(logger, oldObj, newObj) 181 }, 182 DeleteFunc: func(obj interface{}) { 183 dsc.deleteHistory(logger, obj) 184 }, 185 }) 186 dsc.historyLister = historyInformer.Lister() 187 dsc.historyStoreSynced = historyInformer.Informer().HasSynced 188 189 // Watch for creation/deletion of pods. The reason we watch is that we don't want a daemon set to create/delete 190 // more pods until all the effects (expectations) of a daemon set's create/delete have been observed. 191 podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 192 AddFunc: func(obj interface{}) { 193 dsc.addPod(logger, obj) 194 }, 195 UpdateFunc: func(oldObj, newObj interface{}) { 196 dsc.updatePod(logger, oldObj, newObj) 197 }, 198 DeleteFunc: func(obj interface{}) { 199 dsc.deletePod(logger, obj) 200 }, 201 }) 202 dsc.podLister = podInformer.Lister() 203 dsc.podStoreSynced = podInformer.Informer().HasSynced 204 205 nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 206 AddFunc: func(obj interface{}) { 207 dsc.addNode(logger, obj) 208 }, 209 UpdateFunc: func(oldObj, newObj interface{}) { 210 dsc.updateNode(logger, oldObj, newObj) 211 }, 212 }, 213 ) 214 dsc.nodeStoreSynced = nodeInformer.Informer().HasSynced 215 dsc.nodeLister = nodeInformer.Lister() 216 217 dsc.syncHandler = dsc.syncDaemonSet 218 dsc.enqueueDaemonSet = dsc.enqueue 219 220 dsc.failedPodsBackoff = failedPodsBackoff 221 222 return dsc, nil 223 } 224 225 func (dsc *DaemonSetsController) addDaemonset(logger klog.Logger, obj interface{}) { 226 ds := obj.(*apps.DaemonSet) 227 logger.V(4).Info("Adding daemon set", "daemonset", klog.KObj(ds)) 228 dsc.enqueueDaemonSet(ds) 229 } 230 231 func (dsc *DaemonSetsController) updateDaemonset(logger klog.Logger, cur, old interface{}) { 232 oldDS := old.(*apps.DaemonSet) 233 curDS := cur.(*apps.DaemonSet) 234 235 // TODO: make a KEP and fix informers to always call the delete event handler on re-create 236 if curDS.UID != oldDS.UID { 237 key, err := controller.KeyFunc(oldDS) 238 if err != nil { 239 utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", oldDS, err)) 240 return 241 } 242 dsc.deleteDaemonset(logger, cache.DeletedFinalStateUnknown{ 243 Key: key, 244 Obj: oldDS, 245 }) 246 } 247 248 logger.V(4).Info("Updating daemon set", "daemonset", klog.KObj(oldDS)) 249 dsc.enqueueDaemonSet(curDS) 250 } 251 252 func (dsc *DaemonSetsController) deleteDaemonset(logger klog.Logger, obj interface{}) { 253 ds, ok := obj.(*apps.DaemonSet) 254 if !ok { 255 tombstone, ok := obj.(cache.DeletedFinalStateUnknown) 256 if !ok { 257 utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %#v", obj)) 258 return 259 } 260 ds, ok = tombstone.Obj.(*apps.DaemonSet) 261 if !ok { 262 utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a DaemonSet %#v", obj)) 263 return 264 } 265 } 266 logger.V(4).Info("Deleting daemon set", "daemonset", klog.KObj(ds)) 267 268 key, err := controller.KeyFunc(ds) 269 if err != nil { 270 utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", ds, err)) 271 return 272 } 273 274 // Delete expectations for the DaemonSet so if we create a new one with the same name it starts clean 275 dsc.expectations.DeleteExpectations(logger, key) 276 277 dsc.queue.Add(key) 278 } 279 280 // Run begins watching and syncing daemon sets. 281 func (dsc *DaemonSetsController) Run(ctx context.Context, workers int) { 282 defer utilruntime.HandleCrash() 283 284 dsc.eventBroadcaster.StartStructuredLogging(0) 285 dsc.eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: dsc.kubeClient.CoreV1().Events("")}) 286 defer dsc.eventBroadcaster.Shutdown() 287 288 defer dsc.queue.ShutDown() 289 290 logger := klog.FromContext(ctx) 291 logger.Info("Starting daemon sets controller") 292 defer logger.Info("Shutting down daemon sets controller") 293 294 if !cache.WaitForNamedCacheSync("daemon sets", ctx.Done(), dsc.podStoreSynced, dsc.nodeStoreSynced, dsc.historyStoreSynced, dsc.dsStoreSynced) { 295 return 296 } 297 298 for i := 0; i < workers; i++ { 299 go wait.UntilWithContext(ctx, dsc.runWorker, time.Second) 300 } 301 302 go wait.Until(dsc.failedPodsBackoff.GC, BackoffGCInterval, ctx.Done()) 303 304 <-ctx.Done() 305 } 306 307 func (dsc *DaemonSetsController) runWorker(ctx context.Context) { 308 for dsc.processNextWorkItem(ctx) { 309 } 310 } 311 312 // processNextWorkItem deals with one key off the queue. It returns false when it's time to quit. 313 func (dsc *DaemonSetsController) processNextWorkItem(ctx context.Context) bool { 314 dsKey, quit := dsc.queue.Get() 315 if quit { 316 return false 317 } 318 defer dsc.queue.Done(dsKey) 319 320 err := dsc.syncHandler(ctx, dsKey.(string)) 321 if err == nil { 322 dsc.queue.Forget(dsKey) 323 return true 324 } 325 326 utilruntime.HandleError(fmt.Errorf("%v failed with : %v", dsKey, err)) 327 dsc.queue.AddRateLimited(dsKey) 328 329 return true 330 } 331 332 func (dsc *DaemonSetsController) enqueue(ds *apps.DaemonSet) { 333 key, err := controller.KeyFunc(ds) 334 if err != nil { 335 utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %#v: %v", ds, err)) 336 return 337 } 338 339 // TODO: Handle overlapping controllers better. See comment in ReplicationManager. 340 dsc.queue.Add(key) 341 } 342 343 func (dsc *DaemonSetsController) enqueueDaemonSetAfter(obj interface{}, after time.Duration) { 344 key, err := controller.KeyFunc(obj) 345 if err != nil { 346 utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) 347 return 348 } 349 350 // TODO: Handle overlapping controllers better. See comment in ReplicationManager. 351 dsc.queue.AddAfter(key, after) 352 } 353 354 // getDaemonSetsForPod returns a list of DaemonSets that potentially match the pod. 355 func (dsc *DaemonSetsController) getDaemonSetsForPod(pod *v1.Pod) []*apps.DaemonSet { 356 sets, err := dsc.dsLister.GetPodDaemonSets(pod) 357 if err != nil { 358 return nil 359 } 360 if len(sets) > 1 { 361 // ControllerRef will ensure we don't do anything crazy, but more than one 362 // item in this list nevertheless constitutes user error. 363 utilruntime.HandleError(fmt.Errorf("user error! more than one daemon is selecting pods with labels: %+v", pod.Labels)) 364 } 365 return sets 366 } 367 368 // getDaemonSetsForHistory returns a list of DaemonSets that potentially 369 // match a ControllerRevision. 370 func (dsc *DaemonSetsController) getDaemonSetsForHistory(logger klog.Logger, history *apps.ControllerRevision) []*apps.DaemonSet { 371 daemonSets, err := dsc.dsLister.GetHistoryDaemonSets(history) 372 if err != nil || len(daemonSets) == 0 { 373 return nil 374 } 375 if len(daemonSets) > 1 { 376 // ControllerRef will ensure we don't do anything crazy, but more than one 377 // item in this list nevertheless constitutes user error. 378 logger.V(4).Info("Found more than one DaemonSet selecting the ControllerRevision. This is potentially a user error", 379 "controllerRevision", klog.KObj(history), "labels", history.Labels) 380 } 381 return daemonSets 382 } 383 384 // addHistory enqueues the DaemonSet that manages a ControllerRevision when the ControllerRevision is created 385 // or when the controller manager is restarted. 386 func (dsc *DaemonSetsController) addHistory(logger klog.Logger, obj interface{}) { 387 history := obj.(*apps.ControllerRevision) 388 if history.DeletionTimestamp != nil { 389 // On a restart of the controller manager, it's possible for an object to 390 // show up in a state that is already pending deletion. 391 dsc.deleteHistory(logger, history) 392 return 393 } 394 395 // If it has a ControllerRef, that's all that matters. 396 if controllerRef := metav1.GetControllerOf(history); controllerRef != nil { 397 ds := dsc.resolveControllerRef(history.Namespace, controllerRef) 398 if ds == nil { 399 return 400 } 401 logger.V(4).Info("Observed a ControllerRevision", "controllerRevision", klog.KObj(history)) 402 return 403 } 404 405 // Otherwise, it's an orphan. Get a list of all matching DaemonSets and sync 406 // them to see if anyone wants to adopt it. 407 daemonSets := dsc.getDaemonSetsForHistory(logger, history) 408 if len(daemonSets) == 0 { 409 return 410 } 411 logger.V(4).Info("Orphan ControllerRevision added", "controllerRevision", klog.KObj(history)) 412 for _, ds := range daemonSets { 413 dsc.enqueueDaemonSet(ds) 414 } 415 } 416 417 // updateHistory figures out what DaemonSet(s) manage a ControllerRevision when the ControllerRevision 418 // is updated and wake them up. If anything of the ControllerRevision has changed, we need to awaken 419 // both the old and new DaemonSets. 420 func (dsc *DaemonSetsController) updateHistory(logger klog.Logger, old, cur interface{}) { 421 curHistory := cur.(*apps.ControllerRevision) 422 oldHistory := old.(*apps.ControllerRevision) 423 if curHistory.ResourceVersion == oldHistory.ResourceVersion { 424 // Periodic resync will send update events for all known ControllerRevisions. 425 return 426 } 427 428 curControllerRef := metav1.GetControllerOf(curHistory) 429 oldControllerRef := metav1.GetControllerOf(oldHistory) 430 controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef) 431 if controllerRefChanged && oldControllerRef != nil { 432 // The ControllerRef was changed. Sync the old controller, if any. 433 if ds := dsc.resolveControllerRef(oldHistory.Namespace, oldControllerRef); ds != nil { 434 dsc.enqueueDaemonSet(ds) 435 } 436 } 437 438 // If it has a ControllerRef, that's all that matters. 439 if curControllerRef != nil { 440 ds := dsc.resolveControllerRef(curHistory.Namespace, curControllerRef) 441 if ds == nil { 442 return 443 } 444 logger.V(4).Info("Observed an update to a ControllerRevision", "controllerRevision", klog.KObj(curHistory)) 445 dsc.enqueueDaemonSet(ds) 446 return 447 } 448 449 // Otherwise, it's an orphan. If anything changed, sync matching controllers 450 // to see if anyone wants to adopt it now. 451 labelChanged := !reflect.DeepEqual(curHistory.Labels, oldHistory.Labels) 452 if labelChanged || controllerRefChanged { 453 daemonSets := dsc.getDaemonSetsForHistory(logger, curHistory) 454 if len(daemonSets) == 0 { 455 return 456 } 457 logger.V(4).Info("Orphan ControllerRevision updated", "controllerRevision", klog.KObj(curHistory)) 458 for _, ds := range daemonSets { 459 dsc.enqueueDaemonSet(ds) 460 } 461 } 462 } 463 464 // deleteHistory enqueues the DaemonSet that manages a ControllerRevision when 465 // the ControllerRevision is deleted. obj could be an *app.ControllerRevision, or 466 // a DeletionFinalStateUnknown marker item. 467 func (dsc *DaemonSetsController) deleteHistory(logger klog.Logger, obj interface{}) { 468 history, ok := obj.(*apps.ControllerRevision) 469 470 // When a delete is dropped, the relist will notice a ControllerRevision in the store not 471 // in the list, leading to the insertion of a tombstone object which contains 472 // the deleted key/value. Note that this value might be stale. If the ControllerRevision 473 // changed labels the new DaemonSet will not be woken up till the periodic resync. 474 if !ok { 475 tombstone, ok := obj.(cache.DeletedFinalStateUnknown) 476 if !ok { 477 utilruntime.HandleError(fmt.Errorf("Couldn't get object from tombstone %#v", obj)) 478 return 479 } 480 history, ok = tombstone.Obj.(*apps.ControllerRevision) 481 if !ok { 482 utilruntime.HandleError(fmt.Errorf("Tombstone contained object that is not a ControllerRevision %#v", obj)) 483 return 484 } 485 } 486 487 controllerRef := metav1.GetControllerOf(history) 488 if controllerRef == nil { 489 // No controller should care about orphans being deleted. 490 return 491 } 492 ds := dsc.resolveControllerRef(history.Namespace, controllerRef) 493 if ds == nil { 494 return 495 } 496 logger.V(4).Info("ControllerRevision deleted", "controllerRevision", klog.KObj(history)) 497 dsc.enqueueDaemonSet(ds) 498 } 499 500 func (dsc *DaemonSetsController) addPod(logger klog.Logger, obj interface{}) { 501 pod := obj.(*v1.Pod) 502 503 if pod.DeletionTimestamp != nil { 504 // on a restart of the controller manager, it's possible a new pod shows up in a state that 505 // is already pending deletion. Prevent the pod from being a creation observation. 506 dsc.deletePod(logger, pod) 507 return 508 } 509 510 // If it has a ControllerRef, that's all that matters. 511 if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil { 512 ds := dsc.resolveControllerRef(pod.Namespace, controllerRef) 513 if ds == nil { 514 return 515 } 516 dsKey, err := controller.KeyFunc(ds) 517 if err != nil { 518 return 519 } 520 logger.V(4).Info("Pod added", "pod", klog.KObj(pod)) 521 dsc.expectations.CreationObserved(logger, dsKey) 522 dsc.enqueueDaemonSet(ds) 523 return 524 } 525 526 // Otherwise, it's an orphan. Get a list of all matching DaemonSets and sync 527 // them to see if anyone wants to adopt it. 528 // DO NOT observe creation because no controller should be waiting for an 529 // orphan. 530 dss := dsc.getDaemonSetsForPod(pod) 531 if len(dss) == 0 { 532 return 533 } 534 logger.V(4).Info("Orphan Pod added", "pod", klog.KObj(pod)) 535 for _, ds := range dss { 536 dsc.enqueueDaemonSet(ds) 537 } 538 } 539 540 // When a pod is updated, figure out what sets manage it and wake them 541 // up. If the labels of the pod have changed we need to awaken both the old 542 // and new set. old and cur must be *v1.Pod types. 543 func (dsc *DaemonSetsController) updatePod(logger klog.Logger, old, cur interface{}) { 544 curPod := cur.(*v1.Pod) 545 oldPod := old.(*v1.Pod) 546 if curPod.ResourceVersion == oldPod.ResourceVersion { 547 // Periodic resync will send update events for all known pods. 548 // Two different versions of the same pod will always have different RVs. 549 return 550 } 551 552 if curPod.DeletionTimestamp != nil { 553 // when a pod is deleted gracefully its deletion timestamp is first modified to reflect a grace period, 554 // and after such time has passed, the kubelet actually deletes it from the store. We receive an update 555 // for modification of the deletion timestamp and expect an ds to create more replicas asap, not wait 556 // until the kubelet actually deletes the pod. 557 dsc.deletePod(logger, curPod) 558 return 559 } 560 561 curControllerRef := metav1.GetControllerOf(curPod) 562 oldControllerRef := metav1.GetControllerOf(oldPod) 563 controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef) 564 if controllerRefChanged && oldControllerRef != nil { 565 // The ControllerRef was changed. Sync the old controller, if any. 566 if ds := dsc.resolveControllerRef(oldPod.Namespace, oldControllerRef); ds != nil { 567 dsc.enqueueDaemonSet(ds) 568 } 569 } 570 571 // If it has a ControllerRef, that's all that matters. 572 if curControllerRef != nil { 573 ds := dsc.resolveControllerRef(curPod.Namespace, curControllerRef) 574 if ds == nil { 575 return 576 } 577 logger.V(4).Info("Pod updated", "pod", klog.KObj(curPod)) 578 dsc.enqueueDaemonSet(ds) 579 changedToReady := !podutil.IsPodReady(oldPod) && podutil.IsPodReady(curPod) 580 // See https://github.com/kubernetes/kubernetes/pull/38076 for more details 581 if changedToReady && ds.Spec.MinReadySeconds > 0 { 582 // Add a second to avoid milliseconds skew in AddAfter. 583 // See https://github.com/kubernetes/kubernetes/issues/39785#issuecomment-279959133 for more info. 584 dsc.enqueueDaemonSetAfter(ds, (time.Duration(ds.Spec.MinReadySeconds)*time.Second)+time.Second) 585 } 586 return 587 } 588 589 // Otherwise, it's an orphan. If anything changed, sync matching controllers 590 // to see if anyone wants to adopt it now. 591 dss := dsc.getDaemonSetsForPod(curPod) 592 if len(dss) == 0 { 593 return 594 } 595 logger.V(4).Info("Orphan Pod updated", "pod", klog.KObj(curPod)) 596 labelChanged := !reflect.DeepEqual(curPod.Labels, oldPod.Labels) 597 if labelChanged || controllerRefChanged { 598 for _, ds := range dss { 599 dsc.enqueueDaemonSet(ds) 600 } 601 } 602 } 603 604 func (dsc *DaemonSetsController) deletePod(logger klog.Logger, obj interface{}) { 605 pod, ok := obj.(*v1.Pod) 606 // When a delete is dropped, the relist will notice a pod in the store not 607 // in the list, leading to the insertion of a tombstone object which contains 608 // the deleted key/value. Note that this value might be stale. If the pod 609 // changed labels the new daemonset will not be woken up till the periodic 610 // resync. 611 if !ok { 612 tombstone, ok := obj.(cache.DeletedFinalStateUnknown) 613 if !ok { 614 utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %#v", obj)) 615 return 616 } 617 pod, ok = tombstone.Obj.(*v1.Pod) 618 if !ok { 619 utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %#v", obj)) 620 return 621 } 622 } 623 624 controllerRef := metav1.GetControllerOf(pod) 625 if controllerRef == nil { 626 // No controller should care about orphans being deleted. 627 return 628 } 629 ds := dsc.resolveControllerRef(pod.Namespace, controllerRef) 630 if ds == nil { 631 return 632 } 633 dsKey, err := controller.KeyFunc(ds) 634 if err != nil { 635 return 636 } 637 logger.V(4).Info("Pod deleted", "pod", klog.KObj(pod)) 638 dsc.expectations.DeletionObserved(logger, dsKey) 639 dsc.enqueueDaemonSet(ds) 640 } 641 642 func (dsc *DaemonSetsController) addNode(logger klog.Logger, obj interface{}) { 643 // TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the added node (unless it has other work to do, too). 644 dsList, err := dsc.dsLister.List(labels.Everything()) 645 if err != nil { 646 logger.V(4).Info("Error enqueueing daemon sets", "err", err) 647 return 648 } 649 node := obj.(*v1.Node) 650 for _, ds := range dsList { 651 if shouldRun, _ := NodeShouldRunDaemonPod(node, ds); shouldRun { 652 dsc.enqueueDaemonSet(ds) 653 } 654 } 655 } 656 657 // nodeInSameCondition returns true if all effective types ("Status" is true) equals; 658 // otherwise, returns false. 659 func nodeInSameCondition(old []v1.NodeCondition, cur []v1.NodeCondition) bool { 660 if len(old) == 0 && len(cur) == 0 { 661 return true 662 } 663 664 c1map := map[v1.NodeConditionType]v1.ConditionStatus{} 665 for _, c := range old { 666 if c.Status == v1.ConditionTrue { 667 c1map[c.Type] = c.Status 668 } 669 } 670 671 for _, c := range cur { 672 if c.Status != v1.ConditionTrue { 673 continue 674 } 675 676 if _, found := c1map[c.Type]; !found { 677 return false 678 } 679 680 delete(c1map, c.Type) 681 } 682 683 return len(c1map) == 0 684 } 685 686 func shouldIgnoreNodeUpdate(oldNode, curNode v1.Node) bool { 687 if !nodeInSameCondition(oldNode.Status.Conditions, curNode.Status.Conditions) { 688 return false 689 } 690 oldNode.ResourceVersion = curNode.ResourceVersion 691 oldNode.Status.Conditions = curNode.Status.Conditions 692 return apiequality.Semantic.DeepEqual(oldNode, curNode) 693 } 694 695 func (dsc *DaemonSetsController) updateNode(logger klog.Logger, old, cur interface{}) { 696 oldNode := old.(*v1.Node) 697 curNode := cur.(*v1.Node) 698 if shouldIgnoreNodeUpdate(*oldNode, *curNode) { 699 return 700 } 701 702 dsList, err := dsc.dsLister.List(labels.Everything()) 703 if err != nil { 704 logger.V(4).Info("Error listing daemon sets", "err", err) 705 return 706 } 707 // TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the added node (unless it has other work to do, too). 708 for _, ds := range dsList { 709 oldShouldRun, oldShouldContinueRunning := NodeShouldRunDaemonPod(oldNode, ds) 710 currentShouldRun, currentShouldContinueRunning := NodeShouldRunDaemonPod(curNode, ds) 711 if (oldShouldRun != currentShouldRun) || (oldShouldContinueRunning != currentShouldContinueRunning) { 712 dsc.enqueueDaemonSet(ds) 713 } 714 } 715 } 716 717 // getDaemonPods returns daemon pods owned by the given ds. 718 // This also reconciles ControllerRef by adopting/orphaning. 719 // Note that returned Pods are pointers to objects in the cache. 720 // If you want to modify one, you need to deep-copy it first. 721 func (dsc *DaemonSetsController) getDaemonPods(ctx context.Context, ds *apps.DaemonSet) ([]*v1.Pod, error) { 722 selector, err := metav1.LabelSelectorAsSelector(ds.Spec.Selector) 723 if err != nil { 724 return nil, err 725 } 726 727 // List all pods to include those that don't match the selector anymore but 728 // have a ControllerRef pointing to this controller. 729 pods, err := dsc.podLister.Pods(ds.Namespace).List(labels.Everything()) 730 if err != nil { 731 return nil, err 732 } 733 // If any adoptions are attempted, we should first recheck for deletion with 734 // an uncached quorum read sometime after listing Pods (see #42639). 735 dsNotDeleted := controller.RecheckDeletionTimestamp(func(ctx context.Context) (metav1.Object, error) { 736 fresh, err := dsc.kubeClient.AppsV1().DaemonSets(ds.Namespace).Get(ctx, ds.Name, metav1.GetOptions{}) 737 if err != nil { 738 return nil, err 739 } 740 if fresh.UID != ds.UID { 741 return nil, fmt.Errorf("original DaemonSet %v/%v is gone: got uid %v, wanted %v", ds.Namespace, ds.Name, fresh.UID, ds.UID) 742 } 743 return fresh, nil 744 }) 745 746 // Use ControllerRefManager to adopt/orphan as needed. 747 cm := controller.NewPodControllerRefManager(dsc.podControl, ds, selector, controllerKind, dsNotDeleted) 748 return cm.ClaimPods(ctx, pods) 749 } 750 751 // getNodesToDaemonPods returns a map from nodes to daemon pods (corresponding to ds) created for the nodes. 752 // This also reconciles ControllerRef by adopting/orphaning. 753 // Note that returned Pods are pointers to objects in the cache. 754 // If you want to modify one, you need to deep-copy it first. 755 func (dsc *DaemonSetsController) getNodesToDaemonPods(ctx context.Context, ds *apps.DaemonSet, includeDeletedTerminal bool) (map[string][]*v1.Pod, error) { 756 claimedPods, err := dsc.getDaemonPods(ctx, ds) 757 if err != nil { 758 return nil, err 759 } 760 // Group Pods by Node name. 761 nodeToDaemonPods := make(map[string][]*v1.Pod) 762 logger := klog.FromContext(ctx) 763 for _, pod := range claimedPods { 764 if !includeDeletedTerminal && podutil.IsPodTerminal(pod) && pod.DeletionTimestamp != nil { 765 // This Pod has a finalizer or is already scheduled for deletion from the 766 // store by the kubelet or the Pod GC. The DS controller doesn't have 767 // anything else to do with it. 768 continue 769 } 770 nodeName, err := util.GetTargetNodeName(pod) 771 if err != nil { 772 logger.V(4).Info("Failed to get target node name of Pod in DaemonSet", 773 "pod", klog.KObj(pod), "daemonset", klog.KObj(ds)) 774 continue 775 } 776 777 nodeToDaemonPods[nodeName] = append(nodeToDaemonPods[nodeName], pod) 778 } 779 780 return nodeToDaemonPods, nil 781 } 782 783 // resolveControllerRef returns the controller referenced by a ControllerRef, 784 // or nil if the ControllerRef could not be resolved to a matching controller 785 // of the correct Kind. 786 func (dsc *DaemonSetsController) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *apps.DaemonSet { 787 // We can't look up by UID, so look up by Name and then verify UID. 788 // Don't even try to look up by Name if it's the wrong Kind. 789 if controllerRef.Kind != controllerKind.Kind { 790 return nil 791 } 792 ds, err := dsc.dsLister.DaemonSets(namespace).Get(controllerRef.Name) 793 if err != nil { 794 return nil 795 } 796 if ds.UID != controllerRef.UID { 797 // The controller we found with this Name is not the same one that the 798 // ControllerRef points to. 799 return nil 800 } 801 return ds 802 } 803 804 // podsShouldBeOnNode figures out the DaemonSet pods to be created and deleted on the given node: 805 // - nodesNeedingDaemonPods: the pods need to start on the node 806 // - podsToDelete: the Pods need to be deleted on the node 807 // - err: unexpected error 808 func (dsc *DaemonSetsController) podsShouldBeOnNode( 809 logger klog.Logger, 810 node *v1.Node, 811 nodeToDaemonPods map[string][]*v1.Pod, 812 ds *apps.DaemonSet, 813 hash string, 814 ) (nodesNeedingDaemonPods, podsToDelete []string) { 815 816 shouldRun, shouldContinueRunning := NodeShouldRunDaemonPod(node, ds) 817 daemonPods, exists := nodeToDaemonPods[node.Name] 818 819 switch { 820 case shouldRun && !exists: 821 // If daemon pod is supposed to be running on node, but isn't, create daemon pod. 822 nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name) 823 case shouldContinueRunning: 824 // If a daemon pod failed, delete it 825 // If there's non-daemon pods left on this node, we will create it in the next sync loop 826 var daemonPodsRunning []*v1.Pod 827 for _, pod := range daemonPods { 828 if pod.DeletionTimestamp != nil { 829 continue 830 } 831 if pod.Status.Phase == v1.PodFailed { 832 // This is a critical place where DS is often fighting with kubelet that rejects pods. 833 // We need to avoid hot looping and backoff. 834 backoffKey := failedPodsBackoffKey(ds, node.Name) 835 836 now := dsc.failedPodsBackoff.Clock.Now() 837 inBackoff := dsc.failedPodsBackoff.IsInBackOffSinceUpdate(backoffKey, now) 838 if inBackoff { 839 delay := dsc.failedPodsBackoff.Get(backoffKey) 840 logger.V(4).Info("Deleting failed pod on node has been limited by backoff", 841 "pod", klog.KObj(pod), "node", klog.KObj(node), "currentDelay", delay) 842 dsc.enqueueDaemonSetAfter(ds, delay) 843 continue 844 } 845 846 dsc.failedPodsBackoff.Next(backoffKey, now) 847 848 msg := fmt.Sprintf("Found failed daemon pod %s/%s on node %s, will try to kill it", pod.Namespace, pod.Name, node.Name) 849 logger.V(2).Info("Found failed daemon pod on node, will try to kill it", "pod", klog.KObj(pod), "node", klog.KObj(node)) 850 // Emit an event so that it's discoverable to users. 851 dsc.eventRecorder.Eventf(ds, v1.EventTypeWarning, FailedDaemonPodReason, msg) 852 podsToDelete = append(podsToDelete, pod.Name) 853 } else if pod.Status.Phase == v1.PodSucceeded { 854 msg := fmt.Sprintf("Found succeeded daemon pod %s/%s on node %s, will try to delete it", pod.Namespace, pod.Name, node.Name) 855 logger.V(2).Info("Found succeeded daemon pod on node, will try to delete it", "pod", klog.KObj(pod), "node", klog.KObj(node)) 856 // Emit an event so that it's discoverable to users. 857 dsc.eventRecorder.Eventf(ds, v1.EventTypeNormal, SucceededDaemonPodReason, msg) 858 podsToDelete = append(podsToDelete, pod.Name) 859 } else { 860 daemonPodsRunning = append(daemonPodsRunning, pod) 861 } 862 } 863 864 // When surge is not enabled, if there is more than 1 running pod on a node delete all but the oldest 865 if !util.AllowsSurge(ds) { 866 if len(daemonPodsRunning) <= 1 { 867 // There are no excess pods to be pruned, and no pods to create 868 break 869 } 870 871 sort.Sort(podByCreationTimestampAndPhase(daemonPodsRunning)) 872 for i := 1; i < len(daemonPodsRunning); i++ { 873 podsToDelete = append(podsToDelete, daemonPodsRunning[i].Name) 874 } 875 break 876 } 877 878 if len(daemonPodsRunning) <= 1 { 879 // // There are no excess pods to be pruned 880 if len(daemonPodsRunning) == 0 && shouldRun { 881 // We are surging so we need to have at least one non-deleted pod on the node 882 nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name) 883 } 884 break 885 } 886 887 // When surge is enabled, we allow 2 pods if and only if the oldest pod matching the current hash state 888 // is not ready AND the oldest pod that doesn't match the current hash state is ready. All other pods are 889 // deleted. If neither pod is ready, only the one matching the current hash revision is kept. 890 var oldestNewPod, oldestOldPod *v1.Pod 891 sort.Sort(podByCreationTimestampAndPhase(daemonPodsRunning)) 892 for _, pod := range daemonPodsRunning { 893 if pod.Labels[apps.ControllerRevisionHashLabelKey] == hash { 894 if oldestNewPod == nil { 895 oldestNewPod = pod 896 continue 897 } 898 } else { 899 if oldestOldPod == nil { 900 oldestOldPod = pod 901 continue 902 } 903 } 904 podsToDelete = append(podsToDelete, pod.Name) 905 } 906 if oldestNewPod != nil && oldestOldPod != nil { 907 switch { 908 case !podutil.IsPodReady(oldestOldPod): 909 logger.V(5).Info("Pod from daemonset is no longer ready and will be replaced with newer pod", "oldPod", klog.KObj(oldestOldPod), "daemonset", klog.KObj(ds), "newPod", klog.KObj(oldestNewPod)) 910 podsToDelete = append(podsToDelete, oldestOldPod.Name) 911 case podutil.IsPodAvailable(oldestNewPod, ds.Spec.MinReadySeconds, metav1.Time{Time: dsc.failedPodsBackoff.Clock.Now()}): 912 logger.V(5).Info("Pod from daemonset is now ready and will replace older pod", "newPod", klog.KObj(oldestNewPod), "daemonset", klog.KObj(ds), "oldPod", klog.KObj(oldestOldPod)) 913 podsToDelete = append(podsToDelete, oldestOldPod.Name) 914 } 915 } 916 917 case !shouldContinueRunning && exists: 918 // If daemon pod isn't supposed to run on node, but it is, delete all daemon pods on node. 919 for _, pod := range daemonPods { 920 if pod.DeletionTimestamp != nil { 921 continue 922 } 923 podsToDelete = append(podsToDelete, pod.Name) 924 } 925 } 926 927 return nodesNeedingDaemonPods, podsToDelete 928 } 929 930 func (dsc *DaemonSetsController) updateDaemonSet(ctx context.Context, ds *apps.DaemonSet, nodeList []*v1.Node, hash, key string, old []*apps.ControllerRevision) error { 931 err := dsc.manage(ctx, ds, nodeList, hash) 932 if err != nil { 933 return err 934 } 935 936 // Process rolling updates if we're ready. 937 if dsc.expectations.SatisfiedExpectations(klog.FromContext(ctx), key) { 938 switch ds.Spec.UpdateStrategy.Type { 939 case apps.OnDeleteDaemonSetStrategyType: 940 case apps.RollingUpdateDaemonSetStrategyType: 941 err = dsc.rollingUpdate(ctx, ds, nodeList, hash) 942 } 943 if err != nil { 944 return err 945 } 946 } 947 948 err = dsc.cleanupHistory(ctx, ds, old) 949 if err != nil { 950 return fmt.Errorf("failed to clean up revisions of DaemonSet: %w", err) 951 } 952 953 return nil 954 } 955 956 // manage manages the scheduling and running of Pods of ds on nodes. 957 // After figuring out which nodes should run a Pod of ds but not yet running one and 958 // which nodes should not run a Pod of ds but currently running one, it calls function 959 // syncNodes with a list of pods to remove and a list of nodes to run a Pod of ds. 960 func (dsc *DaemonSetsController) manage(ctx context.Context, ds *apps.DaemonSet, nodeList []*v1.Node, hash string) error { 961 // Find out the pods which are created for the nodes by DaemonSet. 962 nodeToDaemonPods, err := dsc.getNodesToDaemonPods(ctx, ds, false) 963 if err != nil { 964 return fmt.Errorf("couldn't get node to daemon pod mapping for daemon set %q: %v", ds.Name, err) 965 } 966 967 // For each node, if the node is running the daemon pod but isn't supposed to, kill the daemon 968 // pod. If the node is supposed to run the daemon pod, but isn't, create the daemon pod on the node. 969 logger := klog.FromContext(ctx) 970 var nodesNeedingDaemonPods, podsToDelete []string 971 for _, node := range nodeList { 972 nodesNeedingDaemonPodsOnNode, podsToDeleteOnNode := dsc.podsShouldBeOnNode( 973 logger, node, nodeToDaemonPods, ds, hash) 974 975 nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, nodesNeedingDaemonPodsOnNode...) 976 podsToDelete = append(podsToDelete, podsToDeleteOnNode...) 977 } 978 979 // Remove unscheduled pods assigned to not existing nodes when daemonset pods are scheduled by scheduler. 980 // If node doesn't exist then pods are never scheduled and can't be deleted by PodGCController. 981 podsToDelete = append(podsToDelete, getUnscheduledPodsWithoutNode(nodeList, nodeToDaemonPods)...) 982 983 // Label new pods using the hash label value of the current history when creating them 984 if err = dsc.syncNodes(ctx, ds, podsToDelete, nodesNeedingDaemonPods, hash); err != nil { 985 return err 986 } 987 988 return nil 989 } 990 991 // syncNodes deletes given pods and creates new daemon set pods on the given nodes 992 // returns slice with errors if any 993 func (dsc *DaemonSetsController) syncNodes(ctx context.Context, ds *apps.DaemonSet, podsToDelete, nodesNeedingDaemonPods []string, hash string) error { 994 // We need to set expectations before creating/deleting pods to avoid race conditions. 995 logger := klog.FromContext(ctx) 996 dsKey, err := controller.KeyFunc(ds) 997 if err != nil { 998 return fmt.Errorf("couldn't get key for object %#v: %v", ds, err) 999 } 1000 1001 createDiff := len(nodesNeedingDaemonPods) 1002 deleteDiff := len(podsToDelete) 1003 1004 if createDiff > dsc.burstReplicas { 1005 createDiff = dsc.burstReplicas 1006 } 1007 if deleteDiff > dsc.burstReplicas { 1008 deleteDiff = dsc.burstReplicas 1009 } 1010 1011 dsc.expectations.SetExpectations(logger, dsKey, createDiff, deleteDiff) 1012 1013 // error channel to communicate back failures. make the buffer big enough to avoid any blocking 1014 errCh := make(chan error, createDiff+deleteDiff) 1015 1016 logger.V(4).Info("Nodes needing daemon pods for daemon set, creating", "daemonset", klog.KObj(ds), "needCount", nodesNeedingDaemonPods, "createCount", createDiff) 1017 createWait := sync.WaitGroup{} 1018 // If the returned error is not nil we have a parse error. 1019 // The controller handles this via the hash. 1020 generation, err := util.GetTemplateGeneration(ds) 1021 if err != nil { 1022 generation = nil 1023 } 1024 template := util.CreatePodTemplate(ds.Spec.Template, generation, hash) 1025 // Batch the pod creates. Batch sizes start at SlowStartInitialBatchSize 1026 // and double with each successful iteration in a kind of "slow start". 1027 // This handles attempts to start large numbers of pods that would 1028 // likely all fail with the same error. For example a project with a 1029 // low quota that attempts to create a large number of pods will be 1030 // prevented from spamming the API service with the pod create requests 1031 // after one of its pods fails. Conveniently, this also prevents the 1032 // event spam that those failures would generate. 1033 batchSize := integer.IntMin(createDiff, controller.SlowStartInitialBatchSize) 1034 for pos := 0; createDiff > pos; batchSize, pos = integer.IntMin(2*batchSize, createDiff-(pos+batchSize)), pos+batchSize { 1035 errorCount := len(errCh) 1036 createWait.Add(batchSize) 1037 for i := pos; i < pos+batchSize; i++ { 1038 go func(ix int) { 1039 defer createWait.Done() 1040 1041 podTemplate := template.DeepCopy() 1042 // The pod's NodeAffinity will be updated to make sure the Pod is bound 1043 // to the target node by default scheduler. It is safe to do so because there 1044 // should be no conflicting node affinity with the target node. 1045 podTemplate.Spec.Affinity = util.ReplaceDaemonSetPodNodeNameNodeAffinity( 1046 podTemplate.Spec.Affinity, nodesNeedingDaemonPods[ix]) 1047 1048 err := dsc.podControl.CreatePods(ctx, ds.Namespace, podTemplate, 1049 ds, metav1.NewControllerRef(ds, controllerKind)) 1050 1051 if err != nil { 1052 if apierrors.HasStatusCause(err, v1.NamespaceTerminatingCause) { 1053 // If the namespace is being torn down, we can safely ignore 1054 // this error since all subsequent creations will fail. 1055 return 1056 } 1057 } 1058 if err != nil { 1059 logger.V(2).Info("Failed creation, decrementing expectations for daemon set", "daemonset", klog.KObj(ds)) 1060 dsc.expectations.CreationObserved(logger, dsKey) 1061 errCh <- err 1062 utilruntime.HandleError(err) 1063 } 1064 }(i) 1065 } 1066 createWait.Wait() 1067 // any skipped pods that we never attempted to start shouldn't be expected. 1068 skippedPods := createDiff - (batchSize + pos) 1069 if errorCount < len(errCh) && skippedPods > 0 { 1070 logger.V(2).Info("Slow-start failure. Skipping creation pods, decrementing expectations for daemon set", "skippedPods", skippedPods, "daemonset", klog.KObj(ds)) 1071 dsc.expectations.LowerExpectations(logger, dsKey, skippedPods, 0) 1072 // The skipped pods will be retried later. The next controller resync will 1073 // retry the slow start process. 1074 break 1075 } 1076 } 1077 1078 logger.V(4).Info("Pods to delete for daemon set, deleting", "daemonset", klog.KObj(ds), "toDeleteCount", podsToDelete, "deleteCount", deleteDiff) 1079 deleteWait := sync.WaitGroup{} 1080 deleteWait.Add(deleteDiff) 1081 for i := 0; i < deleteDiff; i++ { 1082 go func(ix int) { 1083 defer deleteWait.Done() 1084 if err := dsc.podControl.DeletePod(ctx, ds.Namespace, podsToDelete[ix], ds); err != nil { 1085 dsc.expectations.DeletionObserved(logger, dsKey) 1086 if !apierrors.IsNotFound(err) { 1087 logger.V(2).Info("Failed deletion, decremented expectations for daemon set", "daemonset", klog.KObj(ds)) 1088 errCh <- err 1089 utilruntime.HandleError(err) 1090 } 1091 } 1092 }(i) 1093 } 1094 deleteWait.Wait() 1095 1096 // collect errors if any for proper reporting/retry logic in the controller 1097 errors := []error{} 1098 close(errCh) 1099 for err := range errCh { 1100 errors = append(errors, err) 1101 } 1102 return utilerrors.NewAggregate(errors) 1103 } 1104 1105 func storeDaemonSetStatus( 1106 ctx context.Context, 1107 dsClient unversionedapps.DaemonSetInterface, 1108 ds *apps.DaemonSet, desiredNumberScheduled, 1109 currentNumberScheduled, 1110 numberMisscheduled, 1111 numberReady, 1112 updatedNumberScheduled, 1113 numberAvailable, 1114 numberUnavailable int, 1115 updateObservedGen bool) error { 1116 if int(ds.Status.DesiredNumberScheduled) == desiredNumberScheduled && 1117 int(ds.Status.CurrentNumberScheduled) == currentNumberScheduled && 1118 int(ds.Status.NumberMisscheduled) == numberMisscheduled && 1119 int(ds.Status.NumberReady) == numberReady && 1120 int(ds.Status.UpdatedNumberScheduled) == updatedNumberScheduled && 1121 int(ds.Status.NumberAvailable) == numberAvailable && 1122 int(ds.Status.NumberUnavailable) == numberUnavailable && 1123 ds.Status.ObservedGeneration >= ds.Generation { 1124 return nil 1125 } 1126 1127 toUpdate := ds.DeepCopy() 1128 1129 var updateErr, getErr error 1130 for i := 0; ; i++ { 1131 if updateObservedGen { 1132 toUpdate.Status.ObservedGeneration = ds.Generation 1133 } 1134 toUpdate.Status.DesiredNumberScheduled = int32(desiredNumberScheduled) 1135 toUpdate.Status.CurrentNumberScheduled = int32(currentNumberScheduled) 1136 toUpdate.Status.NumberMisscheduled = int32(numberMisscheduled) 1137 toUpdate.Status.NumberReady = int32(numberReady) 1138 toUpdate.Status.UpdatedNumberScheduled = int32(updatedNumberScheduled) 1139 toUpdate.Status.NumberAvailable = int32(numberAvailable) 1140 toUpdate.Status.NumberUnavailable = int32(numberUnavailable) 1141 1142 if _, updateErr = dsClient.UpdateStatus(ctx, toUpdate, metav1.UpdateOptions{}); updateErr == nil { 1143 return nil 1144 } 1145 1146 // Stop retrying if we exceed statusUpdateRetries - the DaemonSet will be requeued with a rate limit. 1147 if i >= StatusUpdateRetries { 1148 break 1149 } 1150 // Update the set with the latest resource version for the next poll 1151 if toUpdate, getErr = dsClient.Get(ctx, ds.Name, metav1.GetOptions{}); getErr != nil { 1152 // If the GET fails we can't trust status.Replicas anymore. This error 1153 // is bound to be more interesting than the update failure. 1154 return getErr 1155 } 1156 } 1157 return updateErr 1158 } 1159 1160 func (dsc *DaemonSetsController) updateDaemonSetStatus(ctx context.Context, ds *apps.DaemonSet, nodeList []*v1.Node, hash string, updateObservedGen bool) error { 1161 logger := klog.FromContext(ctx) 1162 logger.V(4).Info("Updating daemon set status") 1163 nodeToDaemonPods, err := dsc.getNodesToDaemonPods(ctx, ds, false) 1164 if err != nil { 1165 return fmt.Errorf("couldn't get node to daemon pod mapping for daemon set %q: %v", ds.Name, err) 1166 } 1167 1168 var desiredNumberScheduled, currentNumberScheduled, numberMisscheduled, numberReady, updatedNumberScheduled, numberAvailable int 1169 now := dsc.failedPodsBackoff.Clock.Now() 1170 for _, node := range nodeList { 1171 shouldRun, _ := NodeShouldRunDaemonPod(node, ds) 1172 scheduled := len(nodeToDaemonPods[node.Name]) > 0 1173 1174 if shouldRun { 1175 desiredNumberScheduled++ 1176 if !scheduled { 1177 continue 1178 } 1179 1180 currentNumberScheduled++ 1181 // Sort the daemon pods by creation time, so that the oldest is first. 1182 daemonPods, _ := nodeToDaemonPods[node.Name] 1183 sort.Sort(podByCreationTimestampAndPhase(daemonPods)) 1184 pod := daemonPods[0] 1185 if podutil.IsPodReady(pod) { 1186 numberReady++ 1187 if podutil.IsPodAvailable(pod, ds.Spec.MinReadySeconds, metav1.Time{Time: now}) { 1188 numberAvailable++ 1189 } 1190 } 1191 // If the returned error is not nil we have a parse error. 1192 // The controller handles this via the hash. 1193 generation, err := util.GetTemplateGeneration(ds) 1194 if err != nil { 1195 generation = nil 1196 } 1197 if util.IsPodUpdated(pod, hash, generation) { 1198 updatedNumberScheduled++ 1199 } 1200 } else { 1201 if scheduled { 1202 numberMisscheduled++ 1203 } 1204 } 1205 } 1206 numberUnavailable := desiredNumberScheduled - numberAvailable 1207 1208 err = storeDaemonSetStatus(ctx, dsc.kubeClient.AppsV1().DaemonSets(ds.Namespace), ds, desiredNumberScheduled, currentNumberScheduled, numberMisscheduled, numberReady, updatedNumberScheduled, numberAvailable, numberUnavailable, updateObservedGen) 1209 if err != nil { 1210 return fmt.Errorf("error storing status for daemon set %#v: %w", ds, err) 1211 } 1212 1213 // Resync the DaemonSet after MinReadySeconds as a last line of defense to guard against clock-skew. 1214 if ds.Spec.MinReadySeconds > 0 && numberReady != numberAvailable { 1215 dsc.enqueueDaemonSetAfter(ds, time.Duration(ds.Spec.MinReadySeconds)*time.Second) 1216 } 1217 return nil 1218 } 1219 1220 func (dsc *DaemonSetsController) syncDaemonSet(ctx context.Context, key string) error { 1221 logger := klog.FromContext(ctx) 1222 startTime := dsc.failedPodsBackoff.Clock.Now() 1223 1224 defer func() { 1225 logger.V(4).Info("Finished syncing daemon set", "daemonset", key, "time", dsc.failedPodsBackoff.Clock.Now().Sub(startTime)) 1226 }() 1227 1228 namespace, name, err := cache.SplitMetaNamespaceKey(key) 1229 if err != nil { 1230 return err 1231 } 1232 ds, err := dsc.dsLister.DaemonSets(namespace).Get(name) 1233 if apierrors.IsNotFound(err) { 1234 logger.V(3).Info("Daemon set has been deleted", "daemonset", key) 1235 dsc.expectations.DeleteExpectations(logger, key) 1236 return nil 1237 } 1238 if err != nil { 1239 return fmt.Errorf("unable to retrieve ds %v from store: %v", key, err) 1240 } 1241 1242 nodeList, err := dsc.nodeLister.List(labels.Everything()) 1243 if err != nil { 1244 return fmt.Errorf("couldn't get list of nodes when syncing daemon set %#v: %v", ds, err) 1245 } 1246 1247 everything := metav1.LabelSelector{} 1248 if reflect.DeepEqual(ds.Spec.Selector, &everything) { 1249 dsc.eventRecorder.Eventf(ds, v1.EventTypeWarning, SelectingAllReason, "This daemon set is selecting all pods. A non-empty selector is required.") 1250 return nil 1251 } 1252 1253 // Don't process a daemon set until all its creations and deletions have been processed. 1254 // For example if daemon set foo asked for 3 new daemon pods in the previous call to manage, 1255 // then we do not want to call manage on foo until the daemon pods have been created. 1256 dsKey, err := controller.KeyFunc(ds) 1257 if err != nil { 1258 return fmt.Errorf("couldn't get key for object %#v: %v", ds, err) 1259 } 1260 1261 // If the DaemonSet is being deleted (either by foreground deletion or 1262 // orphan deletion), we cannot be sure if the DaemonSet history objects 1263 // it owned still exist -- those history objects can either be deleted 1264 // or orphaned. Garbage collector doesn't guarantee that it will delete 1265 // DaemonSet pods before deleting DaemonSet history objects, because 1266 // DaemonSet history doesn't own DaemonSet pods. We cannot reliably 1267 // calculate the status of a DaemonSet being deleted. Therefore, return 1268 // here without updating status for the DaemonSet being deleted. 1269 if ds.DeletionTimestamp != nil { 1270 return nil 1271 } 1272 1273 // Construct histories of the DaemonSet, and get the hash of current history 1274 cur, old, err := dsc.constructHistory(ctx, ds) 1275 if err != nil { 1276 return fmt.Errorf("failed to construct revisions of DaemonSet: %v", err) 1277 } 1278 hash := cur.Labels[apps.DefaultDaemonSetUniqueLabelKey] 1279 1280 if !dsc.expectations.SatisfiedExpectations(logger, dsKey) { 1281 // Only update status. Don't raise observedGeneration since controller didn't process object of that generation. 1282 return dsc.updateDaemonSetStatus(ctx, ds, nodeList, hash, false) 1283 } 1284 1285 err = dsc.updateDaemonSet(ctx, ds, nodeList, hash, dsKey, old) 1286 statusErr := dsc.updateDaemonSetStatus(ctx, ds, nodeList, hash, true) 1287 switch { 1288 case err != nil && statusErr != nil: 1289 // If there was an error, and we failed to update status, 1290 // log it and return the original error. 1291 logger.Error(statusErr, "Failed to update status", "daemonSet", klog.KObj(ds)) 1292 return err 1293 case err != nil: 1294 return err 1295 case statusErr != nil: 1296 return statusErr 1297 } 1298 1299 return nil 1300 } 1301 1302 // NodeShouldRunDaemonPod checks a set of preconditions against a (node,daemonset) and returns a 1303 // summary. Returned booleans are: 1304 // - shouldRun: 1305 // Returns true when a daemonset should run on the node if a daemonset pod is not already 1306 // running on that node. 1307 // - shouldContinueRunning: 1308 // Returns true when a daemonset should continue running on a node if a daemonset pod is already 1309 // running on that node. 1310 func NodeShouldRunDaemonPod(node *v1.Node, ds *apps.DaemonSet) (bool, bool) { 1311 pod := NewPod(ds, node.Name) 1312 1313 // If the daemon set specifies a node name, check that it matches with node.Name. 1314 if !(ds.Spec.Template.Spec.NodeName == "" || ds.Spec.Template.Spec.NodeName == node.Name) { 1315 return false, false 1316 } 1317 1318 taints := node.Spec.Taints 1319 fitsNodeName, fitsNodeAffinity, fitsTaints := predicates(pod, node, taints) 1320 if !fitsNodeName || !fitsNodeAffinity { 1321 return false, false 1322 } 1323 1324 if !fitsTaints { 1325 // Scheduled daemon pods should continue running if they tolerate NoExecute taint. 1326 _, hasUntoleratedTaint := v1helper.FindMatchingUntoleratedTaint(taints, pod.Spec.Tolerations, func(t *v1.Taint) bool { 1327 return t.Effect == v1.TaintEffectNoExecute 1328 }) 1329 return false, !hasUntoleratedTaint 1330 } 1331 1332 return true, true 1333 } 1334 1335 // predicates checks if a DaemonSet's pod can run on a node. 1336 func predicates(pod *v1.Pod, node *v1.Node, taints []v1.Taint) (fitsNodeName, fitsNodeAffinity, fitsTaints bool) { 1337 fitsNodeName = len(pod.Spec.NodeName) == 0 || pod.Spec.NodeName == node.Name 1338 // Ignore parsing errors for backwards compatibility. 1339 fitsNodeAffinity, _ = nodeaffinity.GetRequiredNodeAffinity(pod).Match(node) 1340 _, hasUntoleratedTaint := v1helper.FindMatchingUntoleratedTaint(taints, pod.Spec.Tolerations, func(t *v1.Taint) bool { 1341 return t.Effect == v1.TaintEffectNoExecute || t.Effect == v1.TaintEffectNoSchedule 1342 }) 1343 fitsTaints = !hasUntoleratedTaint 1344 return 1345 } 1346 1347 // NewPod creates a new pod 1348 func NewPod(ds *apps.DaemonSet, nodeName string) *v1.Pod { 1349 newPod := &v1.Pod{Spec: ds.Spec.Template.Spec, ObjectMeta: ds.Spec.Template.ObjectMeta} 1350 newPod.Namespace = ds.Namespace 1351 newPod.Spec.NodeName = nodeName 1352 1353 // Added default tolerations for DaemonSet pods. 1354 util.AddOrUpdateDaemonPodTolerations(&newPod.Spec) 1355 1356 return newPod 1357 } 1358 1359 type podByCreationTimestampAndPhase []*v1.Pod 1360 1361 func (o podByCreationTimestampAndPhase) Len() int { return len(o) } 1362 func (o podByCreationTimestampAndPhase) Swap(i, j int) { o[i], o[j] = o[j], o[i] } 1363 1364 func (o podByCreationTimestampAndPhase) Less(i, j int) bool { 1365 // Scheduled Pod first 1366 if len(o[i].Spec.NodeName) != 0 && len(o[j].Spec.NodeName) == 0 { 1367 return true 1368 } 1369 1370 if len(o[i].Spec.NodeName) == 0 && len(o[j].Spec.NodeName) != 0 { 1371 return false 1372 } 1373 1374 if o[i].CreationTimestamp.Equal(&o[j].CreationTimestamp) { 1375 return o[i].Name < o[j].Name 1376 } 1377 return o[i].CreationTimestamp.Before(&o[j].CreationTimestamp) 1378 } 1379 1380 func failedPodsBackoffKey(ds *apps.DaemonSet, nodeName string) string { 1381 return fmt.Sprintf("%s/%d/%s", ds.UID, ds.Status.ObservedGeneration, nodeName) 1382 } 1383 1384 // getUnscheduledPodsWithoutNode returns list of unscheduled pods assigned to not existing nodes. 1385 // Returned pods can't be deleted by PodGCController so they should be deleted by DaemonSetController. 1386 func getUnscheduledPodsWithoutNode(runningNodesList []*v1.Node, nodeToDaemonPods map[string][]*v1.Pod) []string { 1387 var results []string 1388 isNodeRunning := make(map[string]bool, len(runningNodesList)) 1389 for _, node := range runningNodesList { 1390 isNodeRunning[node.Name] = true 1391 } 1392 1393 for n, pods := range nodeToDaemonPods { 1394 if isNodeRunning[n] { 1395 continue 1396 } 1397 for _, pod := range pods { 1398 if len(pod.Spec.NodeName) == 0 { 1399 results = append(results, pod.Name) 1400 } 1401 } 1402 } 1403 1404 return results 1405 }