k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/replicaset/replica_set.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// ### ATTENTION ###
//
// This code implements both ReplicaSet and ReplicationController.
//
// For RC, the objects are converted on the way in and out (see ../replication/),
// as if ReplicationController were just an older API version of ReplicaSet.
// However, RC and RS still have separate storage and separate instantiations
// of the ReplicaSetController object.
//
// Use rsc.Kind in log messages rather than hard-coding "ReplicaSet".

package replicaset

import (
	"context"
	"fmt"
	"reflect"
	"sort"
	"strings"
	"sync"
	"time"

	apps "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	appsinformers "k8s.io/client-go/informers/apps/v1"
	coreinformers "k8s.io/client-go/informers/core/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	appslisters "k8s.io/client-go/listers/apps/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/component-base/metrics/legacyregistry"
	"k8s.io/klog/v2"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/controller"
	"k8s.io/kubernetes/pkg/controller/replicaset/metrics"
)

const (
	// Realistic value of the burstReplica field for the replica set manager based off
	// performance requirements for kubernetes 1.0.
	BurstReplicas = 500

	// The number of times we retry updating a ReplicaSet's status.
	statusUpdateRetries = 1

	// controllerUIDIndex is the name for the ReplicaSet store's index function,
	// which is to index by ReplicaSet's controllerUID.
	controllerUIDIndex = "controllerUID"
)

// ReplicaSetController is responsible for synchronizing ReplicaSet objects stored
// in the system with actual running pods.
type ReplicaSetController struct {
	// GroupVersionKind indicates the controller type.
	// Different instances of this struct may handle different GVKs.
	// For example, this struct can be used (with adapters) to handle ReplicationController.
	schema.GroupVersionKind

	kubeClient clientset.Interface
	podControl controller.PodControlInterface

	eventBroadcaster record.EventBroadcaster

	// A ReplicaSet is temporarily suspended after creating/deleting this many replicas.
	// It resumes normal action after observing the watch events for them.
	burstReplicas int
	// To allow injection of syncReplicaSet for testing.
	syncHandler func(ctx context.Context, rsKey string) error

	// A TTLCache of pod creates/deletes each rc expects to see.
	expectations *controller.UIDTrackingControllerExpectations

	// A store of ReplicaSets, populated by the shared informer passed to NewReplicaSetController
	rsLister appslisters.ReplicaSetLister
	// rsListerSynced returns true if the ReplicaSet store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	rsListerSynced cache.InformerSynced
	rsIndexer      cache.Indexer

	// A store of pods, populated by the shared informer passed to NewReplicaSetController
	podLister corelisters.PodLister
	// podListerSynced returns true if the pod store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	podListerSynced cache.InformerSynced

	// Controllers that need to be synced
	queue workqueue.TypedRateLimitingInterface[string]
}

// NewReplicaSetController configures a replica set controller with the specified event recorder
func NewReplicaSetController(ctx context.Context, rsInformer appsinformers.ReplicaSetInformer, podInformer coreinformers.PodInformer, kubeClient clientset.Interface, burstReplicas int) *ReplicaSetController {
	logger := klog.FromContext(ctx)
	eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx))
	if err := metrics.Register(legacyregistry.Register); err != nil {
		logger.Error(err, "unable to register metrics")
	}
	return NewBaseController(logger, rsInformer, podInformer, kubeClient, burstReplicas,
		apps.SchemeGroupVersion.WithKind("ReplicaSet"),
		"replicaset_controller",
		"replicaset",
		controller.RealPodControl{
			KubeClient: kubeClient,
			Recorder:   eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "replicaset-controller"}),
		},
		eventBroadcaster,
	)
}

// NewBaseController is the implementation of NewReplicaSetController with additional injected
// parameters so that it can also serve as the implementation of NewReplicationController.
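// NewBaseController itself starts no goroutines; the caller must start the informers
// and then call Run to begin processing.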
func NewBaseController(logger klog.Logger, rsInformer appsinformers.ReplicaSetInformer, podInformer coreinformers.PodInformer, kubeClient clientset.Interface, burstReplicas int,
	gvk schema.GroupVersionKind, metricOwnerName, queueName string, podControl controller.PodControlInterface, eventBroadcaster record.EventBroadcaster) *ReplicaSetController {

	rsc := &ReplicaSetController{
		GroupVersionKind: gvk,
		kubeClient:       kubeClient,
		podControl:       podControl,
		eventBroadcaster: eventBroadcaster,
		burstReplicas:    burstReplicas,
		expectations:     controller.NewUIDTrackingControllerExpectations(controller.NewControllerExpectations()),
		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
			workqueue.DefaultTypedControllerRateLimiter[string](),
			workqueue.TypedRateLimitingQueueConfig[string]{Name: queueName},
		),
	}

	rsInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			rsc.addRS(logger, obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			rsc.updateRS(logger, oldObj, newObj)
		},
		DeleteFunc: func(obj interface{}) {
			rsc.deleteRS(logger, obj)
		},
	})
	rsInformer.Informer().AddIndexers(cache.Indexers{
		controllerUIDIndex: func(obj interface{}) ([]string, error) {
			rs, ok := obj.(*apps.ReplicaSet)
			if !ok {
				return []string{}, nil
			}
			controllerRef := metav1.GetControllerOf(rs)
			if controllerRef == nil {
				return []string{}, nil
			}
			return []string{string(controllerRef.UID)}, nil
		},
	})
	rsc.rsIndexer = rsInformer.Informer().GetIndexer()
	rsc.rsLister = rsInformer.Lister()
	rsc.rsListerSynced = rsInformer.Informer().HasSynced

	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			rsc.addPod(logger, obj)
		},
		// This invokes the ReplicaSet for every pod change, eg: host assignment. Though this might seem like
		// overkill, the most frequent pod update is status, and the associated ReplicaSet will only list from
		// local storage, so it should be ok.
		UpdateFunc: func(oldObj, newObj interface{}) {
			rsc.updatePod(logger, oldObj, newObj)
		},
		DeleteFunc: func(obj interface{}) {
			rsc.deletePod(logger, obj)
		},
	})
	rsc.podLister = podInformer.Lister()
	rsc.podListerSynced = podInformer.Informer().HasSynced

	rsc.syncHandler = rsc.syncReplicaSet

	return rsc
}

// Run begins watching and syncing.
func (rsc *ReplicaSetController) Run(ctx context.Context, workers int) {
	defer utilruntime.HandleCrash()

	// Start events processing pipeline.
	rsc.eventBroadcaster.StartStructuredLogging(3)
	rsc.eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: rsc.kubeClient.CoreV1().Events("")})
	defer rsc.eventBroadcaster.Shutdown()

	defer rsc.queue.ShutDown()

	controllerName := strings.ToLower(rsc.Kind)
	logger := klog.FromContext(ctx)
	logger.Info("Starting controller", "name", controllerName)
	defer logger.Info("Shutting down controller", "name", controllerName)

	if !cache.WaitForNamedCacheSync(rsc.Kind, ctx.Done(), rsc.podListerSynced, rsc.rsListerSynced) {
		return
	}

	for i := 0; i < workers; i++ {
		go wait.UntilWithContext(ctx, rsc.worker, time.Second)
	}

	<-ctx.Done()
}

// getReplicaSetsWithSameController returns a list of ReplicaSets with the same
// owner as the given ReplicaSet.
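// It looks the owner up via the controllerUID index registered on the ReplicaSet
// informer in NewBaseController.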
func (rsc *ReplicaSetController) getReplicaSetsWithSameController(logger klog.Logger, rs *apps.ReplicaSet) []*apps.ReplicaSet {
	controllerRef := metav1.GetControllerOf(rs)
	if controllerRef == nil {
		utilruntime.HandleError(fmt.Errorf("ReplicaSet has no controller: %v", rs))
		return nil
	}

	objects, err := rsc.rsIndexer.ByIndex(controllerUIDIndex, string(controllerRef.UID))
	if err != nil {
		utilruntime.HandleError(err)
		return nil
	}
	relatedRSs := make([]*apps.ReplicaSet, 0, len(objects))
	for _, obj := range objects {
		relatedRSs = append(relatedRSs, obj.(*apps.ReplicaSet))
	}

	if klogV := logger.V(2); klogV.Enabled() {
		klogV.Info("Found related ReplicaSets", "replicaSet", klog.KObj(rs), "relatedReplicaSets", klog.KObjSlice(relatedRSs))
	}

	return relatedRSs
}

// getPodReplicaSets returns a list of ReplicaSets matching the given pod.
func (rsc *ReplicaSetController) getPodReplicaSets(pod *v1.Pod) []*apps.ReplicaSet {
	rss, err := rsc.rsLister.GetPodReplicaSets(pod)
	if err != nil {
		return nil
	}
	if len(rss) > 1 {
		// ControllerRef will ensure we don't do anything crazy, but more than one
		// item in this list nevertheless constitutes user error.
		utilruntime.HandleError(fmt.Errorf("user error! more than one %v is selecting pods with labels: %+v", rsc.Kind, pod.Labels))
	}
	return rss
}

// resolveControllerRef returns the controller referenced by a ControllerRef,
// or nil if the ControllerRef could not be resolved to a matching controller
// of the correct Kind.
func (rsc *ReplicaSetController) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *apps.ReplicaSet {
	// We can't look up by UID, so look up by Name and then verify UID.
	// Don't even try to look up by Name if it's the wrong Kind.
	if controllerRef.Kind != rsc.Kind {
		return nil
	}
	rs, err := rsc.rsLister.ReplicaSets(namespace).Get(controllerRef.Name)
	if err != nil {
		return nil
	}
	if rs.UID != controllerRef.UID {
		// The controller we found with this Name is not the same one that the
		// ControllerRef points to.
		return nil
	}
	return rs
}

func (rsc *ReplicaSetController) enqueueRS(rs *apps.ReplicaSet) {
	key, err := controller.KeyFunc(rs)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", rs, err))
		return
	}

	rsc.queue.Add(key)
}

func (rsc *ReplicaSetController) enqueueRSAfter(rs *apps.ReplicaSet, duration time.Duration) {
	key, err := controller.KeyFunc(rs)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", rs, err))
		return
	}

	rsc.queue.AddAfter(key, duration)
}

func (rsc *ReplicaSetController) addRS(logger klog.Logger, obj interface{}) {
	rs := obj.(*apps.ReplicaSet)
	logger.V(4).Info("Adding", "replicaSet", klog.KObj(rs))
	rsc.enqueueRS(rs)
}

// callback when RS is updated
func (rsc *ReplicaSetController) updateRS(logger klog.Logger, old, cur interface{}) {
	oldRS := old.(*apps.ReplicaSet)
	curRS := cur.(*apps.ReplicaSet)

	// TODO: make a KEP and fix informers to always call the delete event handler on re-create
	if curRS.UID != oldRS.UID {
		key, err := controller.KeyFunc(oldRS)
		if err != nil {
			utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", oldRS, err))
			return
		}
		rsc.deleteRS(logger, cache.DeletedFinalStateUnknown{
			Key: key,
			Obj: oldRS,
		})
	}

	// You might imagine that we only really need to enqueue the
	// replica set when Spec changes, but it is safer to sync any
	// time this function is triggered. That way a full informer
	// resync can requeue any replica sets that don't yet have pods
	// but whose last attempts at creating a pod have failed (since
	// we don't block on creation of pods) instead of those
	// replica sets stalling indefinitely. Enqueueing every time
	// does result in some spurious syncs (like when Status.Replicas
	// is updated and the watch notification from it retriggers
	// this function), but in general extra resyncs shouldn't be
	// that bad as ReplicaSets that haven't met expectations yet won't
	// sync, and all the listing is done using local stores.
	if *(oldRS.Spec.Replicas) != *(curRS.Spec.Replicas) {
		logger.V(4).Info("replicaSet updated. Desired pod count change.", "replicaSet", klog.KObj(oldRS), "oldReplicas", *(oldRS.Spec.Replicas), "newReplicas", *(curRS.Spec.Replicas))
	}
	rsc.enqueueRS(curRS)
}

func (rsc *ReplicaSetController) deleteRS(logger klog.Logger, obj interface{}) {
	rs, ok := obj.(*apps.ReplicaSet)
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %#v", obj))
			return
		}
		rs, ok = tombstone.Obj.(*apps.ReplicaSet)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a ReplicaSet %#v", obj))
			return
		}
	}

	key, err := controller.KeyFunc(rs)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", rs, err))
		return
	}

	logger.V(4).Info("Deleting", "replicaSet", klog.KObj(rs))

	// Delete expectations for the ReplicaSet so if we create a new one with the same name it starts clean
	rsc.expectations.DeleteExpectations(logger, key)

	rsc.queue.Add(key)
}

// When a pod is created, enqueue the replica set that manages it and update its expectations.
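// Observing the creation here decrements the ReplicaSet's creation expectations,
// which is what allows its next sync to proceed once all expected pods have been seen.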
func (rsc *ReplicaSetController) addPod(logger klog.Logger, obj interface{}) {
	pod := obj.(*v1.Pod)

	if pod.DeletionTimestamp != nil {
		// on a restart of the controller manager, it's possible a new pod shows up in a state that
		// is already pending deletion. Prevent the pod from being a creation observation.
		rsc.deletePod(logger, pod)
		return
	}

	// If it has a ControllerRef, that's all that matters.
	if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil {
		rs := rsc.resolveControllerRef(pod.Namespace, controllerRef)
		if rs == nil {
			return
		}
		rsKey, err := controller.KeyFunc(rs)
		if err != nil {
			return
		}
		logger.V(4).Info("Pod created", "pod", klog.KObj(pod), "detail", pod)
		rsc.expectations.CreationObserved(logger, rsKey)
		rsc.queue.Add(rsKey)
		return
	}

	// Otherwise, it's an orphan. Get a list of all matching ReplicaSets and sync
	// them to see if anyone wants to adopt it.
	// DO NOT observe creation because no controller should be waiting for an
	// orphan.
	rss := rsc.getPodReplicaSets(pod)
	if len(rss) == 0 {
		return
	}
	logger.V(4).Info("Orphan Pod created", "pod", klog.KObj(pod), "detail", pod)
	for _, rs := range rss {
		rsc.enqueueRS(rs)
	}
}

// When a pod is updated, figure out what replica set/s manage it and wake them
// up. If the labels of the pod have changed we need to awaken both the old
// and new replica set. old and cur must be *v1.Pod types.
func (rsc *ReplicaSetController) updatePod(logger klog.Logger, old, cur interface{}) {
	curPod := cur.(*v1.Pod)
	oldPod := old.(*v1.Pod)
	if curPod.ResourceVersion == oldPod.ResourceVersion {
		// Periodic resync will send update events for all known pods.
		// Two different versions of the same pod will always have different RVs.
		return
	}

	labelChanged := !reflect.DeepEqual(curPod.Labels, oldPod.Labels)
	if curPod.DeletionTimestamp != nil {
		// when a pod is deleted gracefully its deletion timestamp is first modified to reflect a grace period,
		// and after such time has passed, the kubelet actually deletes it from the store. We receive an update
		// for modification of the deletion timestamp and expect an rs to create more replicas asap, not wait
		// until the kubelet actually deletes the pod. This is different from the Phase of a pod changing, because
		// an rs never initiates a phase change, and so is never asleep waiting for the same.
		rsc.deletePod(logger, curPod)
		if labelChanged {
			// we don't need to check the oldPod.DeletionTimestamp because DeletionTimestamp cannot be unset.
			rsc.deletePod(logger, oldPod)
		}
		return
	}

	curControllerRef := metav1.GetControllerOf(curPod)
	oldControllerRef := metav1.GetControllerOf(oldPod)
	controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
	if controllerRefChanged && oldControllerRef != nil {
		// The ControllerRef was changed. Sync the old controller, if any.
		if rs := rsc.resolveControllerRef(oldPod.Namespace, oldControllerRef); rs != nil {
			rsc.enqueueRS(rs)
		}
	}

	// If it has a ControllerRef, that's all that matters.
	if curControllerRef != nil {
		rs := rsc.resolveControllerRef(curPod.Namespace, curControllerRef)
		if rs == nil {
			return
		}
		logger.V(4).Info("Pod objectMeta updated.", "pod", klog.KObj(oldPod), "oldObjectMeta", oldPod.ObjectMeta, "curObjectMeta", curPod.ObjectMeta)
		rsc.enqueueRS(rs)
		// TODO: MinReadySeconds in the Pod will generate an Available condition to be added in
		// the Pod status which in turn will trigger a requeue of the owning replica set thus
		// having its status updated with the newly available replica. For now, we can fake the
		// update by resyncing the controller MinReadySeconds after it is requeued because
		// a Pod transitioned to Ready.
		// Note that this still suffers from #29229, we are just moving the problem one level
		// "closer" to kubelet (from the deployment to the replica set controller).
		if !podutil.IsPodReady(oldPod) && podutil.IsPodReady(curPod) && rs.Spec.MinReadySeconds > 0 {
			logger.V(2).Info("pod will be enqueued after a while for availability check", "duration", rs.Spec.MinReadySeconds, "kind", rsc.Kind, "pod", klog.KObj(oldPod))
			// Add a second to avoid milliseconds skew in AddAfter.
			// See https://github.com/kubernetes/kubernetes/issues/39785#issuecomment-279959133 for more info.
			rsc.enqueueRSAfter(rs, (time.Duration(rs.Spec.MinReadySeconds)*time.Second)+time.Second)
		}
		return
	}

	// Otherwise, it's an orphan. If anything changed, sync matching controllers
	// to see if anyone wants to adopt it now.
	if labelChanged || controllerRefChanged {
		rss := rsc.getPodReplicaSets(curPod)
		if len(rss) == 0 {
			return
		}
		logger.V(4).Info("Orphan Pod objectMeta updated.", "pod", klog.KObj(oldPod), "oldObjectMeta", oldPod.ObjectMeta, "curObjectMeta", curPod.ObjectMeta)
		for _, rs := range rss {
			rsc.enqueueRS(rs)
		}
	}
}

// When a pod is deleted, enqueue the replica set that manages the pod and update its expectations.
// obj could be a *v1.Pod, or a DeletionFinalStateUnknown marker item.
func (rsc *ReplicaSetController) deletePod(logger klog.Logger, obj interface{}) {
	pod, ok := obj.(*v1.Pod)

	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the pod
	// changed labels the new ReplicaSet will not be woken up till the periodic resync.
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %+v", obj))
			return
		}
		pod, ok = tombstone.Obj.(*v1.Pod)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %#v", obj))
			return
		}
	}

	controllerRef := metav1.GetControllerOf(pod)
	if controllerRef == nil {
		// No controller should care about orphans being deleted.
		return
	}
	rs := rsc.resolveControllerRef(pod.Namespace, controllerRef)
	if rs == nil {
		return
	}
	rsKey, err := controller.KeyFunc(rs)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", rs, err))
		return
	}
	logger.V(4).Info("Pod deleted", "delete_by", utilruntime.GetCaller(), "deletion_timestamp", pod.DeletionTimestamp, "pod", klog.KObj(pod))
	rsc.expectations.DeletionObserved(logger, rsKey, controller.PodKey(pod))
	rsc.queue.Add(rsKey)
}

// worker runs a worker thread that just dequeues items, processes them, and marks them done.
// It enforces that the syncHandler is never invoked concurrently with the same key.
func (rsc *ReplicaSetController) worker(ctx context.Context) {
	for rsc.processNextWorkItem(ctx) {
	}
}

func (rsc *ReplicaSetController) processNextWorkItem(ctx context.Context) bool {
	key, quit := rsc.queue.Get()
	if quit {
		return false
	}
	defer rsc.queue.Done(key)

	err := rsc.syncHandler(ctx, key)
	if err == nil {
		rsc.queue.Forget(key)
		return true
	}

	utilruntime.HandleError(fmt.Errorf("sync %q failed with %v", key, err))
	rsc.queue.AddRateLimited(key)

	return true
}

// manageReplicas checks and updates replicas for the given ReplicaSet.
// Does NOT modify <filteredPods>.
// It will requeue the replica set in case of an error while creating/deleting pods.
func (rsc *ReplicaSetController) manageReplicas(ctx context.Context, filteredPods []*v1.Pod, rs *apps.ReplicaSet) error {
	diff := len(filteredPods) - int(*(rs.Spec.Replicas))
	rsKey, err := controller.KeyFunc(rs)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for %v %#v: %v", rsc.Kind, rs, err))
		return nil
	}
	logger := klog.FromContext(ctx)
	if diff < 0 {
		diff *= -1
		if diff > rsc.burstReplicas {
			diff = rsc.burstReplicas
		}
		// TODO: Track UIDs of creates just like deletes. The problem currently
		// is we'd need to wait on the result of a create to record the pod's
		// UID, which would require locking *across* the create, which will turn
		// into a performance bottleneck. We should generate a UID for the pod
		// beforehand and store it via ExpectCreations.
		rsc.expectations.ExpectCreations(logger, rsKey, diff)
		logger.V(2).Info("Too few replicas", "replicaSet", klog.KObj(rs), "need", *(rs.Spec.Replicas), "creating", diff)
		// Batch the pod creates. Batch sizes start at SlowStartInitialBatchSize
		// and double with each successful iteration in a kind of "slow start".
		// This handles attempts to start large numbers of pods that would
		// likely all fail with the same error. For example a project with a
		// low quota that attempts to create a large number of pods will be
		// prevented from spamming the API service with the pod create requests
		// after one of its pods fails. Conveniently, this also prevents the
		// event spam that those failures would generate.
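		// slowStartBatch reports how many create calls actually succeeded so that the
		// expectations for any skipped pods can be rolled back below.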
		successfulCreations, err := slowStartBatch(diff, controller.SlowStartInitialBatchSize, func() error {
			err := rsc.podControl.CreatePods(ctx, rs.Namespace, &rs.Spec.Template, rs, metav1.NewControllerRef(rs, rsc.GroupVersionKind))
			if err != nil {
				if apierrors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
					// if the namespace is being terminated, we don't have to do
					// anything because any creation will fail
					return nil
				}
			}
			return err
		})

		// Any skipped pods that we never attempted to start shouldn't be expected.
		// The skipped pods will be retried later. The next controller resync will
		// retry the slow start process.
		if skippedPods := diff - successfulCreations; skippedPods > 0 {
			logger.V(2).Info("Slow-start failure. Skipping creation of pods, decrementing expectations", "podsSkipped", skippedPods, "kind", rsc.Kind, "replicaSet", klog.KObj(rs))
			for i := 0; i < skippedPods; i++ {
				// Decrement the expected number of creates because the informer won't observe this pod
				rsc.expectations.CreationObserved(logger, rsKey)
			}
		}
		return err
	} else if diff > 0 {
		if diff > rsc.burstReplicas {
			diff = rsc.burstReplicas
		}
		logger.V(2).Info("Too many replicas", "replicaSet", klog.KObj(rs), "need", *(rs.Spec.Replicas), "deleting", diff)

		relatedPods, err := rsc.getIndirectlyRelatedPods(logger, rs)
		utilruntime.HandleError(err)

		// Choose which Pods to delete, preferring those in earlier phases of startup.
		podsToDelete := getPodsToDelete(filteredPods, relatedPods, diff)

		// Snapshot the UIDs (ns/name) of the pods we're expecting to see
		// deleted, so we know to record their expectations exactly once either
		// when we see it as an update of the deletion timestamp, or as a delete.
		// Note that if the labels on a pod/rs change in a way that the pod gets
		// orphaned, the rs will only wake up after the expectations have
		// expired even if other pods are deleted.
		rsc.expectations.ExpectDeletions(logger, rsKey, getPodKeys(podsToDelete))

		errCh := make(chan error, diff)
		var wg sync.WaitGroup
		wg.Add(diff)
		for _, pod := range podsToDelete {
			go func(targetPod *v1.Pod) {
				defer wg.Done()
				if err := rsc.podControl.DeletePod(ctx, rs.Namespace, targetPod.Name, rs); err != nil {
					// Decrement the expected number of deletes because the informer won't observe this deletion
					podKey := controller.PodKey(targetPod)
					rsc.expectations.DeletionObserved(logger, rsKey, podKey)
					if !apierrors.IsNotFound(err) {
						logger.V(2).Info("Failed to delete pod, decremented expectations", "pod", podKey, "kind", rsc.Kind, "replicaSet", klog.KObj(rs))
						errCh <- err
					}
				}
			}(pod)
		}
		wg.Wait()

		select {
		case err := <-errCh:
			// all errors have been reported before and they're likely to be the same, so we'll only return the first one we hit.
			if err != nil {
				return err
			}
		default:
		}
	}

	return nil
}

// syncReplicaSet will sync the ReplicaSet with the given key if it has had its expectations fulfilled,
// meaning it did not expect to see any more of its pods created or deleted. This function is not meant to be
// invoked concurrently with the same key.
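// The sync lists pods in the ReplicaSet's namespace, claims the active ones that match
// its selector, reconciles the replica count via manageReplicas, and finally updates
// the ReplicaSet's status.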
func (rsc *ReplicaSetController) syncReplicaSet(ctx context.Context, key string) error {
	logger := klog.FromContext(ctx)
	startTime := time.Now()
	defer func() {
		logger.Info("Finished syncing", "kind", rsc.Kind, "key", key, "duration", time.Since(startTime))
	}()

	namespace, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		return err
	}
	rs, err := rsc.rsLister.ReplicaSets(namespace).Get(name)
	if apierrors.IsNotFound(err) {
		logger.V(4).Info("deleted", "kind", rsc.Kind, "key", key)
		rsc.expectations.DeleteExpectations(logger, key)
		return nil
	}
	if err != nil {
		return err
	}

	rsNeedsSync := rsc.expectations.SatisfiedExpectations(logger, key)
	selector, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("error converting pod selector to selector for rs %v/%v: %v", namespace, name, err))
		return nil
	}

	// list all pods to include the pods that don't match the rs's selector
	// anymore but have a stale controller ref.
	// TODO: Do the List and Filter in a single pass, or use an index.
	allPods, err := rsc.podLister.Pods(rs.Namespace).List(labels.Everything())
	if err != nil {
		return err
	}
	// Ignore inactive pods.
	filteredPods := controller.FilterActivePods(logger, allPods)

	// NOTE: filteredPods are pointing to objects from cache - if you need to
	// modify them, you need to copy it first.
	filteredPods, err = rsc.claimPods(ctx, rs, selector, filteredPods)
	if err != nil {
		return err
	}

	var manageReplicasErr error
	if rsNeedsSync && rs.DeletionTimestamp == nil {
		manageReplicasErr = rsc.manageReplicas(ctx, filteredPods, rs)
	}
	rs = rs.DeepCopy()
	newStatus := calculateStatus(rs, filteredPods, manageReplicasErr)

	// Always updates status as pods come up or die.
	updatedRS, err := updateReplicaSetStatus(logger, rsc.kubeClient.AppsV1().ReplicaSets(rs.Namespace), rs, newStatus)
	if err != nil {
		// Multiple things could lead to this update failing. Returning an error
		// causes a requeue without forcing a hotloop.
		return err
	}
	// Resync the ReplicaSet after MinReadySeconds as a last line of defense to guard against clock-skew.
	if manageReplicasErr == nil && updatedRS.Spec.MinReadySeconds > 0 &&
		updatedRS.Status.ReadyReplicas == *(updatedRS.Spec.Replicas) &&
		updatedRS.Status.AvailableReplicas != *(updatedRS.Spec.Replicas) {
		rsc.queue.AddAfter(key, time.Duration(updatedRS.Spec.MinReadySeconds)*time.Second)
	}
	return manageReplicasErr
}

func (rsc *ReplicaSetController) claimPods(ctx context.Context, rs *apps.ReplicaSet, selector labels.Selector, filteredPods []*v1.Pod) ([]*v1.Pod, error) {
	// If any adoptions are attempted, we should first recheck for deletion with
	// an uncached quorum read sometime after listing Pods (see #42639).
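	// canAdoptFunc re-reads the ReplicaSet from the API server and checks its UID, so pods
	// are never adopted on behalf of a ReplicaSet that has been deleted (and possibly
	// recreated) in the meantime.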
	canAdoptFunc := controller.RecheckDeletionTimestamp(func(ctx context.Context) (metav1.Object, error) {
		fresh, err := rsc.kubeClient.AppsV1().ReplicaSets(rs.Namespace).Get(ctx, rs.Name, metav1.GetOptions{})
		if err != nil {
			return nil, err
		}
		if fresh.UID != rs.UID {
			return nil, fmt.Errorf("original %v %v/%v is gone: got uid %v, wanted %v", rsc.Kind, rs.Namespace, rs.Name, fresh.UID, rs.UID)
		}
		return fresh, nil
	})
	cm := controller.NewPodControllerRefManager(rsc.podControl, rs, selector, rsc.GroupVersionKind, canAdoptFunc)
	return cm.ClaimPods(ctx, filteredPods)
}

// slowStartBatch tries to call the provided function a total of 'count' times,
// starting slow to check for errors, then speeding up if calls succeed.
//
// It groups the calls into batches, starting with a group of initialBatchSize.
// Within each batch, it may call the function multiple times concurrently.
//
// If a whole batch succeeds, the next batch may get exponentially larger.
// If there are any failures in a batch, all remaining batches are skipped
// after waiting for the current batch to complete.
//
// It returns the number of successful calls to the function.
func slowStartBatch(count int, initialBatchSize int, fn func() error) (int, error) {
	remaining := count
	successes := 0
	for batchSize := min(remaining, initialBatchSize); batchSize > 0; batchSize = min(2*batchSize, remaining) {
		errCh := make(chan error, batchSize)
		var wg sync.WaitGroup
		wg.Add(batchSize)
		for i := 0; i < batchSize; i++ {
			go func() {
				defer wg.Done()
				if err := fn(); err != nil {
					errCh <- err
				}
			}()
		}
		wg.Wait()
		curSuccesses := batchSize - len(errCh)
		successes += curSuccesses
		if len(errCh) > 0 {
			return successes, <-errCh
		}
		remaining -= batchSize
	}
	return successes, nil
}

// getIndirectlyRelatedPods returns all pods that are owned by any ReplicaSet
// that is owned by the given ReplicaSet's owner.
func (rsc *ReplicaSetController) getIndirectlyRelatedPods(logger klog.Logger, rs *apps.ReplicaSet) ([]*v1.Pod, error) {
	var relatedPods []*v1.Pod
	seen := make(map[types.UID]*apps.ReplicaSet)
	for _, relatedRS := range rsc.getReplicaSetsWithSameController(logger, rs) {
		selector, err := metav1.LabelSelectorAsSelector(relatedRS.Spec.Selector)
		if err != nil {
			// This object has an invalid selector, it does not match any pods
			continue
		}
		pods, err := rsc.podLister.Pods(relatedRS.Namespace).List(selector)
		if err != nil {
			return nil, err
		}
		for _, pod := range pods {
			if otherRS, found := seen[pod.UID]; found {
				logger.V(5).Info("Pod is owned by both", "pod", klog.KObj(pod), "kind", rsc.Kind, "replicaSets", klog.KObjSlice([]klog.KMetadata{otherRS, relatedRS}))
				continue
			}
			seen[pod.UID] = relatedRS
			relatedPods = append(relatedPods, pod)
		}
	}
	logger.V(4).Info("Found related pods", "kind", rsc.Kind, "replicaSet", klog.KObj(rs), "pods", klog.KObjSlice(relatedPods))
	return relatedPods, nil
}

func getPodsToDelete(filteredPods, relatedPods []*v1.Pod, diff int) []*v1.Pod {
	// No need to sort pods if we are about to delete all of them.
	// diff will always be <= len(filteredPods), so no need to handle the > case.
	if diff < len(filteredPods) {
		podsWithRanks := getPodsRankedByRelatedPodsOnSameNode(filteredPods, relatedPods)
		sort.Sort(podsWithRanks)
		reportSortingDeletionAgeRatioMetric(filteredPods, diff)
	}
	return filteredPods[:diff]
}

func reportSortingDeletionAgeRatioMetric(filteredPods []*v1.Pod, diff int) {
	now := time.Now()
	youngestTime := time.Time{}
	// first we need to check all of the ready pods to get the youngest, as they may not necessarily be sorted by timestamp alone
	for _, pod := range filteredPods {
		if pod.CreationTimestamp.Time.After(youngestTime) && podutil.IsPodReady(pod) {
			youngestTime = pod.CreationTimestamp.Time
		}
	}

	// for each pod chosen for deletion, report the ratio of its age to the youngest pod's age
	for _, pod := range filteredPods[:diff] {
		if !podutil.IsPodReady(pod) {
			continue
		}
		// divide as floats so the ratio keeps its fractional part
		ratio := float64(now.Sub(pod.CreationTimestamp.Time).Milliseconds()) / float64(now.Sub(youngestTime).Milliseconds())
		metrics.SortingDeletionAgeRatio.Observe(ratio)
	}
}

// getPodsRankedByRelatedPodsOnSameNode returns an ActivePodsWithRanks value
// that wraps podsToRank and assigns each pod a rank equal to the number of
// active pods in relatedPods that are colocated on the same node with the pod.
// relatedPods generally should be a superset of podsToRank.
func getPodsRankedByRelatedPodsOnSameNode(podsToRank, relatedPods []*v1.Pod) controller.ActivePodsWithRanks {
	podsOnNode := make(map[string]int)
	for _, pod := range relatedPods {
		if controller.IsPodActive(pod) {
			podsOnNode[pod.Spec.NodeName]++
		}
	}
	ranks := make([]int, len(podsToRank))
	for i, pod := range podsToRank {
		ranks[i] = podsOnNode[pod.Spec.NodeName]
	}
	return controller.ActivePodsWithRanks{Pods: podsToRank, Rank: ranks, Now: metav1.Now()}
}

func getPodKeys(pods []*v1.Pod) []string {
	podKeys := make([]string, 0, len(pods))
	for _, pod := range pods {
		podKeys = append(podKeys, controller.PodKey(pod))
	}
	return podKeys
}
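// A minimal wiring sketch (illustrative only, not part of this package's API): assuming a
// client-go clientset and a shared informer factory, a caller such as the controller
// manager constructs and runs this controller roughly as follows. The names "clientset",
// "factory", and the worker count are placeholders.
//
//	factory := informers.NewSharedInformerFactory(clientset, 0)
//	rsc := replicaset.NewReplicaSetController(
//		ctx,
//		factory.Apps().V1().ReplicaSets(),
//		factory.Core().V1().Pods(),
//		clientset,
//		replicaset.BurstReplicas,
//	)
//	factory.Start(ctx.Done())
//	go rsc.Run(ctx, 5)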