k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/statefulset/stateful_set.go (about) 1 /* 2 Copyright 2016 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package statefulset 18 19 import ( 20 "context" 21 "fmt" 22 "reflect" 23 "time" 24 25 apps "k8s.io/api/apps/v1" 26 v1 "k8s.io/api/core/v1" 27 "k8s.io/apimachinery/pkg/api/errors" 28 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 "k8s.io/apimachinery/pkg/labels" 30 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 31 "k8s.io/apimachinery/pkg/util/wait" 32 appsinformers "k8s.io/client-go/informers/apps/v1" 33 coreinformers "k8s.io/client-go/informers/core/v1" 34 clientset "k8s.io/client-go/kubernetes" 35 "k8s.io/client-go/kubernetes/scheme" 36 v1core "k8s.io/client-go/kubernetes/typed/core/v1" 37 appslisters "k8s.io/client-go/listers/apps/v1" 38 corelisters "k8s.io/client-go/listers/core/v1" 39 "k8s.io/client-go/tools/cache" 40 "k8s.io/client-go/tools/record" 41 "k8s.io/client-go/util/workqueue" 42 podutil "k8s.io/kubernetes/pkg/api/v1/pod" 43 "k8s.io/kubernetes/pkg/controller" 44 "k8s.io/kubernetes/pkg/controller/history" 45 46 "k8s.io/klog/v2" 47 ) 48 49 // controllerKind contains the schema.GroupVersionKind for this controller type. 50 var controllerKind = apps.SchemeGroupVersion.WithKind("StatefulSet") 51 52 // StatefulSetController controls statefulsets. 53 type StatefulSetController struct { 54 // client interface 55 kubeClient clientset.Interface 56 // control returns an interface capable of syncing a stateful set. 57 // Abstracted out for testing. 58 control StatefulSetControlInterface 59 // podControl is used for patching pods. 60 podControl controller.PodControlInterface 61 // podLister is able to list/get pods from a shared informer's store 62 podLister corelisters.PodLister 63 // podListerSynced returns true if the pod shared informer has synced at least once 64 podListerSynced cache.InformerSynced 65 // setLister is able to list/get stateful sets from a shared informer's store 66 setLister appslisters.StatefulSetLister 67 // setListerSynced returns true if the stateful set shared informer has synced at least once 68 setListerSynced cache.InformerSynced 69 // pvcListerSynced returns true if the pvc shared informer has synced at least once 70 pvcListerSynced cache.InformerSynced 71 // revListerSynced returns true if the rev shared informer has synced at least once 72 revListerSynced cache.InformerSynced 73 // StatefulSets that need to be synced. 74 queue workqueue.TypedRateLimitingInterface[string] 75 // eventBroadcaster is the core of event processing pipeline. 76 eventBroadcaster record.EventBroadcaster 77 } 78 79 // NewStatefulSetController creates a new statefulset controller. 80 func NewStatefulSetController( 81 ctx context.Context, 82 podInformer coreinformers.PodInformer, 83 setInformer appsinformers.StatefulSetInformer, 84 pvcInformer coreinformers.PersistentVolumeClaimInformer, 85 revInformer appsinformers.ControllerRevisionInformer, 86 kubeClient clientset.Interface, 87 ) *StatefulSetController { 88 logger := klog.FromContext(ctx) 89 eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx)) 90 recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "statefulset-controller"}) 91 ssc := &StatefulSetController{ 92 kubeClient: kubeClient, 93 control: NewDefaultStatefulSetControl( 94 NewStatefulPodControl( 95 kubeClient, 96 podInformer.Lister(), 97 pvcInformer.Lister(), 98 recorder), 99 NewRealStatefulSetStatusUpdater(kubeClient, setInformer.Lister()), 100 history.NewHistory(kubeClient, revInformer.Lister()), 101 ), 102 pvcListerSynced: pvcInformer.Informer().HasSynced, 103 revListerSynced: revInformer.Informer().HasSynced, 104 queue: workqueue.NewTypedRateLimitingQueueWithConfig( 105 workqueue.DefaultTypedControllerRateLimiter[string](), 106 workqueue.TypedRateLimitingQueueConfig[string]{Name: "statefulset"}, 107 ), 108 podControl: controller.RealPodControl{KubeClient: kubeClient, Recorder: recorder}, 109 110 eventBroadcaster: eventBroadcaster, 111 } 112 113 podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 114 // lookup the statefulset and enqueue 115 AddFunc: func(obj interface{}) { 116 ssc.addPod(logger, obj) 117 }, 118 // lookup current and old statefulset if labels changed 119 UpdateFunc: func(oldObj, newObj interface{}) { 120 ssc.updatePod(logger, oldObj, newObj) 121 }, 122 // lookup statefulset accounting for deletion tombstones 123 DeleteFunc: func(obj interface{}) { 124 ssc.deletePod(logger, obj) 125 }, 126 }) 127 ssc.podLister = podInformer.Lister() 128 ssc.podListerSynced = podInformer.Informer().HasSynced 129 130 setInformer.Informer().AddEventHandler( 131 cache.ResourceEventHandlerFuncs{ 132 AddFunc: ssc.enqueueStatefulSet, 133 UpdateFunc: func(old, cur interface{}) { 134 oldPS := old.(*apps.StatefulSet) 135 curPS := cur.(*apps.StatefulSet) 136 if oldPS.Status.Replicas != curPS.Status.Replicas { 137 logger.V(4).Info("Observed updated replica count for StatefulSet", "statefulSet", klog.KObj(curPS), "oldReplicas", oldPS.Status.Replicas, "newReplicas", curPS.Status.Replicas) 138 } 139 ssc.enqueueStatefulSet(cur) 140 }, 141 DeleteFunc: ssc.enqueueStatefulSet, 142 }, 143 ) 144 ssc.setLister = setInformer.Lister() 145 ssc.setListerSynced = setInformer.Informer().HasSynced 146 147 // TODO: Watch volumes 148 return ssc 149 } 150 151 // Run runs the statefulset controller. 152 func (ssc *StatefulSetController) Run(ctx context.Context, workers int) { 153 defer utilruntime.HandleCrash() 154 155 // Start events processing pipeline. 156 ssc.eventBroadcaster.StartStructuredLogging(3) 157 ssc.eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: ssc.kubeClient.CoreV1().Events("")}) 158 defer ssc.eventBroadcaster.Shutdown() 159 160 defer ssc.queue.ShutDown() 161 162 logger := klog.FromContext(ctx) 163 logger.Info("Starting stateful set controller") 164 defer logger.Info("Shutting down statefulset controller") 165 166 if !cache.WaitForNamedCacheSync("stateful set", ctx.Done(), ssc.podListerSynced, ssc.setListerSynced, ssc.pvcListerSynced, ssc.revListerSynced) { 167 return 168 } 169 170 for i := 0; i < workers; i++ { 171 go wait.UntilWithContext(ctx, ssc.worker, time.Second) 172 } 173 174 <-ctx.Done() 175 } 176 177 // addPod adds the statefulset for the pod to the sync queue 178 func (ssc *StatefulSetController) addPod(logger klog.Logger, obj interface{}) { 179 pod := obj.(*v1.Pod) 180 181 if pod.DeletionTimestamp != nil { 182 // on a restart of the controller manager, it's possible a new pod shows up in a state that 183 // is already pending deletion. Prevent the pod from being a creation observation. 184 ssc.deletePod(logger, pod) 185 return 186 } 187 188 // If it has a ControllerRef, that's all that matters. 189 if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil { 190 set := ssc.resolveControllerRef(pod.Namespace, controllerRef) 191 if set == nil { 192 return 193 } 194 logger.V(4).Info("Pod created with labels", "pod", klog.KObj(pod), "labels", pod.Labels) 195 ssc.enqueueStatefulSet(set) 196 return 197 } 198 199 // Otherwise, it's an orphan. Get a list of all matching controllers and sync 200 // them to see if anyone wants to adopt it. 201 sets := ssc.getStatefulSetsForPod(pod) 202 if len(sets) == 0 { 203 return 204 } 205 logger.V(4).Info("Orphan Pod created with labels", "pod", klog.KObj(pod), "labels", pod.Labels) 206 for _, set := range sets { 207 ssc.enqueueStatefulSet(set) 208 } 209 } 210 211 // updatePod adds the statefulset for the current and old pods to the sync queue. 212 func (ssc *StatefulSetController) updatePod(logger klog.Logger, old, cur interface{}) { 213 curPod := cur.(*v1.Pod) 214 oldPod := old.(*v1.Pod) 215 if curPod.ResourceVersion == oldPod.ResourceVersion { 216 // In the event of a re-list we may receive update events for all known pods. 217 // Two different versions of the same pod will always have different RVs. 218 return 219 } 220 221 labelChanged := !reflect.DeepEqual(curPod.Labels, oldPod.Labels) 222 223 curControllerRef := metav1.GetControllerOf(curPod) 224 oldControllerRef := metav1.GetControllerOf(oldPod) 225 controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef) 226 if controllerRefChanged && oldControllerRef != nil { 227 // The ControllerRef was changed. Sync the old controller, if any. 228 if set := ssc.resolveControllerRef(oldPod.Namespace, oldControllerRef); set != nil { 229 ssc.enqueueStatefulSet(set) 230 } 231 } 232 233 // If it has a ControllerRef, that's all that matters. 234 if curControllerRef != nil { 235 set := ssc.resolveControllerRef(curPod.Namespace, curControllerRef) 236 if set == nil { 237 return 238 } 239 logger.V(4).Info("Pod objectMeta updated", "pod", klog.KObj(curPod), "oldObjectMeta", oldPod.ObjectMeta, "newObjectMeta", curPod.ObjectMeta) 240 if oldPod.Status.Phase != curPod.Status.Phase { 241 logger.V(4).Info("StatefulSet Pod phase changed", "pod", klog.KObj(curPod), "statefulSet", klog.KObj(set), "podPhase", curPod.Status.Phase) 242 } 243 ssc.enqueueStatefulSet(set) 244 // TODO: MinReadySeconds in the Pod will generate an Available condition to be added in 245 // the Pod status which in turn will trigger a requeue of the owning replica set thus 246 // having its status updated with the newly available replica. 247 if !podutil.IsPodReady(oldPod) && podutil.IsPodReady(curPod) && set.Spec.MinReadySeconds > 0 { 248 logger.V(2).Info("StatefulSet will be enqueued after minReadySeconds for availability check", "statefulSet", klog.KObj(set), "minReadySeconds", set.Spec.MinReadySeconds) 249 // Add a second to avoid milliseconds skew in AddAfter. 250 // See https://github.com/kubernetes/kubernetes/issues/39785#issuecomment-279959133 for more info. 251 ssc.enqueueSSAfter(set, (time.Duration(set.Spec.MinReadySeconds)*time.Second)+time.Second) 252 } 253 return 254 } 255 256 // Otherwise, it's an orphan. If anything changed, sync matching controllers 257 // to see if anyone wants to adopt it now. 258 if labelChanged || controllerRefChanged { 259 sets := ssc.getStatefulSetsForPod(curPod) 260 if len(sets) == 0 { 261 return 262 } 263 logger.V(4).Info("Orphan Pod objectMeta updated", "pod", klog.KObj(curPod), "oldObjectMeta", oldPod.ObjectMeta, "newObjectMeta", curPod.ObjectMeta) 264 for _, set := range sets { 265 ssc.enqueueStatefulSet(set) 266 } 267 } 268 } 269 270 // deletePod enqueues the statefulset for the pod accounting for deletion tombstones. 271 func (ssc *StatefulSetController) deletePod(logger klog.Logger, obj interface{}) { 272 pod, ok := obj.(*v1.Pod) 273 274 // When a delete is dropped, the relist will notice a pod in the store not 275 // in the list, leading to the insertion of a tombstone object which contains 276 // the deleted key/value. Note that this value might be stale. 277 if !ok { 278 tombstone, ok := obj.(cache.DeletedFinalStateUnknown) 279 if !ok { 280 utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %+v", obj)) 281 return 282 } 283 pod, ok = tombstone.Obj.(*v1.Pod) 284 if !ok { 285 utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %+v", obj)) 286 return 287 } 288 } 289 290 controllerRef := metav1.GetControllerOf(pod) 291 if controllerRef == nil { 292 // No controller should care about orphans being deleted. 293 return 294 } 295 set := ssc.resolveControllerRef(pod.Namespace, controllerRef) 296 if set == nil { 297 return 298 } 299 logger.V(4).Info("Pod deleted.", "pod", klog.KObj(pod), "caller", utilruntime.GetCaller()) 300 ssc.enqueueStatefulSet(set) 301 } 302 303 // getPodsForStatefulSet returns the Pods that a given StatefulSet should manage. 304 // It also reconciles ControllerRef by adopting/orphaning. 305 // 306 // NOTE: Returned Pods are pointers to objects from the cache. 307 // If you need to modify one, you need to copy it first. 308 func (ssc *StatefulSetController) getPodsForStatefulSet(ctx context.Context, set *apps.StatefulSet, selector labels.Selector) ([]*v1.Pod, error) { 309 // List all pods to include the pods that don't match the selector anymore but 310 // has a ControllerRef pointing to this StatefulSet. 311 pods, err := ssc.podLister.Pods(set.Namespace).List(labels.Everything()) 312 if err != nil { 313 return nil, err 314 } 315 316 filter := func(pod *v1.Pod) bool { 317 // Only claim if it matches our StatefulSet name. Otherwise release/ignore. 318 return isMemberOf(set, pod) 319 } 320 321 cm := controller.NewPodControllerRefManager(ssc.podControl, set, selector, controllerKind, ssc.canAdoptFunc(ctx, set)) 322 return cm.ClaimPods(ctx, pods, filter) 323 } 324 325 // If any adoptions are attempted, we should first recheck for deletion with 326 // an uncached quorum read sometime after listing Pods/ControllerRevisions (see #42639). 327 func (ssc *StatefulSetController) canAdoptFunc(ctx context.Context, set *apps.StatefulSet) func(ctx2 context.Context) error { 328 return controller.RecheckDeletionTimestamp(func(ctx context.Context) (metav1.Object, error) { 329 fresh, err := ssc.kubeClient.AppsV1().StatefulSets(set.Namespace).Get(ctx, set.Name, metav1.GetOptions{}) 330 if err != nil { 331 return nil, err 332 } 333 if fresh.UID != set.UID { 334 return nil, fmt.Errorf("original StatefulSet %v/%v is gone: got uid %v, wanted %v", set.Namespace, set.Name, fresh.UID, set.UID) 335 } 336 return fresh, nil 337 }) 338 } 339 340 // adoptOrphanRevisions adopts any orphaned ControllerRevisions matched by set's Selector. 341 func (ssc *StatefulSetController) adoptOrphanRevisions(ctx context.Context, set *apps.StatefulSet) error { 342 revisions, err := ssc.control.ListRevisions(set) 343 if err != nil { 344 return err 345 } 346 orphanRevisions := make([]*apps.ControllerRevision, 0) 347 for i := range revisions { 348 if metav1.GetControllerOf(revisions[i]) == nil { 349 orphanRevisions = append(orphanRevisions, revisions[i]) 350 } 351 } 352 if len(orphanRevisions) > 0 { 353 canAdoptErr := ssc.canAdoptFunc(ctx, set)(ctx) 354 if canAdoptErr != nil { 355 return fmt.Errorf("can't adopt ControllerRevisions: %v", canAdoptErr) 356 } 357 return ssc.control.AdoptOrphanRevisions(set, orphanRevisions) 358 } 359 return nil 360 } 361 362 // getStatefulSetsForPod returns a list of StatefulSets that potentially match 363 // a given pod. 364 func (ssc *StatefulSetController) getStatefulSetsForPod(pod *v1.Pod) []*apps.StatefulSet { 365 sets, err := ssc.setLister.GetPodStatefulSets(pod) 366 if err != nil { 367 return nil 368 } 369 // More than one set is selecting the same Pod 370 if len(sets) > 1 { 371 // ControllerRef will ensure we don't do anything crazy, but more than one 372 // item in this list nevertheless constitutes user error. 373 setNames := []string{} 374 for _, s := range sets { 375 setNames = append(setNames, s.Name) 376 } 377 utilruntime.HandleError( 378 fmt.Errorf( 379 "user error: more than one StatefulSet is selecting pods with labels: %+v. Sets: %v", 380 pod.Labels, setNames)) 381 } 382 return sets 383 } 384 385 // resolveControllerRef returns the controller referenced by a ControllerRef, 386 // or nil if the ControllerRef could not be resolved to a matching controller 387 // of the correct Kind. 388 func (ssc *StatefulSetController) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *apps.StatefulSet { 389 // We can't look up by UID, so look up by Name and then verify UID. 390 // Don't even try to look up by Name if it's the wrong Kind. 391 if controllerRef.Kind != controllerKind.Kind { 392 return nil 393 } 394 set, err := ssc.setLister.StatefulSets(namespace).Get(controllerRef.Name) 395 if err != nil { 396 return nil 397 } 398 if set.UID != controllerRef.UID { 399 // The controller we found with this Name is not the same one that the 400 // ControllerRef points to. 401 return nil 402 } 403 return set 404 } 405 406 // enqueueStatefulSet enqueues the given statefulset in the work queue. 407 func (ssc *StatefulSetController) enqueueStatefulSet(obj interface{}) { 408 key, err := controller.KeyFunc(obj) 409 if err != nil { 410 utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err)) 411 return 412 } 413 ssc.queue.Add(key) 414 } 415 416 // enqueueStatefulSet enqueues the given statefulset in the work queue after given time 417 func (ssc *StatefulSetController) enqueueSSAfter(ss *apps.StatefulSet, duration time.Duration) { 418 key, err := controller.KeyFunc(ss) 419 if err != nil { 420 utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", ss, err)) 421 return 422 } 423 ssc.queue.AddAfter(key, duration) 424 } 425 426 // processNextWorkItem dequeues items, processes them, and marks them done. It enforces that the syncHandler is never 427 // invoked concurrently with the same key. 428 func (ssc *StatefulSetController) processNextWorkItem(ctx context.Context) bool { 429 key, quit := ssc.queue.Get() 430 if quit { 431 return false 432 } 433 defer ssc.queue.Done(key) 434 if err := ssc.sync(ctx, key); err != nil { 435 utilruntime.HandleError(fmt.Errorf("error syncing StatefulSet %v, requeuing: %w", key, err)) 436 ssc.queue.AddRateLimited(key) 437 } else { 438 ssc.queue.Forget(key) 439 } 440 return true 441 } 442 443 // worker runs a worker goroutine that invokes processNextWorkItem until the controller's queue is closed 444 func (ssc *StatefulSetController) worker(ctx context.Context) { 445 for ssc.processNextWorkItem(ctx) { 446 } 447 } 448 449 // sync syncs the given statefulset. 450 func (ssc *StatefulSetController) sync(ctx context.Context, key string) error { 451 startTime := time.Now() 452 logger := klog.FromContext(ctx) 453 defer func() { 454 logger.V(4).Info("Finished syncing statefulset", "key", key, "time", time.Since(startTime)) 455 }() 456 457 namespace, name, err := cache.SplitMetaNamespaceKey(key) 458 if err != nil { 459 return err 460 } 461 set, err := ssc.setLister.StatefulSets(namespace).Get(name) 462 if errors.IsNotFound(err) { 463 logger.Info("StatefulSet has been deleted", "key", key) 464 return nil 465 } 466 if err != nil { 467 utilruntime.HandleError(fmt.Errorf("unable to retrieve StatefulSet %v from store: %v", key, err)) 468 return err 469 } 470 471 selector, err := metav1.LabelSelectorAsSelector(set.Spec.Selector) 472 if err != nil { 473 utilruntime.HandleError(fmt.Errorf("error converting StatefulSet %v selector: %v", key, err)) 474 // This is a non-transient error, so don't retry. 475 return nil 476 } 477 478 if err := ssc.adoptOrphanRevisions(ctx, set); err != nil { 479 return err 480 } 481 482 pods, err := ssc.getPodsForStatefulSet(ctx, set, selector) 483 if err != nil { 484 return err 485 } 486 487 return ssc.syncStatefulSet(ctx, set, pods) 488 } 489 490 // syncStatefulSet syncs a tuple of (statefulset, []*v1.Pod). 491 func (ssc *StatefulSetController) syncStatefulSet(ctx context.Context, set *apps.StatefulSet, pods []*v1.Pod) error { 492 logger := klog.FromContext(ctx) 493 logger.V(4).Info("Syncing StatefulSet with pods", "statefulSet", klog.KObj(set), "pods", len(pods)) 494 var status *apps.StatefulSetStatus 495 var err error 496 status, err = ssc.control.UpdateStatefulSet(ctx, set, pods) 497 if err != nil { 498 return err 499 } 500 logger.V(4).Info("Successfully synced StatefulSet", "statefulSet", klog.KObj(set)) 501 // One more sync to handle the clock skew. This is also helping in requeuing right after status update 502 if set.Spec.MinReadySeconds > 0 && status != nil && status.AvailableReplicas != *set.Spec.Replicas { 503 ssc.enqueueSSAfter(set, time.Duration(set.Spec.MinReadySeconds)*time.Second) 504 } 505 506 return nil 507 }