k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/statefulset/stateful_set_control.go (about)

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package statefulset
    18  
    19  import (
    20  	"context"
    21  	"sort"
    22  	"sync"
    23  
    24  	apps "k8s.io/api/apps/v1"
    25  	v1 "k8s.io/api/core/v1"
    26  	"k8s.io/apimachinery/pkg/api/errors"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  	utilerrors "k8s.io/apimachinery/pkg/util/errors"
    29  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    30  	"k8s.io/klog/v2"
    31  	"k8s.io/kubernetes/pkg/controller/history"
    32  	"k8s.io/kubernetes/pkg/features"
    33  )
    34  
// MaxBatchSize is a realistic value for the maximum number of in-flight requests
// issued per batch when processing in parallel mode.
const MaxBatchSize = 500
    37  
// StatefulSetControlInterface implements the control logic for updating StatefulSets and their children Pods. It is
// implemented as an interface to allow for extensions that provide different semantics. Currently, there is only one
// implementation.
type StatefulSetControlInterface interface {
	// UpdateStatefulSet implements the control logic for Pod creation, update, and deletion, and
	// persistent volume creation, update, and deletion.
	// If an implementation returns a non-nil error, the invocation will be retried using a rate-limited strategy.
	// Implementors should sink any errors that they do not wish to trigger a retry, and they may feel free to
	// exit exceptionally at any point provided they wish the update to be re-run at a later point in time.
	UpdateStatefulSet(ctx context.Context, set *apps.StatefulSet, pods []*v1.Pod) (*apps.StatefulSetStatus, error)
	// ListRevisions returns an array of the ControllerRevisions that represent the revisions of set. If the returned
	// error is nil, the returned slice of ControllerRevisions is valid.
	ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error)
	// AdoptOrphanRevisions adopts any orphaned ControllerRevisions that match set's Selector. If all adoptions are
	// successful the returned error is nil.
	AdoptOrphanRevisions(set *apps.StatefulSet, revisions []*apps.ControllerRevision) error
}
    54  
    55  // NewDefaultStatefulSetControl returns a new instance of the default implementation StatefulSetControlInterface that
    56  // implements the documented semantics for StatefulSets. podControl is the PodControlInterface used to create, update,
    57  // and delete Pods and to create PersistentVolumeClaims. statusUpdater is the StatefulSetStatusUpdaterInterface used
    58  // to update the status of StatefulSets. You should use an instance returned from NewRealStatefulPodControl() for any
    59  // scenario other than testing.
    60  func NewDefaultStatefulSetControl(
    61  	podControl *StatefulPodControl,
    62  	statusUpdater StatefulSetStatusUpdaterInterface,
    63  	controllerHistory history.Interface) StatefulSetControlInterface {
    64  	return &defaultStatefulSetControl{podControl, statusUpdater, controllerHistory}
    65  }
    66  
// defaultStatefulSetControl is the default implementation of StatefulSetControlInterface returned by
// NewDefaultStatefulSetControl.
type defaultStatefulSetControl struct {
	// podControl is used to create, update, and delete the set's Pods and to manage their claims.
	podControl *StatefulPodControl
	// statusUpdater is the status updater supplied at construction time.
	statusUpdater StatefulSetStatusUpdaterInterface
	// controllerHistory is used to list, adopt, create, update, and delete the set's ControllerRevisions.
	controllerHistory history.Interface
}
    72  
    73  // UpdateStatefulSet executes the core logic loop for a stateful set, applying the predictable and
    74  // consistent monotonic update strategy by default - scale up proceeds in ordinal order, no new pod
    75  // is created while any pod is unhealthy, and pods are terminated in descending order. The burst
    76  // strategy allows these constraints to be relaxed - pods will be created and deleted eagerly and
    77  // in no particular order. Clients using the burst strategy should be careful to ensure they
    78  // understand the consistency implications of having unpredictable numbers of pods available.
    79  func (ssc *defaultStatefulSetControl) UpdateStatefulSet(ctx context.Context, set *apps.StatefulSet, pods []*v1.Pod) (*apps.StatefulSetStatus, error) {
    80  	set = set.DeepCopy() // set is modified when a new revision is created in performUpdate. Make a copy now to avoid mutation errors.
    81  
    82  	// list all revisions and sort them
    83  	revisions, err := ssc.ListRevisions(set)
    84  	if err != nil {
    85  		return nil, err
    86  	}
    87  	history.SortControllerRevisions(revisions)
    88  
    89  	currentRevision, updateRevision, status, err := ssc.performUpdate(ctx, set, pods, revisions)
    90  	if err != nil {
    91  		errs := []error{err}
    92  		if agg, ok := err.(utilerrors.Aggregate); ok {
    93  			errs = agg.Errors()
    94  		}
    95  		return nil, utilerrors.NewAggregate(append(errs, ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision)))
    96  	}
    97  
    98  	// maintain the set's revision history limit
    99  	return status, ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision)
   100  }
   101  
   102  func (ssc *defaultStatefulSetControl) performUpdate(
   103  	ctx context.Context, set *apps.StatefulSet, pods []*v1.Pod, revisions []*apps.ControllerRevision) (*apps.ControllerRevision, *apps.ControllerRevision, *apps.StatefulSetStatus, error) {
   104  	var currentStatus *apps.StatefulSetStatus
   105  	logger := klog.FromContext(ctx)
   106  	// get the current, and update revisions
   107  	currentRevision, updateRevision, collisionCount, err := ssc.getStatefulSetRevisions(set, revisions)
   108  	if err != nil {
   109  		return currentRevision, updateRevision, currentStatus, err
   110  	}
   111  
   112  	// perform the main update function and get the status
   113  	currentStatus, err = ssc.updateStatefulSet(ctx, set, currentRevision, updateRevision, collisionCount, pods)
   114  	if err != nil && currentStatus == nil {
   115  		return currentRevision, updateRevision, nil, err
   116  	}
   117  
   118  	// make sure to update the latest status even if there is an error with non-nil currentStatus
   119  	statusErr := ssc.updateStatefulSetStatus(ctx, set, currentStatus)
   120  	if statusErr == nil {
   121  		logger.V(4).Info("Updated status", "statefulSet", klog.KObj(set),
   122  			"replicas", currentStatus.Replicas,
   123  			"readyReplicas", currentStatus.ReadyReplicas,
   124  			"currentReplicas", currentStatus.CurrentReplicas,
   125  			"updatedReplicas", currentStatus.UpdatedReplicas)
   126  	}
   127  
   128  	switch {
   129  	case err != nil && statusErr != nil:
   130  		logger.Error(statusErr, "Could not update status", "statefulSet", klog.KObj(set))
   131  		return currentRevision, updateRevision, currentStatus, err
   132  	case err != nil:
   133  		return currentRevision, updateRevision, currentStatus, err
   134  	case statusErr != nil:
   135  		return currentRevision, updateRevision, currentStatus, statusErr
   136  	}
   137  
   138  	logger.V(4).Info("StatefulSet revisions", "statefulSet", klog.KObj(set),
   139  		"currentRevision", currentStatus.CurrentRevision,
   140  		"updateRevision", currentStatus.UpdateRevision)
   141  
   142  	return currentRevision, updateRevision, currentStatus, nil
   143  }
   144  
   145  func (ssc *defaultStatefulSetControl) ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error) {
   146  	selector, err := metav1.LabelSelectorAsSelector(set.Spec.Selector)
   147  	if err != nil {
   148  		return nil, err
   149  	}
   150  	return ssc.controllerHistory.ListControllerRevisions(set, selector)
   151  }
   152  
   153  func (ssc *defaultStatefulSetControl) AdoptOrphanRevisions(
   154  	set *apps.StatefulSet,
   155  	revisions []*apps.ControllerRevision) error {
   156  	for i := range revisions {
   157  		adopted, err := ssc.controllerHistory.AdoptControllerRevision(set, controllerKind, revisions[i])
   158  		if err != nil {
   159  			return err
   160  		}
   161  		revisions[i] = adopted
   162  	}
   163  	return nil
   164  }
   165  
   166  // truncateHistory truncates any non-live ControllerRevisions in revisions from set's history. The UpdateRevision and
   167  // CurrentRevision in set's Status are considered to be live. Any revisions associated with the Pods in pods are also
   168  // considered to be live. Non-live revisions are deleted, starting with the revision with the lowest Revision, until
   169  // only RevisionHistoryLimit revisions remain. If the returned error is nil the operation was successful. This method
   170  // expects that revisions is sorted when supplied.
   171  func (ssc *defaultStatefulSetControl) truncateHistory(
   172  	set *apps.StatefulSet,
   173  	pods []*v1.Pod,
   174  	revisions []*apps.ControllerRevision,
   175  	current *apps.ControllerRevision,
   176  	update *apps.ControllerRevision) error {
   177  	history := make([]*apps.ControllerRevision, 0, len(revisions))
   178  	// mark all live revisions
   179  	live := map[string]bool{}
   180  	if current != nil {
   181  		live[current.Name] = true
   182  	}
   183  	if update != nil {
   184  		live[update.Name] = true
   185  	}
   186  	for i := range pods {
   187  		live[getPodRevision(pods[i])] = true
   188  	}
   189  	// collect live revisions and historic revisions
   190  	for i := range revisions {
   191  		if !live[revisions[i].Name] {
   192  			history = append(history, revisions[i])
   193  		}
   194  	}
   195  	historyLen := len(history)
   196  	historyLimit := int(*set.Spec.RevisionHistoryLimit)
   197  	if historyLen <= historyLimit {
   198  		return nil
   199  	}
   200  	// delete any non-live history to maintain the revision limit.
   201  	history = history[:(historyLen - historyLimit)]
   202  	for i := 0; i < len(history); i++ {
   203  		if err := ssc.controllerHistory.DeleteControllerRevision(history[i]); err != nil {
   204  			return err
   205  		}
   206  	}
   207  	return nil
   208  }
   209  
   210  // getStatefulSetRevisions returns the current and update ControllerRevisions for set. It also
   211  // returns a collision count that records the number of name collisions set saw when creating
   212  // new ControllerRevisions. This count is incremented on every name collision and is used in
   213  // building the ControllerRevision names for name collision avoidance. This method may create
   214  // a new revision, or modify the Revision of an existing revision if an update to set is detected.
   215  // This method expects that revisions is sorted when supplied.
   216  func (ssc *defaultStatefulSetControl) getStatefulSetRevisions(
   217  	set *apps.StatefulSet,
   218  	revisions []*apps.ControllerRevision) (*apps.ControllerRevision, *apps.ControllerRevision, int32, error) {
   219  	var currentRevision, updateRevision *apps.ControllerRevision
   220  
   221  	revisionCount := len(revisions)
   222  	history.SortControllerRevisions(revisions)
   223  
   224  	// Use a local copy of set.Status.CollisionCount to avoid modifying set.Status directly.
   225  	// This copy is returned so the value gets carried over to set.Status in updateStatefulSet.
   226  	var collisionCount int32
   227  	if set.Status.CollisionCount != nil {
   228  		collisionCount = *set.Status.CollisionCount
   229  	}
   230  
   231  	// create a new revision from the current set
   232  	updateRevision, err := newRevision(set, nextRevision(revisions), &collisionCount)
   233  	if err != nil {
   234  		return nil, nil, collisionCount, err
   235  	}
   236  
   237  	// find any equivalent revisions
   238  	equalRevisions := history.FindEqualRevisions(revisions, updateRevision)
   239  	equalCount := len(equalRevisions)
   240  
   241  	if equalCount > 0 && history.EqualRevision(revisions[revisionCount-1], equalRevisions[equalCount-1]) {
   242  		// if the equivalent revision is immediately prior the update revision has not changed
   243  		updateRevision = revisions[revisionCount-1]
   244  	} else if equalCount > 0 {
   245  		// if the equivalent revision is not immediately prior we will roll back by incrementing the
   246  		// Revision of the equivalent revision
   247  		updateRevision, err = ssc.controllerHistory.UpdateControllerRevision(
   248  			equalRevisions[equalCount-1],
   249  			updateRevision.Revision)
   250  		if err != nil {
   251  			return nil, nil, collisionCount, err
   252  		}
   253  	} else {
   254  		//if there is no equivalent revision we create a new one
   255  		updateRevision, err = ssc.controllerHistory.CreateControllerRevision(set, updateRevision, &collisionCount)
   256  		if err != nil {
   257  			return nil, nil, collisionCount, err
   258  		}
   259  	}
   260  
   261  	// attempt to find the revision that corresponds to the current revision
   262  	for i := range revisions {
   263  		if revisions[i].Name == set.Status.CurrentRevision {
   264  			currentRevision = revisions[i]
   265  			break
   266  		}
   267  	}
   268  
   269  	// if the current revision is nil we initialize the history by setting it to the update revision
   270  	if currentRevision == nil {
   271  		currentRevision = updateRevision
   272  	}
   273  
   274  	return currentRevision, updateRevision, collisionCount, nil
   275  }
   276  
   277  func slowStartBatch(initialBatchSize int, remaining int, fn func(int) (bool, error)) (int, error) {
   278  	successes := 0
   279  	j := 0
   280  	for batchSize := min(remaining, initialBatchSize); batchSize > 0; batchSize = min(min(2*batchSize, remaining), MaxBatchSize) {
   281  		errCh := make(chan error, batchSize)
   282  		var wg sync.WaitGroup
   283  		wg.Add(batchSize)
   284  		for i := 0; i < batchSize; i++ {
   285  			go func(k int) {
   286  				defer wg.Done()
   287  				// Ignore the first parameter - relevant for monotonic only.
   288  				if _, err := fn(k); err != nil {
   289  					errCh <- err
   290  				}
   291  			}(j)
   292  			j++
   293  		}
   294  		wg.Wait()
   295  		successes += batchSize - len(errCh)
   296  		close(errCh)
   297  		if len(errCh) > 0 {
   298  			errs := make([]error, 0)
   299  			for err := range errCh {
   300  				errs = append(errs, err)
   301  			}
   302  			return successes, utilerrors.NewAggregate(errs)
   303  		}
   304  		remaining -= batchSize
   305  	}
   306  	return successes, nil
   307  }
   308  
// replicaStatus holds the replica counters produced by computeReplicaStatus for one list of Pods.
type replicaStatus struct {
	// replicas counts the Pods that have been created.
	replicas int32
	// readyReplicas counts the Pods that are running and ready.
	readyReplicas int32
	// availableReplicas counts the running and ready Pods that are also available
	// with respect to minReadySeconds.
	availableReplicas int32
	// currentReplicas counts the created, non-terminating Pods at the current revision.
	currentReplicas int32
	// updatedReplicas counts the created, non-terminating Pods at the update revision.
	updatedReplicas int32
}
   316  
   317  func computeReplicaStatus(pods []*v1.Pod, minReadySeconds int32, currentRevision, updateRevision *apps.ControllerRevision) replicaStatus {
   318  	status := replicaStatus{}
   319  	for _, pod := range pods {
   320  		if isCreated(pod) {
   321  			status.replicas++
   322  		}
   323  
   324  		// count the number of running and ready replicas
   325  		if isRunningAndReady(pod) {
   326  			status.readyReplicas++
   327  			// count the number of running and available replicas
   328  			if isRunningAndAvailable(pod, minReadySeconds) {
   329  				status.availableReplicas++
   330  			}
   331  
   332  		}
   333  
   334  		// count the number of current and update replicas
   335  		if isCreated(pod) && !isTerminating(pod) {
   336  			revision := getPodRevision(pod)
   337  			if revision == currentRevision.Name {
   338  				status.currentReplicas++
   339  			}
   340  			if revision == updateRevision.Name {
   341  				status.updatedReplicas++
   342  			}
   343  		}
   344  	}
   345  	return status
   346  }
   347  
   348  func updateStatus(status *apps.StatefulSetStatus, minReadySeconds int32, currentRevision, updateRevision *apps.ControllerRevision, podLists ...[]*v1.Pod) {
   349  	status.Replicas = 0
   350  	status.ReadyReplicas = 0
   351  	status.AvailableReplicas = 0
   352  	status.CurrentReplicas = 0
   353  	status.UpdatedReplicas = 0
   354  	for _, list := range podLists {
   355  		replicaStatus := computeReplicaStatus(list, minReadySeconds, currentRevision, updateRevision)
   356  		status.Replicas += replicaStatus.replicas
   357  		status.ReadyReplicas += replicaStatus.readyReplicas
   358  		status.AvailableReplicas += replicaStatus.availableReplicas
   359  		status.CurrentReplicas += replicaStatus.currentReplicas
   360  		status.UpdatedReplicas += replicaStatus.updatedReplicas
   361  	}
   362  }
   363  
   364  func (ssc *defaultStatefulSetControl) processReplica(
   365  	ctx context.Context,
   366  	set *apps.StatefulSet,
   367  	updateSet *apps.StatefulSet,
   368  	monotonic bool,
   369  	replicas []*v1.Pod,
   370  	i int) (bool, error) {
   371  	logger := klog.FromContext(ctx)
   372  
   373  	// Note that pods with phase Succeeded will also trigger this event. This is
   374  	// because final pod phase of evicted or otherwise forcibly stopped pods
   375  	// (e.g. terminated on node reboot) is determined by the exit code of the
   376  	// container, not by the reason for pod termination. We should restart the pod
   377  	// regardless of the exit code.
   378  	if isFailed(replicas[i]) || isSucceeded(replicas[i]) {
   379  		if replicas[i].DeletionTimestamp == nil {
   380  			if err := ssc.podControl.DeleteStatefulPod(set, replicas[i]); err != nil {
   381  				return true, err
   382  			}
   383  		}
   384  		// New pod should be generated on the next sync after the current pod is removed from etcd.
   385  		return true, nil
   386  	}
   387  	// If we find a Pod that has not been created we create the Pod
   388  	if !isCreated(replicas[i]) {
   389  		if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetAutoDeletePVC) {
   390  			if isStale, err := ssc.podControl.PodClaimIsStale(set, replicas[i]); err != nil {
   391  				return true, err
   392  			} else if isStale {
   393  				// If a pod has a stale PVC, no more work can be done this round.
   394  				return true, err
   395  			}
   396  		}
   397  		if err := ssc.podControl.CreateStatefulPod(ctx, set, replicas[i]); err != nil {
   398  			return true, err
   399  		}
   400  		if monotonic {
   401  			// if the set does not allow bursting, return immediately
   402  			return true, nil
   403  		}
   404  	}
   405  
   406  	// If the Pod is in pending state then trigger PVC creation to create missing PVCs
   407  	if isPending(replicas[i]) {
   408  		logger.V(4).Info(
   409  			"StatefulSet is triggering PVC creation for pending Pod",
   410  			"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i]))
   411  		if err := ssc.podControl.createMissingPersistentVolumeClaims(ctx, set, replicas[i]); err != nil {
   412  			return true, err
   413  		}
   414  	}
   415  
   416  	// If we find a Pod that is currently terminating, we must wait until graceful deletion
   417  	// completes before we continue to make progress.
   418  	if isTerminating(replicas[i]) && monotonic {
   419  		logger.V(4).Info("StatefulSet is waiting for Pod to Terminate",
   420  			"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i]))
   421  		return true, nil
   422  	}
   423  
   424  	// If we have a Pod that has been created but is not running and ready we can not make progress.
   425  	// We must ensure that all for each Pod, when we create it, all of its predecessors, with respect to its
   426  	// ordinal, are Running and Ready.
   427  	if !isRunningAndReady(replicas[i]) && monotonic {
   428  		logger.V(4).Info("StatefulSet is waiting for Pod to be Running and Ready",
   429  			"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i]))
   430  		return true, nil
   431  	}
   432  
   433  	// If we have a Pod that has been created but is not available we can not make progress.
   434  	// We must ensure that all for each Pod, when we create it, all of its predecessors, with respect to its
   435  	// ordinal, are Available.
   436  	if !isRunningAndAvailable(replicas[i], set.Spec.MinReadySeconds) && monotonic {
   437  		logger.V(4).Info("StatefulSet is waiting for Pod to be Available",
   438  			"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i]))
   439  		return true, nil
   440  	}
   441  
   442  	// Enforce the StatefulSet invariants
   443  	retentionMatch := true
   444  	if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetAutoDeletePVC) {
   445  		var err error
   446  		retentionMatch, err = ssc.podControl.ClaimsMatchRetentionPolicy(ctx, updateSet, replicas[i])
   447  		// An error is expected if the pod is not yet fully updated, and so return is treated as matching.
   448  		if err != nil {
   449  			retentionMatch = true
   450  		}
   451  	}
   452  
   453  	if identityMatches(set, replicas[i]) && storageMatches(set, replicas[i]) && retentionMatch {
   454  		return false, nil
   455  	}
   456  
   457  	// Make a deep copy so we don't mutate the shared cache
   458  	replica := replicas[i].DeepCopy()
   459  	if err := ssc.podControl.UpdateStatefulPod(ctx, updateSet, replica); err != nil {
   460  		return true, err
   461  	}
   462  
   463  	return false, nil
   464  }
   465  
   466  func (ssc *defaultStatefulSetControl) processCondemned(ctx context.Context, set *apps.StatefulSet, firstUnhealthyPod *v1.Pod, monotonic bool, condemned []*v1.Pod, i int) (bool, error) {
   467  	logger := klog.FromContext(ctx)
   468  	if isTerminating(condemned[i]) {
   469  		// if we are in monotonic mode, block and wait for terminating pods to expire
   470  		if monotonic {
   471  			logger.V(4).Info("StatefulSet is waiting for Pod to Terminate prior to scale down",
   472  				"statefulSet", klog.KObj(set), "pod", klog.KObj(condemned[i]))
   473  			return true, nil
   474  		}
   475  		return false, nil
   476  	}
   477  	// if we are in monotonic mode and the condemned target is not the first unhealthy Pod block
   478  	if !isRunningAndReady(condemned[i]) && monotonic && condemned[i] != firstUnhealthyPod {
   479  		logger.V(4).Info("StatefulSet is waiting for Pod to be Running and Ready prior to scale down",
   480  			"statefulSet", klog.KObj(set), "pod", klog.KObj(firstUnhealthyPod))
   481  		return true, nil
   482  	}
   483  	// if we are in monotonic mode and the condemned target is not the first unhealthy Pod, block.
   484  	if !isRunningAndAvailable(condemned[i], set.Spec.MinReadySeconds) && monotonic && condemned[i] != firstUnhealthyPod {
   485  		logger.V(4).Info("StatefulSet is waiting for Pod to be Available prior to scale down",
   486  			"statefulSet", klog.KObj(set), "pod", klog.KObj(firstUnhealthyPod))
   487  		return true, nil
   488  	}
   489  
   490  	logger.V(2).Info("Pod of StatefulSet is terminating for scale down",
   491  		"statefulSet", klog.KObj(set), "pod", klog.KObj(condemned[i]))
   492  	return true, ssc.podControl.DeleteStatefulPod(set, condemned[i])
   493  }
   494  
   495  func runForAll(pods []*v1.Pod, fn func(i int) (bool, error), monotonic bool) (bool, error) {
   496  	if monotonic {
   497  		for i := range pods {
   498  			if shouldExit, err := fn(i); shouldExit || err != nil {
   499  				return true, err
   500  			}
   501  		}
   502  	} else {
   503  		if _, err := slowStartBatch(1, len(pods), fn); err != nil {
   504  			return true, err
   505  		}
   506  	}
   507  	return false, nil
   508  }
   509  
// updateStatefulSet performs the update function for a StatefulSet. This method creates, updates, and deletes Pods in
// the set in order to conform the system to the target state for the set. The target state always contains
// set.Spec.Replicas Pods with a Ready Condition. If the UpdateStrategy.Type for the set is
// RollingUpdateStatefulSetStrategyType then all Pods in the set must be at set.Status.CurrentRevision.
// If the UpdateStrategy.Type for the set is OnDeleteStatefulSetStrategyType, the target state implies nothing about
// the revisions of Pods in the set. If the UpdateStrategy.Type for the set is PartitionStatefulSetStrategyType, then
// all Pods with ordinal less than UpdateStrategy.Partition.Ordinal must be at Status.CurrentRevision and all other
// Pods must be at Status.UpdateRevision. If the returned error is nil, the returned StatefulSetStatus is valid and the
// update must be recorded. If the error is not nil, the method should be retried until successful.
//
// The returned status is nil only when applying currentRevision or updateRevision to set fails. On every later
// error path a non-nil status is returned alongside the error.
func (ssc *defaultStatefulSetControl) updateStatefulSet(
	ctx context.Context,
	set *apps.StatefulSet,
	currentRevision *apps.ControllerRevision,
	updateRevision *apps.ControllerRevision,
	collisionCount int32,
	pods []*v1.Pod) (*apps.StatefulSetStatus, error) {
	logger := klog.FromContext(ctx)
	// get the current and update revisions of the set.
	currentSet, err := ApplyRevision(set, currentRevision)
	if err != nil {
		return nil, err
	}
	updateSet, err := ApplyRevision(set, updateRevision)
	if err != nil {
		return nil, err
	}

	// set the generation, and revisions in the returned status
	status := apps.StatefulSetStatus{}
	status.ObservedGeneration = set.Generation
	status.CurrentRevision = currentRevision.Name
	status.UpdateRevision = updateRevision.Name
	status.CollisionCount = new(int32)
	*status.CollisionCount = collisionCount

	// initial replica counters, computed from the Pods as they were passed in
	updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, pods)

	replicaCount := int(*set.Spec.Replicas)
	// slice that will contain all Pods such that getStartOrdinal(set) <= getOrdinal(pod) <= getEndOrdinal(set)
	replicas := make([]*v1.Pod, replicaCount)
	// slice that will contain all Pods such that getOrdinal(pod) < getStartOrdinal(set) OR getOrdinal(pod) > getEndOrdinal(set)
	condemned := make([]*v1.Pod, 0, len(pods))
	unhealthy := 0
	var firstUnhealthyPod *v1.Pod

	// First we partition pods into two lists valid replicas and condemned Pods
	for _, pod := range pods {
		if podInOrdinalRange(pod, set) {
			// if the ordinal of the pod is within the range of the current number of replicas,
			// insert it at the indirection of its ordinal
			replicas[getOrdinal(pod)-getStartOrdinal(set)] = pod
		} else if getOrdinal(pod) >= 0 {
			// if the ordinal is valid, but not within the range add it to the condemned list
			condemned = append(condemned, pod)
		}
		// If the ordinal could not be parsed (ord < 0), ignore the Pod.
	}

	// for any empty indices in the sequence [getStartOrdinal(set), getEndOrdinal(set)] create a new Pod
	// object at the correct revision
	for ord := getStartOrdinal(set); ord <= getEndOrdinal(set); ord++ {
		replicaIdx := ord - getStartOrdinal(set)
		if replicas[replicaIdx] == nil {
			replicas[replicaIdx] = newVersionedStatefulSetPod(
				currentSet,
				updateSet,
				currentRevision.Name,
				updateRevision.Name, ord)
		}
	}

	// sort the condemned Pods by their ordinals
	sort.Sort(descendingOrdinal(condemned))

	// find the first unhealthy Pod
	for i := range replicas {
		if !isHealthy(replicas[i]) {
			unhealthy++
			if firstUnhealthyPod == nil {
				firstUnhealthyPod = replicas[i]
			}
		}
	}

	// or the first unhealthy condemned Pod (condemned are sorted in descending order for ease of use)
	for i := len(condemned) - 1; i >= 0; i-- {
		if !isHealthy(condemned[i]) {
			unhealthy++
			if firstUnhealthyPod == nil {
				firstUnhealthyPod = condemned[i]
			}
		}
	}

	if unhealthy > 0 {
		logger.V(4).Info("StatefulSet has unhealthy Pods", "statefulSet", klog.KObj(set), "unhealthyReplicas", unhealthy, "pod", klog.KObj(firstUnhealthyPod))
	}

	// If the StatefulSet is being deleted, don't do anything other than updating
	// status.
	if set.DeletionTimestamp != nil {
		return &status, nil
	}

	// monotonic (ordered) processing applies unless the set allows bursting
	monotonic := !allowsBurst(set)

	// First, process each living replica. Exit if we run into an error or something blocking in monotonic mode.
	processReplicaFn := func(i int) (bool, error) {
		return ssc.processReplica(ctx, set, updateSet, monotonic, replicas, i)
	}
	if shouldExit, err := runForAll(replicas, processReplicaFn, monotonic); shouldExit || err != nil {
		// recompute the counters from replicas and condemned before returning early
		updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)
		return &status, err
	}

	// Fix pod claims for condemned pods, if necessary.
	if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetAutoDeletePVC) {
		fixPodClaim := func(i int) (bool, error) {
			if matchPolicy, err := ssc.podControl.ClaimsMatchRetentionPolicy(ctx, updateSet, condemned[i]); err != nil {
				return true, err
			} else if !matchPolicy {
				if err := ssc.podControl.UpdatePodClaimForRetentionPolicy(ctx, updateSet, condemned[i]); err != nil {
					return true, err
				}
			}
			return false, nil
		}
		if shouldExit, err := runForAll(condemned, fixPodClaim, monotonic); shouldExit || err != nil {
			updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)
			return &status, err
		}
	}

	// At this point, in monotonic mode all of the current Replicas are Running, Ready and Available,
	// and we can consider termination.
	// We will wait for all predecessors to be Running and Ready prior to attempting a deletion.
	// We will terminate Pods in a monotonically decreasing order.
	// Note that we do not resurrect Pods in this interval. Also note that scaling will take precedence over
	// updates.
	processCondemnedFn := func(i int) (bool, error) {
		return ssc.processCondemned(ctx, set, firstUnhealthyPod, monotonic, condemned, i)
	}
	if shouldExit, err := runForAll(condemned, processCondemnedFn, monotonic); shouldExit || err != nil {
		updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)
		return &status, err
	}

	updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)

	// for the OnDelete strategy we short circuit. Pods will be updated when they are manually deleted.
	if set.Spec.UpdateStrategy.Type == apps.OnDeleteStatefulSetStrategyType {
		return &status, nil
	}

	// with the MaxUnavailableStatefulSet feature gate enabled, the rolling update is delegated to
	// updateStatefulSetAfterInvariantEstablished instead of the loop below
	if utilfeature.DefaultFeatureGate.Enabled(features.MaxUnavailableStatefulSet) {
		return updateStatefulSetAfterInvariantEstablished(ctx,
			ssc,
			set,
			replicas,
			updateRevision,
			status,
		)
	}

	// we compute the minimum ordinal of the target sequence for a destructive update based on the strategy.
	updateMin := 0
	if set.Spec.UpdateStrategy.RollingUpdate != nil {
		updateMin = int(*set.Spec.UpdateStrategy.RollingUpdate.Partition)
	}
	// we terminate the Pod with the largest ordinal that does not match the update revision.
	for target := len(replicas) - 1; target >= updateMin; target-- {

		// delete the Pod if it is not already terminating and does not match the update revision.
		if getPodRevision(replicas[target]) != updateRevision.Name && !isTerminating(replicas[target]) {
			logger.V(2).Info("Pod of StatefulSet is terminating for update",
				"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[target]))
			// a NotFound error from the delete is tolerated; any other error is returned
			if err := ssc.podControl.DeleteStatefulPod(set, replicas[target]); err != nil {
				if !errors.IsNotFound(err) {
					return &status, err
				}
			}
			status.CurrentReplicas--
			// err here is the function-scoped err from ApplyRevision, which is nil at this point;
			// the delete error above is scoped to its own if statement.
			return &status, err
		}

		// wait for unhealthy Pods on update
		if !isHealthy(replicas[target]) {
			logger.V(4).Info("StatefulSet is waiting for Pod to update",
				"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[target]))
			return &status, nil
		}

	}
	return &status, nil
}
   704  
   705  func updateStatefulSetAfterInvariantEstablished(
   706  	ctx context.Context,
   707  	ssc *defaultStatefulSetControl,
   708  	set *apps.StatefulSet,
   709  	replicas []*v1.Pod,
   710  	updateRevision *apps.ControllerRevision,
   711  	status apps.StatefulSetStatus,
   712  ) (*apps.StatefulSetStatus, error) {
   713  
   714  	logger := klog.FromContext(ctx)
   715  	replicaCount := int(*set.Spec.Replicas)
   716  
   717  	// we compute the minimum ordinal of the target sequence for a destructive update based on the strategy.
   718  	updateMin := 0
   719  	maxUnavailable := 1
   720  	if set.Spec.UpdateStrategy.RollingUpdate != nil {
   721  		updateMin = int(*set.Spec.UpdateStrategy.RollingUpdate.Partition)
   722  
   723  		// if the feature was enabled and then later disabled, MaxUnavailable may have a value
   724  		// more than 1. Ignore the passed in value and Use maxUnavailable as 1 to enforce
   725  		// expected behavior when feature gate is not enabled.
   726  		var err error
   727  		maxUnavailable, err = getStatefulSetMaxUnavailable(set.Spec.UpdateStrategy.RollingUpdate.MaxUnavailable, replicaCount)
   728  		if err != nil {
   729  			return &status, err
   730  		}
   731  	}
   732  
   733  	// Collect all targets in the range between getStartOrdinal(set) and getEndOrdinal(set). Count any targets in that range
   734  	// that are unhealthy i.e. terminated or not running and ready as unavailable). Select the
   735  	// (MaxUnavailable - Unavailable) Pods, in order with respect to their ordinal for termination. Delete
   736  	// those pods and count the successful deletions. Update the status with the correct number of deletions.
   737  	unavailablePods := 0
   738  	for target := len(replicas) - 1; target >= 0; target-- {
   739  		if !isHealthy(replicas[target]) {
   740  			unavailablePods++
   741  		}
   742  	}
   743  
   744  	if unavailablePods >= maxUnavailable {
   745  		logger.V(2).Info("StatefulSet found unavailablePods, more than or equal to allowed maxUnavailable",
   746  			"statefulSet", klog.KObj(set),
   747  			"unavailablePods", unavailablePods,
   748  			"maxUnavailable", maxUnavailable)
   749  		return &status, nil
   750  	}
   751  
   752  	// Now we need to delete MaxUnavailable- unavailablePods
   753  	// start deleting one by one starting from the highest ordinal first
   754  	podsToDelete := maxUnavailable - unavailablePods
   755  
   756  	deletedPods := 0
   757  	for target := len(replicas) - 1; target >= updateMin && deletedPods < podsToDelete; target-- {
   758  
   759  		// delete the Pod if it is healthy and the revision doesnt match the target
   760  		if getPodRevision(replicas[target]) != updateRevision.Name && !isTerminating(replicas[target]) {
   761  			// delete the Pod if it is healthy and the revision doesnt match the target
   762  			logger.V(2).Info("StatefulSet terminating Pod for update",
   763  				"statefulSet", klog.KObj(set),
   764  				"pod", klog.KObj(replicas[target]))
   765  			if err := ssc.podControl.DeleteStatefulPod(set, replicas[target]); err != nil {
   766  				if !errors.IsNotFound(err) {
   767  					return &status, err
   768  				}
   769  			}
   770  			deletedPods++
   771  			status.CurrentReplicas--
   772  		}
   773  	}
   774  	return &status, nil
   775  }
   776  
   777  // updateStatefulSetStatus updates set's Status to be equal to status. If status indicates a complete update, it is
   778  // mutated to indicate completion. If status is semantically equivalent to set's Status no update is performed. If the
   779  // returned error is nil, the update is successful.
   780  func (ssc *defaultStatefulSetControl) updateStatefulSetStatus(
   781  	ctx context.Context,
   782  	set *apps.StatefulSet,
   783  	status *apps.StatefulSetStatus) error {
   784  	// complete any in progress rolling update if necessary
   785  	completeRollingUpdate(set, status)
   786  
   787  	// if the status is not inconsistent do not perform an update
   788  	if !inconsistentStatus(set, status) {
   789  		return nil
   790  	}
   791  
   792  	// copy set and update its status
   793  	set = set.DeepCopy()
   794  	if err := ssc.statusUpdater.UpdateStatefulSetStatus(ctx, set, status); err != nil {
   795  		return err
   796  	}
   797  
   798  	return nil
   799  }
   800  
   801  var _ StatefulSetControlInterface = &defaultStatefulSetControl{}