k8s.io/kubernetes@v1.29.3/pkg/controller/statefulset/stateful_set_control.go (about)

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package statefulset
    18  
    19  import (
    20  	"context"
    21  	"sort"
    22  	"sync"
    23  
    24  	apps "k8s.io/api/apps/v1"
    25  	v1 "k8s.io/api/core/v1"
    26  	"k8s.io/apimachinery/pkg/api/errors"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  	utilerrors "k8s.io/apimachinery/pkg/util/errors"
    29  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    30  	"k8s.io/client-go/tools/record"
    31  	"k8s.io/klog/v2"
    32  	"k8s.io/kubernetes/pkg/controller/history"
    33  	"k8s.io/kubernetes/pkg/features"
    34  	"k8s.io/utils/integer"
    35  )
    36  
// MaxBatchSize is a realistic value for the maximum number of in-flight requests when processing in parallel mode.
// slowStartBatch uses it as the upper bound when growing its batch size.
const MaxBatchSize = 500
    39  
// StatefulSetControlInterface defines the control logic for updating StatefulSets and their children Pods. It is
// declared as an interface to allow for extensions that provide different semantics. Currently, there is only one
// implementation.
type StatefulSetControlInterface interface {
	// UpdateStatefulSet implements the control logic for Pod creation, update, and deletion, and
	// persistent volume creation, update, and deletion.
	// If an implementation returns a non-nil error, the invocation will be retried using a rate-limited strategy.
	// Implementors should sink any errors that they do not wish to trigger a retry, and they may feel free to
	// exit exceptionally at any point provided they wish the update to be re-run at a later point in time.
	UpdateStatefulSet(ctx context.Context, set *apps.StatefulSet, pods []*v1.Pod) (*apps.StatefulSetStatus, error)
	// ListRevisions returns an array of the ControllerRevisions that represent the revisions of set. If the returned
	// error is nil, the returned slice of ControllerRevisions is valid.
	ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error)
	// AdoptOrphanRevisions adopts any orphaned ControllerRevisions that match set's Selector. If all adoptions are
	// successful the returned error is nil.
	AdoptOrphanRevisions(set *apps.StatefulSet, revisions []*apps.ControllerRevision) error
}
    56  
    57  // NewDefaultStatefulSetControl returns a new instance of the default implementation StatefulSetControlInterface that
    58  // implements the documented semantics for StatefulSets. podControl is the PodControlInterface used to create, update,
    59  // and delete Pods and to create PersistentVolumeClaims. statusUpdater is the StatefulSetStatusUpdaterInterface used
    60  // to update the status of StatefulSets. You should use an instance returned from NewRealStatefulPodControl() for any
    61  // scenario other than testing.
    62  func NewDefaultStatefulSetControl(
    63  	podControl *StatefulPodControl,
    64  	statusUpdater StatefulSetStatusUpdaterInterface,
    65  	controllerHistory history.Interface,
    66  	recorder record.EventRecorder) StatefulSetControlInterface {
    67  	return &defaultStatefulSetControl{podControl, statusUpdater, controllerHistory, recorder}
    68  }
    69  
// defaultStatefulSetControl is the StatefulSetControlInterface implementation returned by
// NewDefaultStatefulSetControl. The constructor fills it with a positional struct literal, so the
// field order below must stay in sync with the constructor's argument order.
type defaultStatefulSetControl struct {
	// podControl creates, updates, and deletes the set's Pods and creates their PersistentVolumeClaims.
	podControl        *StatefulPodControl
	// statusUpdater is used to update the status of StatefulSets.
	statusUpdater     StatefulSetStatusUpdaterInterface
	// controllerHistory lists, adopts, creates, updates, and deletes the set's ControllerRevisions.
	controllerHistory history.Interface
	// recorder emits events about the set (for example when a failed Pod is recreated).
	recorder          record.EventRecorder
}
    76  
    77  // UpdateStatefulSet executes the core logic loop for a stateful set, applying the predictable and
    78  // consistent monotonic update strategy by default - scale up proceeds in ordinal order, no new pod
    79  // is created while any pod is unhealthy, and pods are terminated in descending order. The burst
    80  // strategy allows these constraints to be relaxed - pods will be created and deleted eagerly and
    81  // in no particular order. Clients using the burst strategy should be careful to ensure they
    82  // understand the consistency implications of having unpredictable numbers of pods available.
    83  func (ssc *defaultStatefulSetControl) UpdateStatefulSet(ctx context.Context, set *apps.StatefulSet, pods []*v1.Pod) (*apps.StatefulSetStatus, error) {
    84  	set = set.DeepCopy() // set is modified when a new revision is created in performUpdate. Make a copy now to avoid mutation errors.
    85  
    86  	// list all revisions and sort them
    87  	revisions, err := ssc.ListRevisions(set)
    88  	if err != nil {
    89  		return nil, err
    90  	}
    91  	history.SortControllerRevisions(revisions)
    92  
    93  	currentRevision, updateRevision, status, err := ssc.performUpdate(ctx, set, pods, revisions)
    94  	if err != nil {
    95  		errs := []error{err}
    96  		if agg, ok := err.(utilerrors.Aggregate); ok {
    97  			errs = agg.Errors()
    98  		}
    99  		return nil, utilerrors.NewAggregate(append(errs, ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision)))
   100  	}
   101  
   102  	// maintain the set's revision history limit
   103  	return status, ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision)
   104  }
   105  
   106  func (ssc *defaultStatefulSetControl) performUpdate(
   107  	ctx context.Context, set *apps.StatefulSet, pods []*v1.Pod, revisions []*apps.ControllerRevision) (*apps.ControllerRevision, *apps.ControllerRevision, *apps.StatefulSetStatus, error) {
   108  	var currentStatus *apps.StatefulSetStatus
   109  	logger := klog.FromContext(ctx)
   110  	// get the current, and update revisions
   111  	currentRevision, updateRevision, collisionCount, err := ssc.getStatefulSetRevisions(set, revisions)
   112  	if err != nil {
   113  		return currentRevision, updateRevision, currentStatus, err
   114  	}
   115  
   116  	// perform the main update function and get the status
   117  	currentStatus, err = ssc.updateStatefulSet(ctx, set, currentRevision, updateRevision, collisionCount, pods)
   118  	if err != nil && currentStatus == nil {
   119  		return currentRevision, updateRevision, nil, err
   120  	}
   121  
   122  	// make sure to update the latest status even if there is an error with non-nil currentStatus
   123  	statusErr := ssc.updateStatefulSetStatus(ctx, set, currentStatus)
   124  	if statusErr == nil {
   125  		logger.V(4).Info("Updated status", "statefulSet", klog.KObj(set),
   126  			"replicas", currentStatus.Replicas,
   127  			"readyReplicas", currentStatus.ReadyReplicas,
   128  			"currentReplicas", currentStatus.CurrentReplicas,
   129  			"updatedReplicas", currentStatus.UpdatedReplicas)
   130  	}
   131  
   132  	switch {
   133  	case err != nil && statusErr != nil:
   134  		logger.Error(statusErr, "Could not update status", "statefulSet", klog.KObj(set))
   135  		return currentRevision, updateRevision, currentStatus, err
   136  	case err != nil:
   137  		return currentRevision, updateRevision, currentStatus, err
   138  	case statusErr != nil:
   139  		return currentRevision, updateRevision, currentStatus, statusErr
   140  	}
   141  
   142  	logger.V(4).Info("StatefulSet revisions", "statefulSet", klog.KObj(set),
   143  		"currentRevision", currentStatus.CurrentRevision,
   144  		"updateRevision", currentStatus.UpdateRevision)
   145  
   146  	return currentRevision, updateRevision, currentStatus, nil
   147  }
   148  
   149  func (ssc *defaultStatefulSetControl) ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error) {
   150  	selector, err := metav1.LabelSelectorAsSelector(set.Spec.Selector)
   151  	if err != nil {
   152  		return nil, err
   153  	}
   154  	return ssc.controllerHistory.ListControllerRevisions(set, selector)
   155  }
   156  
   157  func (ssc *defaultStatefulSetControl) AdoptOrphanRevisions(
   158  	set *apps.StatefulSet,
   159  	revisions []*apps.ControllerRevision) error {
   160  	for i := range revisions {
   161  		adopted, err := ssc.controllerHistory.AdoptControllerRevision(set, controllerKind, revisions[i])
   162  		if err != nil {
   163  			return err
   164  		}
   165  		revisions[i] = adopted
   166  	}
   167  	return nil
   168  }
   169  
   170  // truncateHistory truncates any non-live ControllerRevisions in revisions from set's history. The UpdateRevision and
   171  // CurrentRevision in set's Status are considered to be live. Any revisions associated with the Pods in pods are also
   172  // considered to be live. Non-live revisions are deleted, starting with the revision with the lowest Revision, until
   173  // only RevisionHistoryLimit revisions remain. If the returned error is nil the operation was successful. This method
   174  // expects that revisions is sorted when supplied.
   175  func (ssc *defaultStatefulSetControl) truncateHistory(
   176  	set *apps.StatefulSet,
   177  	pods []*v1.Pod,
   178  	revisions []*apps.ControllerRevision,
   179  	current *apps.ControllerRevision,
   180  	update *apps.ControllerRevision) error {
   181  	history := make([]*apps.ControllerRevision, 0, len(revisions))
   182  	// mark all live revisions
   183  	live := map[string]bool{}
   184  	if current != nil {
   185  		live[current.Name] = true
   186  	}
   187  	if update != nil {
   188  		live[update.Name] = true
   189  	}
   190  	for i := range pods {
   191  		live[getPodRevision(pods[i])] = true
   192  	}
   193  	// collect live revisions and historic revisions
   194  	for i := range revisions {
   195  		if !live[revisions[i].Name] {
   196  			history = append(history, revisions[i])
   197  		}
   198  	}
   199  	historyLen := len(history)
   200  	historyLimit := int(*set.Spec.RevisionHistoryLimit)
   201  	if historyLen <= historyLimit {
   202  		return nil
   203  	}
   204  	// delete any non-live history to maintain the revision limit.
   205  	history = history[:(historyLen - historyLimit)]
   206  	for i := 0; i < len(history); i++ {
   207  		if err := ssc.controllerHistory.DeleteControllerRevision(history[i]); err != nil {
   208  			return err
   209  		}
   210  	}
   211  	return nil
   212  }
   213  
   214  // getStatefulSetRevisions returns the current and update ControllerRevisions for set. It also
   215  // returns a collision count that records the number of name collisions set saw when creating
   216  // new ControllerRevisions. This count is incremented on every name collision and is used in
   217  // building the ControllerRevision names for name collision avoidance. This method may create
   218  // a new revision, or modify the Revision of an existing revision if an update to set is detected.
   219  // This method expects that revisions is sorted when supplied.
   220  func (ssc *defaultStatefulSetControl) getStatefulSetRevisions(
   221  	set *apps.StatefulSet,
   222  	revisions []*apps.ControllerRevision) (*apps.ControllerRevision, *apps.ControllerRevision, int32, error) {
   223  	var currentRevision, updateRevision *apps.ControllerRevision
   224  
   225  	revisionCount := len(revisions)
   226  	history.SortControllerRevisions(revisions)
   227  
   228  	// Use a local copy of set.Status.CollisionCount to avoid modifying set.Status directly.
   229  	// This copy is returned so the value gets carried over to set.Status in updateStatefulSet.
   230  	var collisionCount int32
   231  	if set.Status.CollisionCount != nil {
   232  		collisionCount = *set.Status.CollisionCount
   233  	}
   234  
   235  	// create a new revision from the current set
   236  	updateRevision, err := newRevision(set, nextRevision(revisions), &collisionCount)
   237  	if err != nil {
   238  		return nil, nil, collisionCount, err
   239  	}
   240  
   241  	// find any equivalent revisions
   242  	equalRevisions := history.FindEqualRevisions(revisions, updateRevision)
   243  	equalCount := len(equalRevisions)
   244  
   245  	if equalCount > 0 && history.EqualRevision(revisions[revisionCount-1], equalRevisions[equalCount-1]) {
   246  		// if the equivalent revision is immediately prior the update revision has not changed
   247  		updateRevision = revisions[revisionCount-1]
   248  	} else if equalCount > 0 {
   249  		// if the equivalent revision is not immediately prior we will roll back by incrementing the
   250  		// Revision of the equivalent revision
   251  		updateRevision, err = ssc.controllerHistory.UpdateControllerRevision(
   252  			equalRevisions[equalCount-1],
   253  			updateRevision.Revision)
   254  		if err != nil {
   255  			return nil, nil, collisionCount, err
   256  		}
   257  	} else {
   258  		//if there is no equivalent revision we create a new one
   259  		updateRevision, err = ssc.controllerHistory.CreateControllerRevision(set, updateRevision, &collisionCount)
   260  		if err != nil {
   261  			return nil, nil, collisionCount, err
   262  		}
   263  	}
   264  
   265  	// attempt to find the revision that corresponds to the current revision
   266  	for i := range revisions {
   267  		if revisions[i].Name == set.Status.CurrentRevision {
   268  			currentRevision = revisions[i]
   269  			break
   270  		}
   271  	}
   272  
   273  	// if the current revision is nil we initialize the history by setting it to the update revision
   274  	if currentRevision == nil {
   275  		currentRevision = updateRevision
   276  	}
   277  
   278  	return currentRevision, updateRevision, collisionCount, nil
   279  }
   280  
   281  func slowStartBatch(initialBatchSize int, remaining int, fn func(int) (bool, error)) (int, error) {
   282  	successes := 0
   283  	j := 0
   284  	for batchSize := integer.IntMin(remaining, initialBatchSize); batchSize > 0; batchSize = integer.IntMin(integer.IntMin(2*batchSize, remaining), MaxBatchSize) {
   285  		errCh := make(chan error, batchSize)
   286  		var wg sync.WaitGroup
   287  		wg.Add(batchSize)
   288  		for i := 0; i < batchSize; i++ {
   289  			go func(k int) {
   290  				defer wg.Done()
   291  				// Ignore the first parameter - relevant for monotonic only.
   292  				if _, err := fn(k); err != nil {
   293  					errCh <- err
   294  				}
   295  			}(j)
   296  			j++
   297  		}
   298  		wg.Wait()
   299  		successes += batchSize - len(errCh)
   300  		close(errCh)
   301  		if len(errCh) > 0 {
   302  			errs := make([]error, 0)
   303  			for err := range errCh {
   304  				errs = append(errs, err)
   305  			}
   306  			return successes, utilerrors.NewAggregate(errs)
   307  		}
   308  		remaining -= batchSize
   309  	}
   310  	return successes, nil
   311  }
   312  
// replicaStatus holds the per-category Pod counts produced by computeReplicaStatus for one list of Pods.
type replicaStatus struct {
	// replicas counts the Pods that have been created.
	replicas          int32
	// readyReplicas counts the Pods that are running and ready.
	readyReplicas     int32
	// availableReplicas counts the running and ready Pods that are also available with respect to minReadySeconds.
	availableReplicas int32
	// currentReplicas counts the created, non-terminating Pods whose revision is the current revision.
	currentReplicas   int32
	// updatedReplicas counts the created, non-terminating Pods whose revision is the update revision.
	updatedReplicas   int32
}
   320  
   321  func computeReplicaStatus(pods []*v1.Pod, minReadySeconds int32, currentRevision, updateRevision *apps.ControllerRevision) replicaStatus {
   322  	status := replicaStatus{}
   323  	for _, pod := range pods {
   324  		if isCreated(pod) {
   325  			status.replicas++
   326  		}
   327  
   328  		// count the number of running and ready replicas
   329  		if isRunningAndReady(pod) {
   330  			status.readyReplicas++
   331  			// count the number of running and available replicas
   332  			if isRunningAndAvailable(pod, minReadySeconds) {
   333  				status.availableReplicas++
   334  			}
   335  
   336  		}
   337  
   338  		// count the number of current and update replicas
   339  		if isCreated(pod) && !isTerminating(pod) {
   340  			revision := getPodRevision(pod)
   341  			if revision == currentRevision.Name {
   342  				status.currentReplicas++
   343  			}
   344  			if revision == updateRevision.Name {
   345  				status.updatedReplicas++
   346  			}
   347  		}
   348  	}
   349  	return status
   350  }
   351  
   352  func updateStatus(status *apps.StatefulSetStatus, minReadySeconds int32, currentRevision, updateRevision *apps.ControllerRevision, podLists ...[]*v1.Pod) {
   353  	status.Replicas = 0
   354  	status.ReadyReplicas = 0
   355  	status.AvailableReplicas = 0
   356  	status.CurrentReplicas = 0
   357  	status.UpdatedReplicas = 0
   358  	for _, list := range podLists {
   359  		replicaStatus := computeReplicaStatus(list, minReadySeconds, currentRevision, updateRevision)
   360  		status.Replicas += replicaStatus.replicas
   361  		status.ReadyReplicas += replicaStatus.readyReplicas
   362  		status.AvailableReplicas += replicaStatus.availableReplicas
   363  		status.CurrentReplicas += replicaStatus.currentReplicas
   364  		status.UpdatedReplicas += replicaStatus.updatedReplicas
   365  	}
   366  }
   367  
// processReplica reconciles the single replica replicas[i] of set. In order, it: deletes the Pod
// and replaces the slice entry with a freshly versioned Pod object if the Pod has failed or
// succeeded; creates the Pod if it has not been created; creates missing PersistentVolumeClaims
// for a pending Pod; and finally updates the Pod when its identity, storage, or (with the
// StatefulSetAutoDeletePVC feature gate) claim retention policy does not match the set.
//
// The returned bool tells the caller to stop processing further replicas. When monotonic is
// true the method returns true as soon as it has created a Pod or finds a Pod that is terminating,
// not running and ready, or not available. Any error is returned together with true. Note that
// replicas[i] may be replaced by this method.
func (ssc *defaultStatefulSetControl) processReplica(
	ctx context.Context,
	set *apps.StatefulSet,
	currentRevision *apps.ControllerRevision,
	updateRevision *apps.ControllerRevision,
	currentSet *apps.StatefulSet,
	updateSet *apps.StatefulSet,
	monotonic bool,
	replicas []*v1.Pod,
	i int) (bool, error) {
	logger := klog.FromContext(ctx)
	// Delete and recreate pods which finished running.
	//
	// Note that pods with phase Succeeded will also trigger this event. This is
	// because final pod phase of evicted or otherwise forcibly stopped pods
	// (e.g. terminated on node reboot) is determined by the exit code of the
	// container, not by the reason for pod termination. We should restart the pod
	// regardless of the exit code.
	if isFailed(replicas[i]) || isSucceeded(replicas[i]) {
		if isFailed(replicas[i]) {
			ssc.recorder.Eventf(set, v1.EventTypeWarning, "RecreatingFailedPod",
				"StatefulSet %s/%s is recreating failed Pod %s",
				set.Namespace,
				set.Name,
				replicas[i].Name)
		} else {
			ssc.recorder.Eventf(set, v1.EventTypeNormal, "RecreatingTerminatedPod",
				"StatefulSet %s/%s is recreating terminated Pod %s",
				set.Namespace,
				set.Name,
				replicas[i].Name)
		}
		if err := ssc.podControl.DeleteStatefulPod(set, replicas[i]); err != nil {
			return true, err
		}
		// replicas is indexed relative to the set's start ordinal, so translate the index back
		// into the Pod's ordinal before building its replacement.
		replicaOrd := i + getStartOrdinal(set)
		replicas[i] = newVersionedStatefulSetPod(
			currentSet,
			updateSet,
			currentRevision.Name,
			updateRevision.Name,
			replicaOrd)
	}
	// If we find a Pod that has not been created we create the Pod
	if !isCreated(replicas[i]) {
		if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetAutoDeletePVC) {
			if isStale, err := ssc.podControl.PodClaimIsStale(set, replicas[i]); err != nil {
				return true, err
			} else if isStale {
				// If a pod has a stale PVC, no more work can be done this round.
				// err is nil on this branch, so this stops processing without reporting an error.
				return true, err
			}
		}
		if err := ssc.podControl.CreateStatefulPod(ctx, set, replicas[i]); err != nil {
			return true, err
		}
		if monotonic {
			// if the set does not allow bursting, return immediately
			return true, nil
		}
	}

	// If the Pod is in pending state then trigger PVC creation to create missing PVCs
	if isPending(replicas[i]) {
		logger.V(4).Info(
			"StatefulSet is triggering PVC creation for pending Pod",
			"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i]))
		if err := ssc.podControl.createMissingPersistentVolumeClaims(ctx, set, replicas[i]); err != nil {
			return true, err
		}
	}

	// If we find a Pod that is currently terminating, we must wait until graceful deletion
	// completes before we continue to make progress.
	if isTerminating(replicas[i]) && monotonic {
		logger.V(4).Info("StatefulSet is waiting for Pod to Terminate",
			"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i]))
		return true, nil
	}

	// If we have a Pod that has been created but is not running and ready we can not make progress.
	// We must ensure that, for each Pod, when we create it, all of its predecessors, with respect to its
	// ordinal, are Running and Ready.
	if !isRunningAndReady(replicas[i]) && monotonic {
		logger.V(4).Info("StatefulSet is waiting for Pod to be Running and Ready",
			"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i]))
		return true, nil
	}

	// If we have a Pod that has been created but is not available we can not make progress.
	// We must ensure that, for each Pod, when we create it, all of its predecessors, with respect to its
	// ordinal, are Available.
	if !isRunningAndAvailable(replicas[i], set.Spec.MinReadySeconds) && monotonic {
		logger.V(4).Info("StatefulSet is waiting for Pod to be Available",
			"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i]))
		return true, nil
	}

	// Enforce the StatefulSet invariants
	retentionMatch := true
	if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetAutoDeletePVC) {
		var err error
		retentionMatch, err = ssc.podControl.ClaimsMatchRetentionPolicy(ctx, updateSet, replicas[i])
		// An error is expected if the pod is not yet fully updated, and so return is treated as matching.
		if err != nil {
			retentionMatch = true
		}
	}

	// Nothing to update when identity, storage, and claim retention policy already match.
	if identityMatches(set, replicas[i]) && storageMatches(set, replicas[i]) && retentionMatch {
		return false, nil
	}

	// Make a deep copy so we don't mutate the shared cache
	replica := replicas[i].DeepCopy()
	if err := ssc.podControl.UpdateStatefulPod(ctx, updateSet, replica); err != nil {
		return true, err
	}

	return false, nil
}
   489  
   490  func (ssc *defaultStatefulSetControl) processCondemned(ctx context.Context, set *apps.StatefulSet, firstUnhealthyPod *v1.Pod, monotonic bool, condemned []*v1.Pod, i int) (bool, error) {
   491  	logger := klog.FromContext(ctx)
   492  	if isTerminating(condemned[i]) {
   493  		// if we are in monotonic mode, block and wait for terminating pods to expire
   494  		if monotonic {
   495  			logger.V(4).Info("StatefulSet is waiting for Pod to Terminate prior to scale down",
   496  				"statefulSet", klog.KObj(set), "pod", klog.KObj(condemned[i]))
   497  			return true, nil
   498  		}
   499  		return false, nil
   500  	}
   501  	// if we are in monotonic mode and the condemned target is not the first unhealthy Pod block
   502  	if !isRunningAndReady(condemned[i]) && monotonic && condemned[i] != firstUnhealthyPod {
   503  		logger.V(4).Info("StatefulSet is waiting for Pod to be Running and Ready prior to scale down",
   504  			"statefulSet", klog.KObj(set), "pod", klog.KObj(firstUnhealthyPod))
   505  		return true, nil
   506  	}
   507  	// if we are in monotonic mode and the condemned target is not the first unhealthy Pod, block.
   508  	if !isRunningAndAvailable(condemned[i], set.Spec.MinReadySeconds) && monotonic && condemned[i] != firstUnhealthyPod {
   509  		logger.V(4).Info("StatefulSet is waiting for Pod to be Available prior to scale down",
   510  			"statefulSet", klog.KObj(set), "pod", klog.KObj(firstUnhealthyPod))
   511  		return true, nil
   512  	}
   513  
   514  	logger.V(2).Info("Pod of StatefulSet is terminating for scale down",
   515  		"statefulSet", klog.KObj(set), "pod", klog.KObj(condemned[i]))
   516  	return true, ssc.podControl.DeleteStatefulPod(set, condemned[i])
   517  }
   518  
   519  func runForAll(pods []*v1.Pod, fn func(i int) (bool, error), monotonic bool) (bool, error) {
   520  	if monotonic {
   521  		for i := range pods {
   522  			if shouldExit, err := fn(i); shouldExit || err != nil {
   523  				return true, err
   524  			}
   525  		}
   526  	} else {
   527  		if _, err := slowStartBatch(1, len(pods), fn); err != nil {
   528  			return true, err
   529  		}
   530  	}
   531  	return false, nil
   532  }
   533  
   534  // updateStatefulSet performs the update function for a StatefulSet. This method creates, updates, and deletes Pods in
   535  // the set in order to conform the system to the target state for the set. The target state always contains
   536  // set.Spec.Replicas Pods with a Ready Condition. If the UpdateStrategy.Type for the set is
   537  // RollingUpdateStatefulSetStrategyType then all Pods in the set must be at set.Status.CurrentRevision.
   538  // If the UpdateStrategy.Type for the set is OnDeleteStatefulSetStrategyType, the target state implies nothing about
   539  // the revisions of Pods in the set. If the UpdateStrategy.Type for the set is PartitionStatefulSetStrategyType, then
   540  // all Pods with ordinal less than UpdateStrategy.Partition.Ordinal must be at Status.CurrentRevision and all other
   541  // Pods must be at Status.UpdateRevision. If the returned error is nil, the returned StatefulSetStatus is valid and the
   542  // update must be recorded. If the error is not nil, the method should be retried until successful.
   543  func (ssc *defaultStatefulSetControl) updateStatefulSet(
   544  	ctx context.Context,
   545  	set *apps.StatefulSet,
   546  	currentRevision *apps.ControllerRevision,
   547  	updateRevision *apps.ControllerRevision,
   548  	collisionCount int32,
   549  	pods []*v1.Pod) (*apps.StatefulSetStatus, error) {
   550  	logger := klog.FromContext(ctx)
   551  	// get the current and update revisions of the set.
   552  	currentSet, err := ApplyRevision(set, currentRevision)
   553  	if err != nil {
   554  		return nil, err
   555  	}
   556  	updateSet, err := ApplyRevision(set, updateRevision)
   557  	if err != nil {
   558  		return nil, err
   559  	}
   560  
   561  	// set the generation, and revisions in the returned status
   562  	status := apps.StatefulSetStatus{}
   563  	status.ObservedGeneration = set.Generation
   564  	status.CurrentRevision = currentRevision.Name
   565  	status.UpdateRevision = updateRevision.Name
   566  	status.CollisionCount = new(int32)
   567  	*status.CollisionCount = collisionCount
   568  
   569  	updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, pods)
   570  
   571  	replicaCount := int(*set.Spec.Replicas)
   572  	// slice that will contain all Pods such that getStartOrdinal(set) <= getOrdinal(pod) <= getEndOrdinal(set)
   573  	replicas := make([]*v1.Pod, replicaCount)
   574  	// slice that will contain all Pods such that getOrdinal(pod) < getStartOrdinal(set) OR getOrdinal(pod) > getEndOrdinal(set)
   575  	condemned := make([]*v1.Pod, 0, len(pods))
   576  	unhealthy := 0
   577  	var firstUnhealthyPod *v1.Pod
   578  
   579  	// First we partition pods into two lists valid replicas and condemned Pods
   580  	for _, pod := range pods {
   581  		if podInOrdinalRange(pod, set) {
   582  			// if the ordinal of the pod is within the range of the current number of replicas,
   583  			// insert it at the indirection of its ordinal
   584  			replicas[getOrdinal(pod)-getStartOrdinal(set)] = pod
   585  		} else if getOrdinal(pod) >= 0 {
   586  			// if the ordinal is valid, but not within the range add it to the condemned list
   587  			condemned = append(condemned, pod)
   588  		}
   589  		// If the ordinal could not be parsed (ord < 0), ignore the Pod.
   590  	}
   591  
   592  	// for any empty indices in the sequence [0,set.Spec.Replicas) create a new Pod at the correct revision
   593  	for ord := getStartOrdinal(set); ord <= getEndOrdinal(set); ord++ {
   594  		replicaIdx := ord - getStartOrdinal(set)
   595  		if replicas[replicaIdx] == nil {
   596  			replicas[replicaIdx] = newVersionedStatefulSetPod(
   597  				currentSet,
   598  				updateSet,
   599  				currentRevision.Name,
   600  				updateRevision.Name, ord)
   601  		}
   602  	}
   603  
   604  	// sort the condemned Pods by their ordinals
   605  	sort.Sort(descendingOrdinal(condemned))
   606  
   607  	// find the first unhealthy Pod
   608  	for i := range replicas {
   609  		if !isHealthy(replicas[i]) {
   610  			unhealthy++
   611  			if firstUnhealthyPod == nil {
   612  				firstUnhealthyPod = replicas[i]
   613  			}
   614  		}
   615  	}
   616  
   617  	// or the first unhealthy condemned Pod (condemned are sorted in descending order for ease of use)
   618  	for i := len(condemned) - 1; i >= 0; i-- {
   619  		if !isHealthy(condemned[i]) {
   620  			unhealthy++
   621  			if firstUnhealthyPod == nil {
   622  				firstUnhealthyPod = condemned[i]
   623  			}
   624  		}
   625  	}
   626  
   627  	if unhealthy > 0 {
   628  		logger.V(4).Info("StatefulSet has unhealthy Pods", "statefulSet", klog.KObj(set), "unhealthyReplicas", unhealthy, "pod", klog.KObj(firstUnhealthyPod))
   629  	}
   630  
   631  	// If the StatefulSet is being deleted, don't do anything other than updating
   632  	// status.
   633  	if set.DeletionTimestamp != nil {
   634  		return &status, nil
   635  	}
   636  
   637  	monotonic := !allowsBurst(set)
   638  
   639  	// First, process each living replica. Exit if we run into an error or something blocking in monotonic mode.
   640  	processReplicaFn := func(i int) (bool, error) {
   641  		return ssc.processReplica(ctx, set, currentRevision, updateRevision, currentSet, updateSet, monotonic, replicas, i)
   642  	}
   643  	if shouldExit, err := runForAll(replicas, processReplicaFn, monotonic); shouldExit || err != nil {
   644  		updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)
   645  		return &status, err
   646  	}
   647  
   648  	// Fix pod claims for condemned pods, if necessary.
   649  	if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetAutoDeletePVC) {
   650  		fixPodClaim := func(i int) (bool, error) {
   651  			if matchPolicy, err := ssc.podControl.ClaimsMatchRetentionPolicy(ctx, updateSet, condemned[i]); err != nil {
   652  				return true, err
   653  			} else if !matchPolicy {
   654  				if err := ssc.podControl.UpdatePodClaimForRetentionPolicy(ctx, updateSet, condemned[i]); err != nil {
   655  					return true, err
   656  				}
   657  			}
   658  			return false, nil
   659  		}
   660  		if shouldExit, err := runForAll(condemned, fixPodClaim, monotonic); shouldExit || err != nil {
   661  			updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)
   662  			return &status, err
   663  		}
   664  	}
   665  
   666  	// At this point, in monotonic mode all of the current Replicas are Running, Ready and Available,
   667  	// and we can consider termination.
   668  	// We will wait for all predecessors to be Running and Ready prior to attempting a deletion.
   669  	// We will terminate Pods in a monotonically decreasing order.
   670  	// Note that we do not resurrect Pods in this interval. Also note that scaling will take precedence over
   671  	// updates.
   672  	processCondemnedFn := func(i int) (bool, error) {
   673  		return ssc.processCondemned(ctx, set, firstUnhealthyPod, monotonic, condemned, i)
   674  	}
   675  	if shouldExit, err := runForAll(condemned, processCondemnedFn, monotonic); shouldExit || err != nil {
   676  		updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)
   677  		return &status, err
   678  	}
   679  
   680  	updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)
   681  
   682  	// for the OnDelete strategy we short circuit. Pods will be updated when they are manually deleted.
   683  	if set.Spec.UpdateStrategy.Type == apps.OnDeleteStatefulSetStrategyType {
   684  		return &status, nil
   685  	}
   686  
   687  	if utilfeature.DefaultFeatureGate.Enabled(features.MaxUnavailableStatefulSet) {
   688  		return updateStatefulSetAfterInvariantEstablished(ctx,
   689  			ssc,
   690  			set,
   691  			replicas,
   692  			updateRevision,
   693  			status,
   694  		)
   695  	}
   696  
   697  	// we compute the minimum ordinal of the target sequence for a destructive update based on the strategy.
   698  	updateMin := 0
   699  	if set.Spec.UpdateStrategy.RollingUpdate != nil {
   700  		updateMin = int(*set.Spec.UpdateStrategy.RollingUpdate.Partition)
   701  	}
   702  	// we terminate the Pod with the largest ordinal that does not match the update revision.
   703  	for target := len(replicas) - 1; target >= updateMin; target-- {
   704  
   705  		// delete the Pod if it is not already terminating and does not match the update revision.
   706  		if getPodRevision(replicas[target]) != updateRevision.Name && !isTerminating(replicas[target]) {
   707  			logger.V(2).Info("Pod of StatefulSet is terminating for update",
   708  				"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[target]))
   709  			if err := ssc.podControl.DeleteStatefulPod(set, replicas[target]); err != nil {
   710  				if !errors.IsNotFound(err) {
   711  					return &status, err
   712  				}
   713  			}
   714  			status.CurrentReplicas--
   715  			return &status, err
   716  		}
   717  
   718  		// wait for unhealthy Pods on update
   719  		if !isHealthy(replicas[target]) {
   720  			logger.V(4).Info("StatefulSet is waiting for Pod to update",
   721  				"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[target]))
   722  			return &status, nil
   723  		}
   724  
   725  	}
   726  	return &status, nil
   727  }
   728  
   729  func updateStatefulSetAfterInvariantEstablished(
   730  	ctx context.Context,
   731  	ssc *defaultStatefulSetControl,
   732  	set *apps.StatefulSet,
   733  	replicas []*v1.Pod,
   734  	updateRevision *apps.ControllerRevision,
   735  	status apps.StatefulSetStatus,
   736  ) (*apps.StatefulSetStatus, error) {
   737  
   738  	logger := klog.FromContext(ctx)
   739  	replicaCount := int(*set.Spec.Replicas)
   740  
   741  	// we compute the minimum ordinal of the target sequence for a destructive update based on the strategy.
   742  	updateMin := 0
   743  	maxUnavailable := 1
   744  	if set.Spec.UpdateStrategy.RollingUpdate != nil {
   745  		updateMin = int(*set.Spec.UpdateStrategy.RollingUpdate.Partition)
   746  
   747  		// if the feature was enabled and then later disabled, MaxUnavailable may have a value
   748  		// more than 1. Ignore the passed in value and Use maxUnavailable as 1 to enforce
   749  		// expected behavior when feature gate is not enabled.
   750  		var err error
   751  		maxUnavailable, err = getStatefulSetMaxUnavailable(set.Spec.UpdateStrategy.RollingUpdate.MaxUnavailable, replicaCount)
   752  		if err != nil {
   753  			return &status, err
   754  		}
   755  	}
   756  
   757  	// Collect all targets in the range between getStartOrdinal(set) and getEndOrdinal(set). Count any targets in that range
   758  	// that are unhealthy i.e. terminated or not running and ready as unavailable). Select the
   759  	// (MaxUnavailable - Unavailable) Pods, in order with respect to their ordinal for termination. Delete
   760  	// those pods and count the successful deletions. Update the status with the correct number of deletions.
   761  	unavailablePods := 0
   762  	for target := len(replicas) - 1; target >= 0; target-- {
   763  		if !isHealthy(replicas[target]) {
   764  			unavailablePods++
   765  		}
   766  	}
   767  
   768  	if unavailablePods >= maxUnavailable {
   769  		logger.V(2).Info("StatefulSet found unavailablePods, more than or equal to allowed maxUnavailable",
   770  			"statefulSet", klog.KObj(set),
   771  			"unavailablePods", unavailablePods,
   772  			"maxUnavailable", maxUnavailable)
   773  		return &status, nil
   774  	}
   775  
   776  	// Now we need to delete MaxUnavailable- unavailablePods
   777  	// start deleting one by one starting from the highest ordinal first
   778  	podsToDelete := maxUnavailable - unavailablePods
   779  
   780  	deletedPods := 0
   781  	for target := len(replicas) - 1; target >= updateMin && deletedPods < podsToDelete; target-- {
   782  
   783  		// delete the Pod if it is healthy and the revision doesnt match the target
   784  		if getPodRevision(replicas[target]) != updateRevision.Name && !isTerminating(replicas[target]) {
   785  			// delete the Pod if it is healthy and the revision doesnt match the target
   786  			logger.V(2).Info("StatefulSet terminating Pod for update",
   787  				"statefulSet", klog.KObj(set),
   788  				"pod", klog.KObj(replicas[target]))
   789  			if err := ssc.podControl.DeleteStatefulPod(set, replicas[target]); err != nil {
   790  				if !errors.IsNotFound(err) {
   791  					return &status, err
   792  				}
   793  			}
   794  			deletedPods++
   795  			status.CurrentReplicas--
   796  		}
   797  	}
   798  	return &status, nil
   799  }
   800  
   801  // updateStatefulSetStatus updates set's Status to be equal to status. If status indicates a complete update, it is
   802  // mutated to indicate completion. If status is semantically equivalent to set's Status no update is performed. If the
   803  // returned error is nil, the update is successful.
   804  func (ssc *defaultStatefulSetControl) updateStatefulSetStatus(
   805  	ctx context.Context,
   806  	set *apps.StatefulSet,
   807  	status *apps.StatefulSetStatus) error {
   808  	// complete any in progress rolling update if necessary
   809  	completeRollingUpdate(set, status)
   810  
   811  	// if the status is not inconsistent do not perform an update
   812  	if !inconsistentStatus(set, status) {
   813  		return nil
   814  	}
   815  
   816  	// copy set and update its status
   817  	set = set.DeepCopy()
   818  	if err := ssc.statusUpdater.UpdateStatefulSetStatus(ctx, set, status); err != nil {
   819  		return err
   820  	}
   821  
   822  	return nil
   823  }
   824  
   825  var _ StatefulSetControlInterface = &defaultStatefulSetControl{}