// Source: k8s.io/kubernetes@v1.29.3/pkg/controller/statefulset/stateful_set_control.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package statefulset

import (
	"context"
	"sort"
	"sync"

	apps "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/client-go/tools/record"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/controller/history"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/utils/integer"
)

// MaxBatchSize is a realistic value for the maximum number of in-flight requests when processing in
// parallel mode. slowStartBatch never grows a batch beyond this size.
const MaxBatchSize = 500

// StatefulSetControlInterface implements the control logic for updating StatefulSets and their children Pods. It is
// implemented as an interface to allow for extensions that provide different semantics. Currently, there is only one
// implementation.
type StatefulSetControlInterface interface {
	// UpdateStatefulSet implements the control logic for Pod creation, update, and deletion, and
	// persistent volume creation, update, and deletion.
	// If an implementation returns a non-nil error, the invocation will be retried using a rate-limited strategy.
	// Implementors should sink any errors that they do not wish to trigger a retry, and they may feel free to
	// exit exceptionally at any point provided they wish the update to be re-run at a later point in time.
	UpdateStatefulSet(ctx context.Context, set *apps.StatefulSet, pods []*v1.Pod) (*apps.StatefulSetStatus, error)
	// ListRevisions returns an array of the ControllerRevisions that represent the revisions of set. If the returned
	// error is nil, the returned slice of ControllerRevisions is valid.
	ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error)
	// AdoptOrphanRevisions adopts any orphaned ControllerRevisions that match set's Selector. If all adoptions are
	// successful the returned error is nil.
	AdoptOrphanRevisions(set *apps.StatefulSet, revisions []*apps.ControllerRevision) error
}

// NewDefaultStatefulSetControl returns a new instance of the default implementation of StatefulSetControlInterface
// that implements the documented semantics for StatefulSets. podControl is the StatefulPodControl used to create,
// update, and delete Pods and to create PersistentVolumeClaims. statusUpdater is the StatefulSetStatusUpdaterInterface
// used to update the status of StatefulSets. You should use an instance returned from NewRealStatefulPodControl() for
// any scenario other than testing.
62 func NewDefaultStatefulSetControl( 63 podControl *StatefulPodControl, 64 statusUpdater StatefulSetStatusUpdaterInterface, 65 controllerHistory history.Interface, 66 recorder record.EventRecorder) StatefulSetControlInterface { 67 return &defaultStatefulSetControl{podControl, statusUpdater, controllerHistory, recorder} 68 } 69 70 type defaultStatefulSetControl struct { 71 podControl *StatefulPodControl 72 statusUpdater StatefulSetStatusUpdaterInterface 73 controllerHistory history.Interface 74 recorder record.EventRecorder 75 } 76 77 // UpdateStatefulSet executes the core logic loop for a stateful set, applying the predictable and 78 // consistent monotonic update strategy by default - scale up proceeds in ordinal order, no new pod 79 // is created while any pod is unhealthy, and pods are terminated in descending order. The burst 80 // strategy allows these constraints to be relaxed - pods will be created and deleted eagerly and 81 // in no particular order. Clients using the burst strategy should be careful to ensure they 82 // understand the consistency implications of having unpredictable numbers of pods available. 83 func (ssc *defaultStatefulSetControl) UpdateStatefulSet(ctx context.Context, set *apps.StatefulSet, pods []*v1.Pod) (*apps.StatefulSetStatus, error) { 84 set = set.DeepCopy() // set is modified when a new revision is created in performUpdate. Make a copy now to avoid mutation errors. 
85 86 // list all revisions and sort them 87 revisions, err := ssc.ListRevisions(set) 88 if err != nil { 89 return nil, err 90 } 91 history.SortControllerRevisions(revisions) 92 93 currentRevision, updateRevision, status, err := ssc.performUpdate(ctx, set, pods, revisions) 94 if err != nil { 95 errs := []error{err} 96 if agg, ok := err.(utilerrors.Aggregate); ok { 97 errs = agg.Errors() 98 } 99 return nil, utilerrors.NewAggregate(append(errs, ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision))) 100 } 101 102 // maintain the set's revision history limit 103 return status, ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision) 104 } 105 106 func (ssc *defaultStatefulSetControl) performUpdate( 107 ctx context.Context, set *apps.StatefulSet, pods []*v1.Pod, revisions []*apps.ControllerRevision) (*apps.ControllerRevision, *apps.ControllerRevision, *apps.StatefulSetStatus, error) { 108 var currentStatus *apps.StatefulSetStatus 109 logger := klog.FromContext(ctx) 110 // get the current, and update revisions 111 currentRevision, updateRevision, collisionCount, err := ssc.getStatefulSetRevisions(set, revisions) 112 if err != nil { 113 return currentRevision, updateRevision, currentStatus, err 114 } 115 116 // perform the main update function and get the status 117 currentStatus, err = ssc.updateStatefulSet(ctx, set, currentRevision, updateRevision, collisionCount, pods) 118 if err != nil && currentStatus == nil { 119 return currentRevision, updateRevision, nil, err 120 } 121 122 // make sure to update the latest status even if there is an error with non-nil currentStatus 123 statusErr := ssc.updateStatefulSetStatus(ctx, set, currentStatus) 124 if statusErr == nil { 125 logger.V(4).Info("Updated status", "statefulSet", klog.KObj(set), 126 "replicas", currentStatus.Replicas, 127 "readyReplicas", currentStatus.ReadyReplicas, 128 "currentReplicas", currentStatus.CurrentReplicas, 129 "updatedReplicas", currentStatus.UpdatedReplicas) 
130 } 131 132 switch { 133 case err != nil && statusErr != nil: 134 logger.Error(statusErr, "Could not update status", "statefulSet", klog.KObj(set)) 135 return currentRevision, updateRevision, currentStatus, err 136 case err != nil: 137 return currentRevision, updateRevision, currentStatus, err 138 case statusErr != nil: 139 return currentRevision, updateRevision, currentStatus, statusErr 140 } 141 142 logger.V(4).Info("StatefulSet revisions", "statefulSet", klog.KObj(set), 143 "currentRevision", currentStatus.CurrentRevision, 144 "updateRevision", currentStatus.UpdateRevision) 145 146 return currentRevision, updateRevision, currentStatus, nil 147 } 148 149 func (ssc *defaultStatefulSetControl) ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error) { 150 selector, err := metav1.LabelSelectorAsSelector(set.Spec.Selector) 151 if err != nil { 152 return nil, err 153 } 154 return ssc.controllerHistory.ListControllerRevisions(set, selector) 155 } 156 157 func (ssc *defaultStatefulSetControl) AdoptOrphanRevisions( 158 set *apps.StatefulSet, 159 revisions []*apps.ControllerRevision) error { 160 for i := range revisions { 161 adopted, err := ssc.controllerHistory.AdoptControllerRevision(set, controllerKind, revisions[i]) 162 if err != nil { 163 return err 164 } 165 revisions[i] = adopted 166 } 167 return nil 168 } 169 170 // truncateHistory truncates any non-live ControllerRevisions in revisions from set's history. The UpdateRevision and 171 // CurrentRevision in set's Status are considered to be live. Any revisions associated with the Pods in pods are also 172 // considered to be live. Non-live revisions are deleted, starting with the revision with the lowest Revision, until 173 // only RevisionHistoryLimit revisions remain. If the returned error is nil the operation was successful. This method 174 // expects that revisions is sorted when supplied. 
175 func (ssc *defaultStatefulSetControl) truncateHistory( 176 set *apps.StatefulSet, 177 pods []*v1.Pod, 178 revisions []*apps.ControllerRevision, 179 current *apps.ControllerRevision, 180 update *apps.ControllerRevision) error { 181 history := make([]*apps.ControllerRevision, 0, len(revisions)) 182 // mark all live revisions 183 live := map[string]bool{} 184 if current != nil { 185 live[current.Name] = true 186 } 187 if update != nil { 188 live[update.Name] = true 189 } 190 for i := range pods { 191 live[getPodRevision(pods[i])] = true 192 } 193 // collect live revisions and historic revisions 194 for i := range revisions { 195 if !live[revisions[i].Name] { 196 history = append(history, revisions[i]) 197 } 198 } 199 historyLen := len(history) 200 historyLimit := int(*set.Spec.RevisionHistoryLimit) 201 if historyLen <= historyLimit { 202 return nil 203 } 204 // delete any non-live history to maintain the revision limit. 205 history = history[:(historyLen - historyLimit)] 206 for i := 0; i < len(history); i++ { 207 if err := ssc.controllerHistory.DeleteControllerRevision(history[i]); err != nil { 208 return err 209 } 210 } 211 return nil 212 } 213 214 // getStatefulSetRevisions returns the current and update ControllerRevisions for set. It also 215 // returns a collision count that records the number of name collisions set saw when creating 216 // new ControllerRevisions. This count is incremented on every name collision and is used in 217 // building the ControllerRevision names for name collision avoidance. This method may create 218 // a new revision, or modify the Revision of an existing revision if an update to set is detected. 219 // This method expects that revisions is sorted when supplied. 
220 func (ssc *defaultStatefulSetControl) getStatefulSetRevisions( 221 set *apps.StatefulSet, 222 revisions []*apps.ControllerRevision) (*apps.ControllerRevision, *apps.ControllerRevision, int32, error) { 223 var currentRevision, updateRevision *apps.ControllerRevision 224 225 revisionCount := len(revisions) 226 history.SortControllerRevisions(revisions) 227 228 // Use a local copy of set.Status.CollisionCount to avoid modifying set.Status directly. 229 // This copy is returned so the value gets carried over to set.Status in updateStatefulSet. 230 var collisionCount int32 231 if set.Status.CollisionCount != nil { 232 collisionCount = *set.Status.CollisionCount 233 } 234 235 // create a new revision from the current set 236 updateRevision, err := newRevision(set, nextRevision(revisions), &collisionCount) 237 if err != nil { 238 return nil, nil, collisionCount, err 239 } 240 241 // find any equivalent revisions 242 equalRevisions := history.FindEqualRevisions(revisions, updateRevision) 243 equalCount := len(equalRevisions) 244 245 if equalCount > 0 && history.EqualRevision(revisions[revisionCount-1], equalRevisions[equalCount-1]) { 246 // if the equivalent revision is immediately prior the update revision has not changed 247 updateRevision = revisions[revisionCount-1] 248 } else if equalCount > 0 { 249 // if the equivalent revision is not immediately prior we will roll back by incrementing the 250 // Revision of the equivalent revision 251 updateRevision, err = ssc.controllerHistory.UpdateControllerRevision( 252 equalRevisions[equalCount-1], 253 updateRevision.Revision) 254 if err != nil { 255 return nil, nil, collisionCount, err 256 } 257 } else { 258 //if there is no equivalent revision we create a new one 259 updateRevision, err = ssc.controllerHistory.CreateControllerRevision(set, updateRevision, &collisionCount) 260 if err != nil { 261 return nil, nil, collisionCount, err 262 } 263 } 264 265 // attempt to find the revision that corresponds to the current 
revision 266 for i := range revisions { 267 if revisions[i].Name == set.Status.CurrentRevision { 268 currentRevision = revisions[i] 269 break 270 } 271 } 272 273 // if the current revision is nil we initialize the history by setting it to the update revision 274 if currentRevision == nil { 275 currentRevision = updateRevision 276 } 277 278 return currentRevision, updateRevision, collisionCount, nil 279 } 280 281 func slowStartBatch(initialBatchSize int, remaining int, fn func(int) (bool, error)) (int, error) { 282 successes := 0 283 j := 0 284 for batchSize := integer.IntMin(remaining, initialBatchSize); batchSize > 0; batchSize = integer.IntMin(integer.IntMin(2*batchSize, remaining), MaxBatchSize) { 285 errCh := make(chan error, batchSize) 286 var wg sync.WaitGroup 287 wg.Add(batchSize) 288 for i := 0; i < batchSize; i++ { 289 go func(k int) { 290 defer wg.Done() 291 // Ignore the first parameter - relevant for monotonic only. 292 if _, err := fn(k); err != nil { 293 errCh <- err 294 } 295 }(j) 296 j++ 297 } 298 wg.Wait() 299 successes += batchSize - len(errCh) 300 close(errCh) 301 if len(errCh) > 0 { 302 errs := make([]error, 0) 303 for err := range errCh { 304 errs = append(errs, err) 305 } 306 return successes, utilerrors.NewAggregate(errs) 307 } 308 remaining -= batchSize 309 } 310 return successes, nil 311 } 312 313 type replicaStatus struct { 314 replicas int32 315 readyReplicas int32 316 availableReplicas int32 317 currentReplicas int32 318 updatedReplicas int32 319 } 320 321 func computeReplicaStatus(pods []*v1.Pod, minReadySeconds int32, currentRevision, updateRevision *apps.ControllerRevision) replicaStatus { 322 status := replicaStatus{} 323 for _, pod := range pods { 324 if isCreated(pod) { 325 status.replicas++ 326 } 327 328 // count the number of running and ready replicas 329 if isRunningAndReady(pod) { 330 status.readyReplicas++ 331 // count the number of running and available replicas 332 if isRunningAndAvailable(pod, minReadySeconds) { 333 
status.availableReplicas++ 334 } 335 336 } 337 338 // count the number of current and update replicas 339 if isCreated(pod) && !isTerminating(pod) { 340 revision := getPodRevision(pod) 341 if revision == currentRevision.Name { 342 status.currentReplicas++ 343 } 344 if revision == updateRevision.Name { 345 status.updatedReplicas++ 346 } 347 } 348 } 349 return status 350 } 351 352 func updateStatus(status *apps.StatefulSetStatus, minReadySeconds int32, currentRevision, updateRevision *apps.ControllerRevision, podLists ...[]*v1.Pod) { 353 status.Replicas = 0 354 status.ReadyReplicas = 0 355 status.AvailableReplicas = 0 356 status.CurrentReplicas = 0 357 status.UpdatedReplicas = 0 358 for _, list := range podLists { 359 replicaStatus := computeReplicaStatus(list, minReadySeconds, currentRevision, updateRevision) 360 status.Replicas += replicaStatus.replicas 361 status.ReadyReplicas += replicaStatus.readyReplicas 362 status.AvailableReplicas += replicaStatus.availableReplicas 363 status.CurrentReplicas += replicaStatus.currentReplicas 364 status.UpdatedReplicas += replicaStatus.updatedReplicas 365 } 366 } 367 368 func (ssc *defaultStatefulSetControl) processReplica( 369 ctx context.Context, 370 set *apps.StatefulSet, 371 currentRevision *apps.ControllerRevision, 372 updateRevision *apps.ControllerRevision, 373 currentSet *apps.StatefulSet, 374 updateSet *apps.StatefulSet, 375 monotonic bool, 376 replicas []*v1.Pod, 377 i int) (bool, error) { 378 logger := klog.FromContext(ctx) 379 // Delete and recreate pods which finished running. 380 // 381 // Note that pods with phase Succeeded will also trigger this event. This is 382 // because final pod phase of evicted or otherwise forcibly stopped pods 383 // (e.g. terminated on node reboot) is determined by the exit code of the 384 // container, not by the reason for pod termination. We should restart the pod 385 // regardless of the exit code. 
386 if isFailed(replicas[i]) || isSucceeded(replicas[i]) { 387 if isFailed(replicas[i]) { 388 ssc.recorder.Eventf(set, v1.EventTypeWarning, "RecreatingFailedPod", 389 "StatefulSet %s/%s is recreating failed Pod %s", 390 set.Namespace, 391 set.Name, 392 replicas[i].Name) 393 } else { 394 ssc.recorder.Eventf(set, v1.EventTypeNormal, "RecreatingTerminatedPod", 395 "StatefulSet %s/%s is recreating terminated Pod %s", 396 set.Namespace, 397 set.Name, 398 replicas[i].Name) 399 } 400 if err := ssc.podControl.DeleteStatefulPod(set, replicas[i]); err != nil { 401 return true, err 402 } 403 replicaOrd := i + getStartOrdinal(set) 404 replicas[i] = newVersionedStatefulSetPod( 405 currentSet, 406 updateSet, 407 currentRevision.Name, 408 updateRevision.Name, 409 replicaOrd) 410 } 411 // If we find a Pod that has not been created we create the Pod 412 if !isCreated(replicas[i]) { 413 if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetAutoDeletePVC) { 414 if isStale, err := ssc.podControl.PodClaimIsStale(set, replicas[i]); err != nil { 415 return true, err 416 } else if isStale { 417 // If a pod has a stale PVC, no more work can be done this round. 418 return true, err 419 } 420 } 421 if err := ssc.podControl.CreateStatefulPod(ctx, set, replicas[i]); err != nil { 422 return true, err 423 } 424 if monotonic { 425 // if the set does not allow bursting, return immediately 426 return true, nil 427 } 428 } 429 430 // If the Pod is in pending state then trigger PVC creation to create missing PVCs 431 if isPending(replicas[i]) { 432 logger.V(4).Info( 433 "StatefulSet is triggering PVC creation for pending Pod", 434 "statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i])) 435 if err := ssc.podControl.createMissingPersistentVolumeClaims(ctx, set, replicas[i]); err != nil { 436 return true, err 437 } 438 } 439 440 // If we find a Pod that is currently terminating, we must wait until graceful deletion 441 // completes before we continue to make progress. 
442 if isTerminating(replicas[i]) && monotonic { 443 logger.V(4).Info("StatefulSet is waiting for Pod to Terminate", 444 "statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i])) 445 return true, nil 446 } 447 448 // If we have a Pod that has been created but is not running and ready we can not make progress. 449 // We must ensure that all for each Pod, when we create it, all of its predecessors, with respect to its 450 // ordinal, are Running and Ready. 451 if !isRunningAndReady(replicas[i]) && monotonic { 452 logger.V(4).Info("StatefulSet is waiting for Pod to be Running and Ready", 453 "statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i])) 454 return true, nil 455 } 456 457 // If we have a Pod that has been created but is not available we can not make progress. 458 // We must ensure that all for each Pod, when we create it, all of its predecessors, with respect to its 459 // ordinal, are Available. 460 if !isRunningAndAvailable(replicas[i], set.Spec.MinReadySeconds) && monotonic { 461 logger.V(4).Info("StatefulSet is waiting for Pod to be Available", 462 "statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i])) 463 return true, nil 464 } 465 466 // Enforce the StatefulSet invariants 467 retentionMatch := true 468 if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetAutoDeletePVC) { 469 var err error 470 retentionMatch, err = ssc.podControl.ClaimsMatchRetentionPolicy(ctx, updateSet, replicas[i]) 471 // An error is expected if the pod is not yet fully updated, and so return is treated as matching. 
472 if err != nil { 473 retentionMatch = true 474 } 475 } 476 477 if identityMatches(set, replicas[i]) && storageMatches(set, replicas[i]) && retentionMatch { 478 return false, nil 479 } 480 481 // Make a deep copy so we don't mutate the shared cache 482 replica := replicas[i].DeepCopy() 483 if err := ssc.podControl.UpdateStatefulPod(ctx, updateSet, replica); err != nil { 484 return true, err 485 } 486 487 return false, nil 488 } 489 490 func (ssc *defaultStatefulSetControl) processCondemned(ctx context.Context, set *apps.StatefulSet, firstUnhealthyPod *v1.Pod, monotonic bool, condemned []*v1.Pod, i int) (bool, error) { 491 logger := klog.FromContext(ctx) 492 if isTerminating(condemned[i]) { 493 // if we are in monotonic mode, block and wait for terminating pods to expire 494 if monotonic { 495 logger.V(4).Info("StatefulSet is waiting for Pod to Terminate prior to scale down", 496 "statefulSet", klog.KObj(set), "pod", klog.KObj(condemned[i])) 497 return true, nil 498 } 499 return false, nil 500 } 501 // if we are in monotonic mode and the condemned target is not the first unhealthy Pod block 502 if !isRunningAndReady(condemned[i]) && monotonic && condemned[i] != firstUnhealthyPod { 503 logger.V(4).Info("StatefulSet is waiting for Pod to be Running and Ready prior to scale down", 504 "statefulSet", klog.KObj(set), "pod", klog.KObj(firstUnhealthyPod)) 505 return true, nil 506 } 507 // if we are in monotonic mode and the condemned target is not the first unhealthy Pod, block. 
508 if !isRunningAndAvailable(condemned[i], set.Spec.MinReadySeconds) && monotonic && condemned[i] != firstUnhealthyPod { 509 logger.V(4).Info("StatefulSet is waiting for Pod to be Available prior to scale down", 510 "statefulSet", klog.KObj(set), "pod", klog.KObj(firstUnhealthyPod)) 511 return true, nil 512 } 513 514 logger.V(2).Info("Pod of StatefulSet is terminating for scale down", 515 "statefulSet", klog.KObj(set), "pod", klog.KObj(condemned[i])) 516 return true, ssc.podControl.DeleteStatefulPod(set, condemned[i]) 517 } 518 519 func runForAll(pods []*v1.Pod, fn func(i int) (bool, error), monotonic bool) (bool, error) { 520 if monotonic { 521 for i := range pods { 522 if shouldExit, err := fn(i); shouldExit || err != nil { 523 return true, err 524 } 525 } 526 } else { 527 if _, err := slowStartBatch(1, len(pods), fn); err != nil { 528 return true, err 529 } 530 } 531 return false, nil 532 } 533 534 // updateStatefulSet performs the update function for a StatefulSet. This method creates, updates, and deletes Pods in 535 // the set in order to conform the system to the target state for the set. The target state always contains 536 // set.Spec.Replicas Pods with a Ready Condition. If the UpdateStrategy.Type for the set is 537 // RollingUpdateStatefulSetStrategyType then all Pods in the set must be at set.Status.CurrentRevision. 538 // If the UpdateStrategy.Type for the set is OnDeleteStatefulSetStrategyType, the target state implies nothing about 539 // the revisions of Pods in the set. If the UpdateStrategy.Type for the set is PartitionStatefulSetStrategyType, then 540 // all Pods with ordinal less than UpdateStrategy.Partition.Ordinal must be at Status.CurrentRevision and all other 541 // Pods must be at Status.UpdateRevision. If the returned error is nil, the returned StatefulSetStatus is valid and the 542 // update must be recorded. If the error is not nil, the method should be retried until successful. 
func (ssc *defaultStatefulSetControl) updateStatefulSet(
	ctx context.Context,
	set *apps.StatefulSet,
	currentRevision *apps.ControllerRevision,
	updateRevision *apps.ControllerRevision,
	collisionCount int32,
	pods []*v1.Pod) (*apps.StatefulSetStatus, error) {
	logger := klog.FromContext(ctx)
	// get the current and update revisions of the set.
	currentSet, err := ApplyRevision(set, currentRevision)
	if err != nil {
		return nil, err
	}
	updateSet, err := ApplyRevision(set, updateRevision)
	if err != nil {
		return nil, err
	}

	// set the generation, and revisions in the returned status
	status := apps.StatefulSetStatus{}
	status.ObservedGeneration = set.Generation
	status.CurrentRevision = currentRevision.Name
	status.UpdateRevision = updateRevision.Name
	status.CollisionCount = new(int32)
	*status.CollisionCount = collisionCount

	// initial replica counters, computed from the Pods as they were handed in
	updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, pods)

	replicaCount := int(*set.Spec.Replicas)
	// slice that will contain all Pods such that getStartOrdinal(set) <= getOrdinal(pod) <= getEndOrdinal(set)
	replicas := make([]*v1.Pod, replicaCount)
	// slice that will contain all Pods such that getOrdinal(pod) < getStartOrdinal(set) OR getOrdinal(pod) > getEndOrdinal(set)
	condemned := make([]*v1.Pod, 0, len(pods))
	unhealthy := 0
	var firstUnhealthyPod *v1.Pod

	// First we partition pods into two lists: valid replicas and condemned Pods
	for _, pod := range pods {
		if podInOrdinalRange(pod, set) {
			// if the ordinal of the pod is within the range of the current number of replicas,
			// insert it at the index derived from its ordinal (ordinal minus the start ordinal)
			replicas[getOrdinal(pod)-getStartOrdinal(set)] = pod
		} else if getOrdinal(pod) >= 0 {
			// if the ordinal is valid, but not within the range add it to the condemned list
			condemned = append(condemned, pod)
		}
		// If the ordinal could not be parsed (ord < 0), ignore the Pod.
	}

	// for any empty slot in the ordinal range [getStartOrdinal(set), getEndOrdinal(set)] create a new Pod
	// object at the correct revision (the Pod itself is created later, by processReplica)
	for ord := getStartOrdinal(set); ord <= getEndOrdinal(set); ord++ {
		replicaIdx := ord - getStartOrdinal(set)
		if replicas[replicaIdx] == nil {
			replicas[replicaIdx] = newVersionedStatefulSetPod(
				currentSet,
				updateSet,
				currentRevision.Name,
				updateRevision.Name, ord)
		}
	}

	// sort the condemned Pods by their ordinals
	sort.Sort(descendingOrdinal(condemned))

	// find the first unhealthy Pod
	for i := range replicas {
		if !isHealthy(replicas[i]) {
			unhealthy++
			if firstUnhealthyPod == nil {
				firstUnhealthyPod = replicas[i]
			}
		}
	}

	// or the first unhealthy condemned Pod (condemned are sorted in descending order for ease of use,
	// so they are walked from the end to visit the lowest ordinal first)
	for i := len(condemned) - 1; i >= 0; i-- {
		if !isHealthy(condemned[i]) {
			unhealthy++
			if firstUnhealthyPod == nil {
				firstUnhealthyPod = condemned[i]
			}
		}
	}

	if unhealthy > 0 {
		logger.V(4).Info("StatefulSet has unhealthy Pods", "statefulSet", klog.KObj(set), "unhealthyReplicas", unhealthy, "pod", klog.KObj(firstUnhealthyPod))
	}

	// If the StatefulSet is being deleted, don't do anything other than updating
	// status.
	if set.DeletionTimestamp != nil {
		return &status, nil
	}

	monotonic := !allowsBurst(set)

	// First, process each living replica. Exit if we run into an error or something blocking in monotonic mode.
	processReplicaFn := func(i int) (bool, error) {
		return ssc.processReplica(ctx, set, currentRevision, updateRevision, currentSet, updateSet, monotonic, replicas, i)
	}
	if shouldExit, err := runForAll(replicas, processReplicaFn, monotonic); shouldExit || err != nil {
		// recompute the counters over replicas and condemned before returning the status
		updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)
		return &status, err
	}

	// Fix pod claims for condemned pods, if necessary.
	if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetAutoDeletePVC) {
		fixPodClaim := func(i int) (bool, error) {
			if matchPolicy, err := ssc.podControl.ClaimsMatchRetentionPolicy(ctx, updateSet, condemned[i]); err != nil {
				return true, err
			} else if !matchPolicy {
				if err := ssc.podControl.UpdatePodClaimForRetentionPolicy(ctx, updateSet, condemned[i]); err != nil {
					return true, err
				}
			}
			return false, nil
		}
		if shouldExit, err := runForAll(condemned, fixPodClaim, monotonic); shouldExit || err != nil {
			updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)
			return &status, err
		}
	}

	// At this point, in monotonic mode all of the current Replicas are Running, Ready and Available,
	// and we can consider termination.
	// We will wait for all predecessors to be Running and Ready prior to attempting a deletion.
	// We will terminate Pods in a monotonically decreasing order.
	// Note that we do not resurrect Pods in this interval. Also note that scaling will take precedence over
	// updates.
	processCondemnedFn := func(i int) (bool, error) {
		return ssc.processCondemned(ctx, set, firstUnhealthyPod, monotonic, condemned, i)
	}
	if shouldExit, err := runForAll(condemned, processCondemnedFn, monotonic); shouldExit || err != nil {
		updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)
		return &status, err
	}

	updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)

	// for the OnDelete strategy we short circuit. Pods will be updated when they are manually deleted.
	if set.Spec.UpdateStrategy.Type == apps.OnDeleteStatefulSetStrategyType {
		return &status, nil
	}

	// with the MaxUnavailableStatefulSet feature gate enabled the rolling update is delegated to
	// updateStatefulSetAfterInvariantEstablished, which receives a copy of status
	if utilfeature.DefaultFeatureGate.Enabled(features.MaxUnavailableStatefulSet) {
		return updateStatefulSetAfterInvariantEstablished(ctx,
			ssc,
			set,
			replicas,
			updateRevision,
			status,
		)
	}

	// we compute the minimum ordinal of the target sequence for a destructive update based on the strategy.
	// NOTE(review): updateMin is taken from Partition as-is while target below is an index into replicas,
	// i.e. an ordinal minus getStartOrdinal(set) - confirm this is intended for a non-zero start ordinal.
	updateMin := 0
	if set.Spec.UpdateStrategy.RollingUpdate != nil {
		updateMin = int(*set.Spec.UpdateStrategy.RollingUpdate.Partition)
	}
	// we terminate the Pod with the largest ordinal that does not match the update revision.
	for target := len(replicas) - 1; target >= updateMin; target-- {

		// delete the Pod if it is not already terminating and does not match the update revision.
		if getPodRevision(replicas[target]) != updateRevision.Name && !isTerminating(replicas[target]) {
			logger.V(2).Info("Pod of StatefulSet is terminating for update",
				"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[target]))
			if err := ssc.podControl.DeleteStatefulPod(set, replicas[target]); err != nil {
				// a Pod that is already gone counts as deleted; any other error aborts the pass
				if !errors.IsNotFound(err) {
					return &status, err
				}
			}
			status.CurrentReplicas--
			// err here is the function-scoped err from the ApplyRevision calls above, which is nil at
			// this point; the DeleteStatefulPod error is scoped to the if statement above. At most one
			// Pod is deleted per pass.
			return &status, err
		}

		// wait for unhealthy Pods on update
		if !isHealthy(replicas[target]) {
			logger.V(4).Info("StatefulSet is waiting for Pod to update",
				"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[target]))
			return &status, nil
		}

	}
	return &status, nil
}

// updateStatefulSetAfterInvariantEstablished performs the rolling-update step of updateStatefulSet when the
// MaxUnavailableStatefulSet feature gate is enabled. It counts the unhealthy Pods in replicas and, while that count
// is below the allowed maximum, deletes Pods that are not at updateRevision and not already terminating, from the
// highest index down to the partition, until the remaining budget is used. status is received by value; a pointer
// to that copy, with CurrentReplicas decremented once per deletion, is returned.
func updateStatefulSetAfterInvariantEstablished(
	ctx context.Context,
	ssc *defaultStatefulSetControl,
	set *apps.StatefulSet,
	replicas []*v1.Pod,
	updateRevision *apps.ControllerRevision,
	status apps.StatefulSetStatus,
) (*apps.StatefulSetStatus, error) {

	logger := klog.FromContext(ctx)
	replicaCount := int(*set.Spec.Replicas)

	// we compute the minimum ordinal of the target sequence for a destructive update based on the strategy.
	// NOTE(review): as in updateStatefulSet, updateMin is compared against an index into replicas -
	// confirm the behavior for a non-zero start ordinal.
	updateMin := 0
	maxUnavailable := 1
	if set.Spec.UpdateStrategy.RollingUpdate != nil {
		updateMin = int(*set.Spec.UpdateStrategy.RollingUpdate.Partition)

		// maxUnavailable stays at 1 unless a RollingUpdate strategy is present, in which case it is
		// resolved from the strategy's MaxUnavailable and the replica count.
		// NOTE(review): the previous comment here described the feature gate being disabled, but the
		// only caller in this file invokes this function with the gate enabled - confirm the handling
		// of the disabled case inside getStatefulSetMaxUnavailable.
		var err error
		maxUnavailable, err = getStatefulSetMaxUnavailable(set.Spec.UpdateStrategy.RollingUpdate.MaxUnavailable, replicaCount)
		if err != nil {
			return &status, err
		}
	}

	// Collect all targets in the range between getStartOrdinal(set) and getEndOrdinal(set). Count any targets in that range
	// that are unhealthy (i.e. terminated or not running and ready) as unavailable. Select the
	// (MaxUnavailable - Unavailable) Pods, in order with respect to their ordinal for termination. Delete
	// those pods and count the successful deletions. Update the status with the correct number of deletions.
	unavailablePods := 0
	for target := len(replicas) - 1; target >= 0; target-- {
		if !isHealthy(replicas[target]) {
			unavailablePods++
		}
	}

	// the unavailability budget is already used up; nothing may be deleted in this pass
	if unavailablePods >= maxUnavailable {
		logger.V(2).Info("StatefulSet found unavailablePods, more than or equal to allowed maxUnavailable",
			"statefulSet", klog.KObj(set),
			"unavailablePods", unavailablePods,
			"maxUnavailable", maxUnavailable)
		return &status, nil
	}

	// Now we need to delete MaxUnavailable - unavailablePods Pods,
	// one by one, starting from the highest ordinal first
	podsToDelete := maxUnavailable - unavailablePods

	deletedPods := 0
	for target := len(replicas) - 1; target >= updateMin && deletedPods < podsToDelete; target-- {

		// delete the Pod if it is not already terminating and its revision doesn't match the target
		if getPodRevision(replicas[target]) != updateRevision.Name && !isTerminating(replicas[target]) {
			logger.V(2).Info("StatefulSet terminating Pod for update",
				"statefulSet", klog.KObj(set),
				"pod", klog.KObj(replicas[target]))
			if err := ssc.podControl.DeleteStatefulPod(set, replicas[target]); err != nil {
				// a Pod that is already gone counts as deleted; any other error aborts the pass
				if !errors.IsNotFound(err) {
					return &status, err
				}
			}
			deletedPods++
			status.CurrentReplicas--
		}
	}
	return &status, nil
}

// updateStatefulSetStatus updates set's Status to be equal to status. If status indicates a complete update, it is
// mutated to indicate completion. If status is semantically equivalent to set's Status no update is performed. If the
// returned error is nil, the update is successful.
804 func (ssc *defaultStatefulSetControl) updateStatefulSetStatus( 805 ctx context.Context, 806 set *apps.StatefulSet, 807 status *apps.StatefulSetStatus) error { 808 // complete any in progress rolling update if necessary 809 completeRollingUpdate(set, status) 810 811 // if the status is not inconsistent do not perform an update 812 if !inconsistentStatus(set, status) { 813 return nil 814 } 815 816 // copy set and update its status 817 set = set.DeepCopy() 818 if err := ssc.statusUpdater.UpdateStatefulSetStatus(ctx, set, status); err != nil { 819 return err 820 } 821 822 return nil 823 } 824 825 var _ StatefulSetControlInterface = &defaultStatefulSetControl{}