// Source: k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/statefulset/stateful_set_control.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package statefulset

import (
	"context"
	"sort"
	"sync"

	apps "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/controller/history"
	"k8s.io/kubernetes/pkg/features"
)

// MaxBatchSize is a realistic value for the maximum number of in-flight requests when processing in
// parallel mode. It caps the size of a single batch in slowStartBatch.
const MaxBatchSize = 500

// StatefulSetControlInterface implements the control logic for updating StatefulSets and their children Pods.
// It is implemented as an interface to allow for extensions that provide different semantics. Currently, there
// is only one implementation.
type StatefulSetControlInterface interface {
	// UpdateStatefulSet implements the control logic for Pod creation, update, and deletion, and
	// persistent volume creation, update, and deletion.
	// If an implementation returns a non-nil error, the invocation will be retried using a rate-limited strategy.
	// Implementors should sink any errors that they do not wish to trigger a retry, and they may feel free to
	// exit exceptionally at any point provided they wish the update to be re-run at a later point in time.
	UpdateStatefulSet(ctx context.Context, set *apps.StatefulSet, pods []*v1.Pod) (*apps.StatefulSetStatus, error)
	// ListRevisions returns an array of the ControllerRevisions that represent the revisions of set. If the
	// returned error is nil, the returned slice of ControllerRevisions is valid.
	ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error)
	// AdoptOrphanRevisions adopts any orphaned ControllerRevisions that match set's Selector. If all adoptions are
	// successful the returned error is nil.
	AdoptOrphanRevisions(set *apps.StatefulSet, revisions []*apps.ControllerRevision) error
}

// NewDefaultStatefulSetControl returns a new instance of the default implementation StatefulSetControlInterface that
// implements the documented semantics for StatefulSets. podControl is the PodControlInterface used to create, update,
// and delete Pods and to create PersistentVolumeClaims. statusUpdater is the StatefulSetStatusUpdaterInterface used
// to update the status of StatefulSets. You should use an instance returned from NewRealStatefulPodControl() for any
// scenario other than testing.
60 func NewDefaultStatefulSetControl( 61 podControl *StatefulPodControl, 62 statusUpdater StatefulSetStatusUpdaterInterface, 63 controllerHistory history.Interface) StatefulSetControlInterface { 64 return &defaultStatefulSetControl{podControl, statusUpdater, controllerHistory} 65 } 66 67 type defaultStatefulSetControl struct { 68 podControl *StatefulPodControl 69 statusUpdater StatefulSetStatusUpdaterInterface 70 controllerHistory history.Interface 71 } 72 73 // UpdateStatefulSet executes the core logic loop for a stateful set, applying the predictable and 74 // consistent monotonic update strategy by default - scale up proceeds in ordinal order, no new pod 75 // is created while any pod is unhealthy, and pods are terminated in descending order. The burst 76 // strategy allows these constraints to be relaxed - pods will be created and deleted eagerly and 77 // in no particular order. Clients using the burst strategy should be careful to ensure they 78 // understand the consistency implications of having unpredictable numbers of pods available. 79 func (ssc *defaultStatefulSetControl) UpdateStatefulSet(ctx context.Context, set *apps.StatefulSet, pods []*v1.Pod) (*apps.StatefulSetStatus, error) { 80 set = set.DeepCopy() // set is modified when a new revision is created in performUpdate. Make a copy now to avoid mutation errors. 
81 82 // list all revisions and sort them 83 revisions, err := ssc.ListRevisions(set) 84 if err != nil { 85 return nil, err 86 } 87 history.SortControllerRevisions(revisions) 88 89 currentRevision, updateRevision, status, err := ssc.performUpdate(ctx, set, pods, revisions) 90 if err != nil { 91 errs := []error{err} 92 if agg, ok := err.(utilerrors.Aggregate); ok { 93 errs = agg.Errors() 94 } 95 return nil, utilerrors.NewAggregate(append(errs, ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision))) 96 } 97 98 // maintain the set's revision history limit 99 return status, ssc.truncateHistory(set, pods, revisions, currentRevision, updateRevision) 100 } 101 102 func (ssc *defaultStatefulSetControl) performUpdate( 103 ctx context.Context, set *apps.StatefulSet, pods []*v1.Pod, revisions []*apps.ControllerRevision) (*apps.ControllerRevision, *apps.ControllerRevision, *apps.StatefulSetStatus, error) { 104 var currentStatus *apps.StatefulSetStatus 105 logger := klog.FromContext(ctx) 106 // get the current, and update revisions 107 currentRevision, updateRevision, collisionCount, err := ssc.getStatefulSetRevisions(set, revisions) 108 if err != nil { 109 return currentRevision, updateRevision, currentStatus, err 110 } 111 112 // perform the main update function and get the status 113 currentStatus, err = ssc.updateStatefulSet(ctx, set, currentRevision, updateRevision, collisionCount, pods) 114 if err != nil && currentStatus == nil { 115 return currentRevision, updateRevision, nil, err 116 } 117 118 // make sure to update the latest status even if there is an error with non-nil currentStatus 119 statusErr := ssc.updateStatefulSetStatus(ctx, set, currentStatus) 120 if statusErr == nil { 121 logger.V(4).Info("Updated status", "statefulSet", klog.KObj(set), 122 "replicas", currentStatus.Replicas, 123 "readyReplicas", currentStatus.ReadyReplicas, 124 "currentReplicas", currentStatus.CurrentReplicas, 125 "updatedReplicas", currentStatus.UpdatedReplicas) 126 } 
127 128 switch { 129 case err != nil && statusErr != nil: 130 logger.Error(statusErr, "Could not update status", "statefulSet", klog.KObj(set)) 131 return currentRevision, updateRevision, currentStatus, err 132 case err != nil: 133 return currentRevision, updateRevision, currentStatus, err 134 case statusErr != nil: 135 return currentRevision, updateRevision, currentStatus, statusErr 136 } 137 138 logger.V(4).Info("StatefulSet revisions", "statefulSet", klog.KObj(set), 139 "currentRevision", currentStatus.CurrentRevision, 140 "updateRevision", currentStatus.UpdateRevision) 141 142 return currentRevision, updateRevision, currentStatus, nil 143 } 144 145 func (ssc *defaultStatefulSetControl) ListRevisions(set *apps.StatefulSet) ([]*apps.ControllerRevision, error) { 146 selector, err := metav1.LabelSelectorAsSelector(set.Spec.Selector) 147 if err != nil { 148 return nil, err 149 } 150 return ssc.controllerHistory.ListControllerRevisions(set, selector) 151 } 152 153 func (ssc *defaultStatefulSetControl) AdoptOrphanRevisions( 154 set *apps.StatefulSet, 155 revisions []*apps.ControllerRevision) error { 156 for i := range revisions { 157 adopted, err := ssc.controllerHistory.AdoptControllerRevision(set, controllerKind, revisions[i]) 158 if err != nil { 159 return err 160 } 161 revisions[i] = adopted 162 } 163 return nil 164 } 165 166 // truncateHistory truncates any non-live ControllerRevisions in revisions from set's history. The UpdateRevision and 167 // CurrentRevision in set's Status are considered to be live. Any revisions associated with the Pods in pods are also 168 // considered to be live. Non-live revisions are deleted, starting with the revision with the lowest Revision, until 169 // only RevisionHistoryLimit revisions remain. If the returned error is nil the operation was successful. This method 170 // expects that revisions is sorted when supplied. 
171 func (ssc *defaultStatefulSetControl) truncateHistory( 172 set *apps.StatefulSet, 173 pods []*v1.Pod, 174 revisions []*apps.ControllerRevision, 175 current *apps.ControllerRevision, 176 update *apps.ControllerRevision) error { 177 history := make([]*apps.ControllerRevision, 0, len(revisions)) 178 // mark all live revisions 179 live := map[string]bool{} 180 if current != nil { 181 live[current.Name] = true 182 } 183 if update != nil { 184 live[update.Name] = true 185 } 186 for i := range pods { 187 live[getPodRevision(pods[i])] = true 188 } 189 // collect live revisions and historic revisions 190 for i := range revisions { 191 if !live[revisions[i].Name] { 192 history = append(history, revisions[i]) 193 } 194 } 195 historyLen := len(history) 196 historyLimit := int(*set.Spec.RevisionHistoryLimit) 197 if historyLen <= historyLimit { 198 return nil 199 } 200 // delete any non-live history to maintain the revision limit. 201 history = history[:(historyLen - historyLimit)] 202 for i := 0; i < len(history); i++ { 203 if err := ssc.controllerHistory.DeleteControllerRevision(history[i]); err != nil { 204 return err 205 } 206 } 207 return nil 208 } 209 210 // getStatefulSetRevisions returns the current and update ControllerRevisions for set. It also 211 // returns a collision count that records the number of name collisions set saw when creating 212 // new ControllerRevisions. This count is incremented on every name collision and is used in 213 // building the ControllerRevision names for name collision avoidance. This method may create 214 // a new revision, or modify the Revision of an existing revision if an update to set is detected. 215 // This method expects that revisions is sorted when supplied. 
func (ssc *defaultStatefulSetControl) getStatefulSetRevisions(
	set *apps.StatefulSet,
	revisions []*apps.ControllerRevision) (*apps.ControllerRevision, *apps.ControllerRevision, int32, error) {
	var currentRevision, updateRevision *apps.ControllerRevision

	revisionCount := len(revisions)
	history.SortControllerRevisions(revisions)

	// Use a local copy of set.Status.CollisionCount to avoid modifying set.Status directly.
	// This copy is returned so the value gets carried over to set.Status in updateStatefulSet.
	var collisionCount int32
	if set.Status.CollisionCount != nil {
		collisionCount = *set.Status.CollisionCount
	}

	// create a new revision from the current set
	updateRevision, err := newRevision(set, nextRevision(revisions), &collisionCount)
	if err != nil {
		return nil, nil, collisionCount, err
	}

	// find any equivalent revisions
	equalRevisions := history.FindEqualRevisions(revisions, updateRevision)
	equalCount := len(equalRevisions)

	if equalCount > 0 && history.EqualRevision(revisions[revisionCount-1], equalRevisions[equalCount-1]) {
		// if the equivalent revision is immediately prior the update revision has not changed
		updateRevision = revisions[revisionCount-1]
	} else if equalCount > 0 {
		// if the equivalent revision is not immediately prior we will roll back by incrementing the
		// Revision of the equivalent revision
		updateRevision, err = ssc.controllerHistory.UpdateControllerRevision(
			equalRevisions[equalCount-1],
			updateRevision.Revision)
		if err != nil {
			return nil, nil, collisionCount, err
		}
	} else {
		// if there is no equivalent revision we create a new one
		updateRevision, err = ssc.controllerHistory.CreateControllerRevision(set, updateRevision, &collisionCount)
		if err != nil {
			return nil, nil, collisionCount, err
		}
	}

	// attempt to find the revision that corresponds to the current revision
	for i := range revisions {
		if revisions[i].Name == set.Status.CurrentRevision {
			currentRevision = revisions[i]
			break
		}
	}

	// if the current revision is nil we initialize the history by setting it to the update revision
	if currentRevision == nil {
		currentRevision = updateRevision
	}

	return currentRevision, updateRevision, collisionCount, nil
}

// slowStartBatch calls fn once for each index from 0 up to (but not including) remaining, in
// successive batches. The first batch has min(remaining, initialBatchSize) calls; each following
// batch doubles in size, capped by the number of calls still remaining and by MaxBatchSize. All
// calls of a batch run concurrently and the batch is awaited before the next one starts.
//
// If any call in a batch returns an error, no further batches are started and the errors of that
// batch are returned as an aggregate. The returned int is the number of calls that completed
// without error. The bool returned by fn is ignored.
func slowStartBatch(initialBatchSize int, remaining int, fn func(int) (bool, error)) (int, error) {
	successes := 0
	// j is the next index handed to fn; it keeps increasing across batches.
	j := 0
	for batchSize := min(remaining, initialBatchSize); batchSize > 0; batchSize = min(min(2*batchSize, remaining), MaxBatchSize) {
		// Buffered to batchSize so that no goroutine blocks when reporting its error.
		errCh := make(chan error, batchSize)
		var wg sync.WaitGroup
		wg.Add(batchSize)
		for i := 0; i < batchSize; i++ {
			go func(k int) {
				defer wg.Done()
				// Ignore the first parameter - relevant for monotonic only.
				if _, err := fn(k); err != nil {
					errCh <- err
				}
			}(j)
			j++
		}
		wg.Wait()
		// All senders have finished, so len(errCh) is the number of failed calls in this batch.
		successes += batchSize - len(errCh)
		close(errCh)
		if len(errCh) > 0 {
			errs := make([]error, 0)
			for err := range errCh {
				errs = append(errs, err)
			}
			return successes, utilerrors.NewAggregate(errs)
		}
		remaining -= batchSize
	}
	return successes, nil
}

// replicaStatus holds the replica counters computed by computeReplicaStatus for one list of Pods.
type replicaStatus struct {
	replicas          int32
	readyReplicas     int32
	availableReplicas int32
	currentReplicas   int32
	updatedReplicas   int32
}

// computeReplicaStatus counts, over pods, how many are created, running and ready, running and
// available (with respect to minReadySeconds), and - among created, non-terminating Pods - how
// many carry the name of currentRevision and of updateRevision as their revision.
func computeReplicaStatus(pods []*v1.Pod, minReadySeconds int32, currentRevision, updateRevision *apps.ControllerRevision) replicaStatus {
	status := replicaStatus{}
	for _, pod := range pods {
		if isCreated(pod) {
			status.replicas++
		}

		// count the number of running and ready replicas
		if isRunningAndReady(pod) {
			status.readyReplicas++
			// count the number of running and available replicas
			if isRunningAndAvailable(pod, minReadySeconds) {
				status.availableReplicas++
			}
		}

		// count the number of current and update replicas
		if isCreated(pod) && !isTerminating(pod) {
			revision := getPodRevision(pod)
			// Both counters are incremented when the current and update revisions share a name.
			if revision == currentRevision.Name {
				status.currentReplicas++
			}
			if revision == updateRevision.Name {
				status.updatedReplicas++
			}
		}
	}
	return status
}

// updateStatus resets the replica counters of status and recomputes them as the sum of
// computeReplicaStatus over every list in podLists. Other fields of status are left untouched.
func updateStatus(status *apps.StatefulSetStatus, minReadySeconds int32, currentRevision, updateRevision *apps.ControllerRevision, podLists ...[]*v1.Pod) {
	status.Replicas = 0
	status.ReadyReplicas = 0
	status.AvailableReplicas = 0
	status.CurrentReplicas = 0
	status.UpdatedReplicas = 0
	for _, list := range podLists {
		replicaStatus := computeReplicaStatus(list, minReadySeconds, currentRevision, updateRevision)
		status.Replicas += replicaStatus.replicas
		status.ReadyReplicas += replicaStatus.readyReplicas
		status.AvailableReplicas += replicaStatus.availableReplicas
		status.CurrentReplicas += replicaStatus.currentReplicas
		status.UpdatedReplicas += replicaStatus.updatedReplicas
	}
}

// processReplica reconciles the single replica replicas[i] of set: it deletes failed or succeeded
// Pods, creates Pods that do not exist yet, triggers creation of missing PVCs for pending Pods and
// updates Pods whose identity, storage or PVC retention policy does not match.
//
// The returned bool reports that processing of further replicas should stop. It is honored by
// runForAll in monotonic mode and ignored when replicas are processed in parallel. It is always
// true when a non-nil error is returned.
func (ssc *defaultStatefulSetControl) processReplica(
	ctx context.Context,
	set *apps.StatefulSet,
	updateSet *apps.StatefulSet,
	monotonic bool,
	replicas []*v1.Pod,
	i int) (bool, error) {
	logger := klog.FromContext(ctx)

	// Note that pods with phase Succeeded will also trigger this event. This is
	// because final pod phase of evicted or otherwise forcibly stopped pods
	// (e.g. terminated on node reboot) is determined by the exit code of the
	// container, not by the reason for pod termination. We should restart the pod
	// regardless of the exit code.
	if isFailed(replicas[i]) || isSucceeded(replicas[i]) {
		if replicas[i].DeletionTimestamp == nil {
			if err := ssc.podControl.DeleteStatefulPod(set, replicas[i]); err != nil {
				return true, err
			}
		}
		// New pod should be generated on the next sync after the current pod is removed from etcd.
		return true, nil
	}
	// If we find a Pod that has not been created we create the Pod
	if !isCreated(replicas[i]) {
		if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetAutoDeletePVC) {
			if isStale, err := ssc.podControl.PodClaimIsStale(set, replicas[i]); err != nil {
				return true, err
			} else if isStale {
				// If a pod has a stale PVC, no more work can be done this round.
				// err is nil on this branch, so this stops processing without reporting an error.
				return true, err
			}
		}
		if err := ssc.podControl.CreateStatefulPod(ctx, set, replicas[i]); err != nil {
			return true, err
		}
		if monotonic {
			// if the set does not allow bursting, return immediately
			return true, nil
		}
	}

	// If the Pod is in pending state then trigger PVC creation to create missing PVCs
	if isPending(replicas[i]) {
		logger.V(4).Info(
			"StatefulSet is triggering PVC creation for pending Pod",
			"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i]))
		if err := ssc.podControl.createMissingPersistentVolumeClaims(ctx, set, replicas[i]); err != nil {
			return true, err
		}
	}

	// If we find a Pod that is currently terminating, we must wait until graceful deletion
	// completes before we continue to make progress.
	if isTerminating(replicas[i]) && monotonic {
		logger.V(4).Info("StatefulSet is waiting for Pod to Terminate",
			"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i]))
		return true, nil
	}

	// If we have a Pod that has been created but is not running and ready we can not make progress.
	// We must ensure that, for each Pod, when we create it, all of its predecessors, with respect to its
	// ordinal, are Running and Ready.
	if !isRunningAndReady(replicas[i]) && monotonic {
		logger.V(4).Info("StatefulSet is waiting for Pod to be Running and Ready",
			"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i]))
		return true, nil
	}

	// If we have a Pod that has been created but is not available we can not make progress.
	// We must ensure that, for each Pod, when we create it, all of its predecessors, with respect to its
	// ordinal, are Available.
	if !isRunningAndAvailable(replicas[i], set.Spec.MinReadySeconds) && monotonic {
		logger.V(4).Info("StatefulSet is waiting for Pod to be Available",
			"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[i]))
		return true, nil
	}

	// Enforce the StatefulSet invariants
	retentionMatch := true
	if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetAutoDeletePVC) {
		var err error
		retentionMatch, err = ssc.podControl.ClaimsMatchRetentionPolicy(ctx, updateSet, replicas[i])
		// An error is expected if the pod is not yet fully updated, and so return is treated as matching.
		if err != nil {
			retentionMatch = true
		}
	}

	// Nothing to update when identity, storage and retention policy already match.
	if identityMatches(set, replicas[i]) && storageMatches(set, replicas[i]) && retentionMatch {
		return false, nil
	}

	// Make a deep copy so we don't mutate the shared cache
	replica := replicas[i].DeepCopy()
	if err := ssc.podControl.UpdateStatefulPod(ctx, updateSet, replica); err != nil {
		return true, err
	}

	return false, nil
}

// processCondemned handles the condemned Pod condemned[i] during scale down. Unless the Pod is
// already terminating, or monotonic mode requires waiting, the Pod is deleted.
//
// The returned bool reports that processing of further condemned Pods should stop. It is honored
// by runForAll in monotonic mode and ignored when Pods are processed in parallel.
func (ssc *defaultStatefulSetControl) processCondemned(ctx context.Context, set *apps.StatefulSet, firstUnhealthyPod *v1.Pod, monotonic bool, condemned []*v1.Pod, i int) (bool, error) {
	logger := klog.FromContext(ctx)
	if isTerminating(condemned[i]) {
		// if we are in monotonic mode, block and wait for terminating pods to expire
		if monotonic {
			logger.V(4).Info("StatefulSet is waiting for Pod to Terminate prior to scale down",
				"statefulSet", klog.KObj(set), "pod", klog.KObj(condemned[i]))
			return true, nil
		}
		return false, nil
	}
	// if we are in monotonic mode and the condemned target is not running and ready, block unless
	// the target itself is the first unhealthy Pod. The Pod logged is the first unhealthy Pod.
	if !isRunningAndReady(condemned[i]) && monotonic && condemned[i] != firstUnhealthyPod {
		logger.V(4).Info("StatefulSet is waiting for Pod to be Running and Ready prior to scale down",
			"statefulSet", klog.KObj(set), "pod", klog.KObj(firstUnhealthyPod))
		return true, nil
	}
	// if we are in monotonic mode and the condemned target is not available, block unless the
	// target itself is the first unhealthy Pod.
	if !isRunningAndAvailable(condemned[i], set.Spec.MinReadySeconds) && monotonic && condemned[i] != firstUnhealthyPod {
		logger.V(4).Info("StatefulSet is waiting for Pod to be Available prior to scale down",
			"statefulSet", klog.KObj(set), "pod", klog.KObj(firstUnhealthyPod))
		return true, nil
	}

	logger.V(2).Info("Pod of StatefulSet is terminating for scale down",
		"statefulSet", klog.KObj(set), "pod", klog.KObj(condemned[i]))
	// A deletion always ends this pass, whether or not it succeeded.
	return true, ssc.podControl.DeleteStatefulPod(set, condemned[i])
}

// runForAll invokes fn for every index of pods. In monotonic mode the calls are made sequentially
// in index order and stop at the first call that asks to exit or returns an error. Otherwise the
// calls are made through slowStartBatch with an initial batch size of 1, and the bool returned by
// fn is ignored.
//
// The returned bool is true when the caller should stop its own processing, which is the case
// whenever a non-nil error is returned.
func runForAll(pods []*v1.Pod, fn func(i int) (bool, error), monotonic bool) (bool, error) {
	if monotonic {
		for i := range pods {
			if shouldExit, err := fn(i); shouldExit || err != nil {
				return true, err
			}
		}
	} else {
		if _, err := slowStartBatch(1, len(pods), fn); err != nil {
			return true, err
		}
	}
	return false, nil
}

// updateStatefulSet performs the update function for a StatefulSet. This method creates, updates, and deletes Pods in
// the set in order to conform the system to the target state for the set. The target state always contains
// set.Spec.Replicas Pods with a Ready Condition. If the UpdateStrategy.Type for the set is
// RollingUpdateStatefulSetStrategyType then all Pods in the set must be at set.Status.CurrentRevision.
// If the UpdateStrategy.Type for the set is OnDeleteStatefulSetStrategyType, the target state implies nothing about
// the revisions of Pods in the set. If the UpdateStrategy.Type for the set is PartitionStatefulSetStrategyType, then
// all Pods with ordinal less than UpdateStrategy.Partition.Ordinal must be at Status.CurrentRevision and all other
// Pods must be at Status.UpdateRevision. If the returned error is nil, the returned StatefulSetStatus is valid and the
// update must be recorded. If the error is not nil, the method should be retried until successful.
func (ssc *defaultStatefulSetControl) updateStatefulSet(
	ctx context.Context,
	set *apps.StatefulSet,
	currentRevision *apps.ControllerRevision,
	updateRevision *apps.ControllerRevision,
	collisionCount int32,
	pods []*v1.Pod) (*apps.StatefulSetStatus, error) {
	logger := klog.FromContext(ctx)
	// get the current and update revisions of the set.
	currentSet, err := ApplyRevision(set, currentRevision)
	if err != nil {
		return nil, err
	}
	updateSet, err := ApplyRevision(set, updateRevision)
	if err != nil {
		return nil, err
	}

	// set the generation, and revisions in the returned status
	status := apps.StatefulSetStatus{}
	status.ObservedGeneration = set.Generation
	status.CurrentRevision = currentRevision.Name
	status.UpdateRevision = updateRevision.Name
	status.CollisionCount = new(int32)
	*status.CollisionCount = collisionCount

	// Initial replica counters are computed from the Pods as they were passed in.
	updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, pods)

	replicaCount := int(*set.Spec.Replicas)
	// slice that will contain all Pods such that getStartOrdinal(set) <= getOrdinal(pod) <= getEndOrdinal(set)
	replicas := make([]*v1.Pod, replicaCount)
	// slice that will contain all Pods such that getOrdinal(pod) < getStartOrdinal(set) OR getOrdinal(pod) > getEndOrdinal(set)
	condemned := make([]*v1.Pod, 0, len(pods))
	unhealthy := 0
	var firstUnhealthyPod *v1.Pod

	// First we partition pods into two lists valid replicas and condemned Pods
	for _, pod := range pods {
		if podInOrdinalRange(pod, set) {
			// if the ordinal of the pod is within the range of the current number of replicas,
			// insert it at the index given by its ordinal relative to the start ordinal
			replicas[getOrdinal(pod)-getStartOrdinal(set)] = pod
		} else if getOrdinal(pod) >= 0 {
			// if the ordinal is valid, but not within the range add it to the condemned list
			condemned = append(condemned, pod)
		}
		// If the ordinal could not be parsed (ord < 0), ignore the Pod.
	}

	// for any empty indices in the sequence [getStartOrdinal(set), getEndOrdinal(set)] create a new Pod
	// object at the correct revision
	for ord := getStartOrdinal(set); ord <= getEndOrdinal(set); ord++ {
		replicaIdx := ord - getStartOrdinal(set)
		if replicas[replicaIdx] == nil {
			replicas[replicaIdx] = newVersionedStatefulSetPod(
				currentSet,
				updateSet,
				currentRevision.Name,
				updateRevision.Name, ord)
		}
	}

	// sort the condemned Pods by their ordinals
	sort.Sort(descendingOrdinal(condemned))

	// find the first unhealthy Pod
	for i := range replicas {
		if !isHealthy(replicas[i]) {
			unhealthy++
			if firstUnhealthyPod == nil {
				firstUnhealthyPod = replicas[i]
			}
		}
	}

	// or the first unhealthy condemned Pod (condemned are sorted in descending order for ease of use)
	for i := len(condemned) - 1; i >= 0; i-- {
		if !isHealthy(condemned[i]) {
			unhealthy++
			if firstUnhealthyPod == nil {
				firstUnhealthyPod = condemned[i]
			}
		}
	}

	if unhealthy > 0 {
		logger.V(4).Info("StatefulSet has unhealthy Pods", "statefulSet", klog.KObj(set), "unhealthyReplicas", unhealthy, "pod", klog.KObj(firstUnhealthyPod))
	}

	// If the StatefulSet is being deleted, don't do anything other than updating
	// status.
	if set.DeletionTimestamp != nil {
		return &status, nil
	}

	monotonic := !allowsBurst(set)

	// First, process each living replica. Exit if we run into an error or something blocking in monotonic mode.
	processReplicaFn := func(i int) (bool, error) {
		return ssc.processReplica(ctx, set, updateSet, monotonic, replicas, i)
	}
	if shouldExit, err := runForAll(replicas, processReplicaFn, monotonic); shouldExit || err != nil {
		updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)
		return &status, err
	}

	// Fix pod claims for condemned pods, if necessary.
	if utilfeature.DefaultFeatureGate.Enabled(features.StatefulSetAutoDeletePVC) {
		fixPodClaim := func(i int) (bool, error) {
			if matchPolicy, err := ssc.podControl.ClaimsMatchRetentionPolicy(ctx, updateSet, condemned[i]); err != nil {
				return true, err
			} else if !matchPolicy {
				if err := ssc.podControl.UpdatePodClaimForRetentionPolicy(ctx, updateSet, condemned[i]); err != nil {
					return true, err
				}
			}
			return false, nil
		}
		if shouldExit, err := runForAll(condemned, fixPodClaim, monotonic); shouldExit || err != nil {
			updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)
			return &status, err
		}
	}

	// At this point, in monotonic mode all of the current Replicas are Running, Ready and Available,
	// and we can consider termination.
	// We will wait for all predecessors to be Running and Ready prior to attempting a deletion.
	// We will terminate Pods in a monotonically decreasing order.
	// Note that we do not resurrect Pods in this interval. Also note that scaling will take precedence over
	// updates.
	processCondemnedFn := func(i int) (bool, error) {
		return ssc.processCondemned(ctx, set, firstUnhealthyPod, monotonic, condemned, i)
	}
	if shouldExit, err := runForAll(condemned, processCondemnedFn, monotonic); shouldExit || err != nil {
		updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)
		return &status, err
	}

	updateStatus(&status, set.Spec.MinReadySeconds, currentRevision, updateRevision, replicas, condemned)

	// for the OnDelete strategy we short circuit. Pods will be updated when they are manually deleted.
	if set.Spec.UpdateStrategy.Type == apps.OnDeleteStatefulSetStrategyType {
		return &status, nil
	}

	// With the MaxUnavailableStatefulSet feature gate enabled, the rolling update itself is delegated.
	if utilfeature.DefaultFeatureGate.Enabled(features.MaxUnavailableStatefulSet) {
		return updateStatefulSetAfterInvariantEstablished(ctx,
			ssc,
			set,
			replicas,
			updateRevision,
			status,
		)
	}

	// we compute the minimum ordinal of the target sequence for a destructive update based on the strategy.
	updateMin := 0
	if set.Spec.UpdateStrategy.RollingUpdate != nil {
		updateMin = int(*set.Spec.UpdateStrategy.RollingUpdate.Partition)
	}
	// we terminate the Pod with the largest ordinal that does not match the update revision.
	for target := len(replicas) - 1; target >= updateMin; target-- {

		// delete the Pod if it is not already terminating and does not match the update revision.
		if getPodRevision(replicas[target]) != updateRevision.Name && !isTerminating(replicas[target]) {
			logger.V(2).Info("Pod of StatefulSet is terminating for update",
				"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[target]))
			// A NotFound error is tolerated: the deletion is treated as done.
			if err := ssc.podControl.DeleteStatefulPod(set, replicas[target]); err != nil {
				if !errors.IsNotFound(err) {
					return &status, err
				}
			}
			status.CurrentReplicas--
			// err here is the function-level err, not the deletion error scoped to the if statement
			// above. It is nil at this point because both ApplyRevision calls succeeded.
			return &status, err
		}

		// wait for unhealthy Pods on update
		if !isHealthy(replicas[target]) {
			logger.V(4).Info("StatefulSet is waiting for Pod to update",
				"statefulSet", klog.KObj(set), "pod", klog.KObj(replicas[target]))
			return &status, nil
		}

	}
	return &status, nil
}

// updateStatefulSetAfterInvariantEstablished performs the rolling-update deletions for set when the
// MaxUnavailableStatefulSet feature gate is enabled (see the call site in updateStatefulSet). It
// deletes Pods in replicas, from the highest index down to the partition, that do not match
// updateRevision and are not already terminating, as long as the number of unhealthy replicas
// plus the deletions made stays below the resolved maxUnavailable. status is received by value and
// a pointer to the (possibly adjusted) copy is returned.
func updateStatefulSetAfterInvariantEstablished(
	ctx context.Context,
	ssc *defaultStatefulSetControl,
	set *apps.StatefulSet,
	replicas []*v1.Pod,
	updateRevision *apps.ControllerRevision,
	status apps.StatefulSetStatus,
) (*apps.StatefulSetStatus, error) {

	logger := klog.FromContext(ctx)
	replicaCount := int(*set.Spec.Replicas)

	// we compute the minimum ordinal of the target sequence for a destructive update based on the strategy.
	updateMin := 0
	maxUnavailable := 1
	if set.Spec.UpdateStrategy.RollingUpdate != nil {
		updateMin = int(*set.Spec.UpdateStrategy.RollingUpdate.Partition)

		// maxUnavailable stays at 1 when no RollingUpdate strategy is set; otherwise it is resolved by
		// getStatefulSetMaxUnavailable from the configured MaxUnavailable and the replica count.
		// NOTE(review): the previous comment here described ignoring the configured value when the
		// feature gate is disabled; that does not match this code path, which uses the configured
		// value - confirm the intended behavior lives in getStatefulSetMaxUnavailable.
		var err error
		maxUnavailable, err = getStatefulSetMaxUnavailable(set.Spec.UpdateStrategy.RollingUpdate.MaxUnavailable, replicaCount)
		if err != nil {
			return &status, err
		}
	}

	// Collect all targets in the range between getStartOrdinal(set) and getEndOrdinal(set). Count any targets in that
	// range that are unhealthy (i.e. terminated or not running and ready) as unavailable. Select the
	// (MaxUnavailable - Unavailable) Pods, in order with respect to their ordinal for termination. Delete
	// those pods and count the successful deletions. Update the status with the correct number of deletions.
	unavailablePods := 0
	for target := len(replicas) - 1; target >= 0; target-- {
		if !isHealthy(replicas[target]) {
			unavailablePods++
		}
	}

	// The unavailability budget is already used up, so nothing may be deleted in this pass.
	if unavailablePods >= maxUnavailable {
		logger.V(2).Info("StatefulSet found unavailablePods, more than or equal to allowed maxUnavailable",
			"statefulSet", klog.KObj(set),
			"unavailablePods", unavailablePods,
			"maxUnavailable", maxUnavailable)
		return &status, nil
	}

	// Now we need to delete (maxUnavailable - unavailablePods) Pods,
	// one by one, starting from the highest ordinal first
	podsToDelete := maxUnavailable - unavailablePods

	deletedPods := 0
	for target := len(replicas) - 1; target >= updateMin && deletedPods < podsToDelete; target-- {

		// delete the Pod if it is not already terminating and its revision does not match the update revision
		if getPodRevision(replicas[target]) != updateRevision.Name && !isTerminating(replicas[target]) {
			logger.V(2).Info("StatefulSet terminating Pod for update",
				"statefulSet", klog.KObj(set),
				"pod", klog.KObj(replicas[target]))
			// A NotFound error is tolerated and still counted as a deletion.
			if err := ssc.podControl.DeleteStatefulPod(set, replicas[target]); err != nil {
				if !errors.IsNotFound(err) {
					return &status, err
				}
			}
			deletedPods++
			status.CurrentReplicas--
		}
	}
	return &status, nil
}

// updateStatefulSetStatus updates set's Status to be equal to status. If status indicates a complete update, it is
// mutated to indicate completion. If status is semantically equivalent to set's Status no update is performed. If the
// returned error is nil, the update is successful.
780 func (ssc *defaultStatefulSetControl) updateStatefulSetStatus( 781 ctx context.Context, 782 set *apps.StatefulSet, 783 status *apps.StatefulSetStatus) error { 784 // complete any in progress rolling update if necessary 785 completeRollingUpdate(set, status) 786 787 // if the status is not inconsistent do not perform an update 788 if !inconsistentStatus(set, status) { 789 return nil 790 } 791 792 // copy set and update its status 793 set = set.DeepCopy() 794 if err := ssc.statusUpdater.UpdateStatefulSetStatus(ctx, set, status); err != nil { 795 return err 796 } 797 798 return nil 799 } 800 801 var _ StatefulSetControlInterface = &defaultStatefulSetControl{}