sigs.k8s.io/cluster-api@v1.6.3/internal/controllers/machinedeployment/machinedeployment_rolling.go

/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package machinedeployment

import (
	"context"
	"sort"

	"github.com/pkg/errors"
	"k8s.io/utils/integer"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/internal/controllers/machinedeployment/mdutil"
)

// rolloutRolling implements the logic for rolling out a new MachineSet.
func (r *Reconciler) rolloutRolling(ctx context.Context, md *clusterv1.MachineDeployment, msList []*clusterv1.MachineSet) error {
	newMS, oldMSs, err := r.getAllMachineSetsAndSyncRevision(ctx, md, msList, true)
	if err != nil {
		return err
	}

	// newMS can be nil in case there is already a MachineSet associated with this deployment,
	// but there are only either changes in annotations or MinReadySeconds. Or in other words,
	// this can be nil if there are changes, but no replacement of existing machines is needed.
	if newMS == nil {
		return nil
	}

	allMSs := append(oldMSs, newMS)

	// Scale up, if we can.
	if err := r.reconcileNewMachineSet(ctx, allMSs, newMS, md); err != nil {
		return err
	}

	if err := r.syncDeploymentStatus(allMSs, newMS, md); err != nil {
		return err
	}

	// Scale down, if we can.
	if err := r.reconcileOldMachineSets(ctx, allMSs, oldMSs, newMS, md); err != nil {
		return err
	}

	if err := r.syncDeploymentStatus(allMSs, newMS, md); err != nil {
		return err
	}

	if mdutil.DeploymentComplete(md, &md.Status) {
		if err := r.cleanupDeployment(ctx, oldMSs, md); err != nil {
			return err
		}
	}

	return nil
}

func (r *Reconciler) reconcileNewMachineSet(ctx context.Context, allMSs []*clusterv1.MachineSet, newMS *clusterv1.MachineSet, deployment *clusterv1.MachineDeployment) error {
	if deployment.Spec.Replicas == nil {
		return errors.Errorf("spec.replicas for MachineDeployment %v is nil, this is unexpected", client.ObjectKeyFromObject(deployment))
	}

	if newMS.Spec.Replicas == nil {
		return errors.Errorf("spec.replicas for MachineSet %v is nil, this is unexpected", client.ObjectKeyFromObject(newMS))
	}

	if *(newMS.Spec.Replicas) == *(deployment.Spec.Replicas) {
		// Scaling not required.
		return nil
	}

	if *(newMS.Spec.Replicas) > *(deployment.Spec.Replicas) {
		// Scale down.
		return r.scaleMachineSet(ctx, newMS, *(deployment.Spec.Replicas), deployment)
	}

	newReplicasCount, err := mdutil.NewMSNewReplicas(deployment, allMSs, *newMS.Spec.Replicas)
	if err != nil {
		return err
	}
	return r.scaleMachineSet(ctx, newMS, newReplicasCount, deployment)
}
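
// The surge math performed by mdutil.NewMSNewReplicas above is easiest to see
// with concrete numbers. A minimal sketch, assuming it mirrors the upstream
// Kubernetes Deployment logic (surgeScaleUpSketch is a hypothetical helper,
// not part of this package):
//
//	func surgeScaleUpSketch(desired, maxSurge, currentTotal, newMSReplicas int32) int32 {
//		// The total machine count may exceed the desired replicas by at most maxSurge.
//		maxTotal := desired + maxSurge
//		if currentTotal >= maxTotal {
//			return newMSReplicas // no surge headroom left, cannot scale up
//		}
//		// Never scale the new MachineSet beyond the desired replica count.
//		scaleUp := integer.Int32Min(maxTotal-currentTotal, desired-newMSReplicas)
//		return newMSReplicas + scaleUp
//	}
//
// For example, with desired=10, maxSurge=3, currentTotal=10 and newMSReplicas=0,
// the new MachineSet can be scaled up to 3 (10 + 3 - 10).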

func (r *Reconciler) reconcileOldMachineSets(ctx context.Context, allMSs []*clusterv1.MachineSet, oldMSs []*clusterv1.MachineSet, newMS *clusterv1.MachineSet, deployment *clusterv1.MachineDeployment) error {
	log := ctrl.LoggerFrom(ctx)

	if deployment.Spec.Replicas == nil {
		return errors.Errorf("spec.replicas for MachineDeployment %v is nil, this is unexpected",
			client.ObjectKeyFromObject(deployment))
	}

	if newMS.Spec.Replicas == nil {
		return errors.Errorf("spec.replicas for MachineSet %v is nil, this is unexpected",
			client.ObjectKeyFromObject(newMS))
	}

	oldMachinesCount := mdutil.GetReplicaCountForMachineSets(oldMSs)
	if oldMachinesCount == 0 {
		// Can't scale down further.
		return nil
	}

	allMachinesCount := mdutil.GetReplicaCountForMachineSets(allMSs)
	log.V(4).Info("New MachineSet has available machines",
		"machineset", client.ObjectKeyFromObject(newMS).String(), "available-replicas", newMS.Status.AvailableReplicas)
	maxUnavailable := mdutil.MaxUnavailable(*deployment)

	// Check if we can scale down. We can scale down in the following 2 cases:
	// * Some old MachineSets have unhealthy replicas; we can safely scale down those unhealthy replicas since that won't further
	//   increase unavailability.
	// * The new MachineSet has scaled up and its replicas have become ready; then we can scale down old MachineSets in a further step.
	//
	// maxScaledDown := allMachinesCount - minAvailable - newMachineSetMachinesUnavailable
	// takes into account not only maxUnavailable and any surge machines that have been created, but also unavailable machines from
	// the newMS, so that the unavailable machines from the newMS would not make us scale down old MachineSets in a further
	// step (that would increase unavailability).
	//
	// Concrete example:
	//
	// * 10 replicas
	// * 2 maxUnavailable (absolute number, not percent)
	// * 3 maxSurge (absolute number, not percent)
	//
	// case 1:
	// * Deployment is updated, newMS is created with 3 replicas, oldMS is scaled down to 8, and newMS is scaled up to 5.
	// * The new MachineSet machines crashloop and never become available.
	// * allMachinesCount is 13. minAvailable is 8. newMSMachinesUnavailable is 5.
	// * A node fails and causes one of the oldMS machines to become unavailable. However, 13 - 8 - 5 = 0, so the oldMS won't be scaled down.
	// * The user notices the crashloop and does kubectl rollout undo to rollback.
	// * newMSMachinesUnavailable is 1, since we rolled back to the good MachineSet, so maxScaledDown = 13 - 8 - 1 = 4. 4 of the crashlooping machines will be scaled down.
	// * The total number of machines will then be 9 and the newMS can be scaled up to 10.
	//
	// case 2:
	// Same example, but pushing a new machine template instead of rolling back (aka "roll over"):
	// * The new MachineSet created must start with 0 replicas because allMachinesCount is already at 13.
	// * However, newMSMachinesUnavailable would also be 0, so the 2 old MachineSets could be scaled down by 5 (13 - 8 - 0), which would then
	//   allow the new MachineSet to be scaled up by 5.
	minAvailable := *(deployment.Spec.Replicas) - maxUnavailable
	newMSUnavailableMachineCount := *(newMS.Spec.Replicas) - newMS.Status.AvailableReplicas
	maxScaledDown := allMachinesCount - minAvailable - newMSUnavailableMachineCount
	if maxScaledDown <= 0 {
		return nil
	}

	// Clean up unhealthy replicas first, otherwise unhealthy replicas will block the deployment
	// and cause a timeout. See https://github.com/kubernetes/kubernetes/issues/16737
	oldMSs, cleanupCount, err := r.cleanupUnhealthyReplicas(ctx, oldMSs, deployment, maxScaledDown)
	if err != nil {
		return err
	}

	log.V(4).Info("Cleaned up unhealthy replicas from old MachineSets", "count", cleanupCount)

	// Scale down old MachineSets; we need to check maxUnavailable to ensure we can scale down.
	allMSs = oldMSs
	allMSs = append(allMSs, newMS)
	scaledDownCount, err := r.scaleDownOldMachineSetsForRollingUpdate(ctx, allMSs, oldMSs, deployment)
	if err != nil {
		return err
	}

	log.V(4).Info("Scaled down old MachineSets of MachineDeployment", "count", scaledDownCount)
	return nil
}
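
// A worked instance of the scale-down budget above, using the numbers from
// case 1 (illustrative values only):
//
//	desired, maxUnavailable := int32(10), int32(2)
//	allMachinesCount := int32(13)            // 8 old + 5 new (surged)
//	newMSUnavailable := int32(5)             // new machines still crashlooping
//	minAvailable := desired - maxUnavailable // 8
//	maxScaledDown := allMachinesCount - minAvailable - newMSUnavailable // 13 - 8 - 5 = 0
//
// With a budget of 0 nothing is scaled down; once the new machines become
// available (newMSUnavailable drops), the budget turns positive and the old
// MachineSets can shrink.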

// cleanupUnhealthyReplicas will scale down old MachineSets with unhealthy replicas, so that all unhealthy replicas will be deleted.
func (r *Reconciler) cleanupUnhealthyReplicas(ctx context.Context, oldMSs []*clusterv1.MachineSet, deployment *clusterv1.MachineDeployment, maxCleanupCount int32) ([]*clusterv1.MachineSet, int32, error) {
	log := ctrl.LoggerFrom(ctx)

	sort.Sort(mdutil.MachineSetsByCreationTimestamp(oldMSs))

	// Scale down all old MachineSets with any unhealthy replicas. MachineSet will honour Spec.DeletePolicy
	// for deleting Machines. Machines with a deletion timestamp, with a failure message or without a nodeRef
	// are preferred for all strategies.
	// This results in a best effort to remove machines backing unhealthy nodes.
	totalScaledDown := int32(0)

	for _, targetMS := range oldMSs {
		if targetMS.Spec.Replicas == nil {
			return nil, 0, errors.Errorf("spec.replicas for MachineSet %v is nil, this is unexpected", client.ObjectKeyFromObject(targetMS))
		}

		if totalScaledDown >= maxCleanupCount {
			break
		}

		oldMSReplicas := *(targetMS.Spec.Replicas)
		if oldMSReplicas == 0 {
			// cannot scale down this MachineSet.
			continue
		}

		oldMSAvailableReplicas := targetMS.Status.AvailableReplicas
		log.V(4).Info("Found available Machines in old MachineSet",
			"count", oldMSAvailableReplicas, "target-machineset", client.ObjectKeyFromObject(targetMS).String())
		if oldMSReplicas == oldMSAvailableReplicas {
			// no unhealthy replicas found, no scaling required.
			continue
		}

		remainingCleanupCount := maxCleanupCount - totalScaledDown
		unhealthyCount := oldMSReplicas - oldMSAvailableReplicas
		scaledDownCount := integer.Int32Min(remainingCleanupCount, unhealthyCount)
		newReplicasCount := oldMSReplicas - scaledDownCount

		if newReplicasCount > oldMSReplicas {
			return nil, 0, errors.Errorf("when cleaning up unhealthy replicas, got invalid request to scale down %v: %d -> %d",
				client.ObjectKeyFromObject(targetMS), oldMSReplicas, newReplicasCount)
		}

		if err := r.scaleMachineSet(ctx, targetMS, newReplicasCount, deployment); err != nil {
			return nil, totalScaledDown, err
		}

		totalScaledDown += scaledDownCount
	}

	return oldMSs, totalScaledDown, nil
}
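
// For example (illustrative values only): with maxCleanupCount=4 and an old
// MachineSet at spec.replicas=6 but only 3 available, unhealthyCount is 3, so
// scaledDownCount = min(4, 3) = 3 and the MachineSet is scaled 6 -> 3; the
// remaining budget of 1 carries over to the next old MachineSet in the loop.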

// scaleDownOldMachineSetsForRollingUpdate scales down old MachineSets when the deployment strategy is "RollingUpdate".
// It needs to check maxUnavailable to ensure availability.
func (r *Reconciler) scaleDownOldMachineSetsForRollingUpdate(ctx context.Context, allMSs []*clusterv1.MachineSet, oldMSs []*clusterv1.MachineSet, deployment *clusterv1.MachineDeployment) (int32, error) {
	log := ctrl.LoggerFrom(ctx)

	if deployment.Spec.Replicas == nil {
		return 0, errors.Errorf("spec.replicas for MachineDeployment %v is nil, this is unexpected", client.ObjectKeyFromObject(deployment))
	}

	maxUnavailable := mdutil.MaxUnavailable(*deployment)
	minAvailable := *(deployment.Spec.Replicas) - maxUnavailable

	// Find the number of available machines.
	availableMachineCount := mdutil.GetAvailableReplicaCountForMachineSets(allMSs)

	// Check if we can scale down.
	if availableMachineCount <= minAvailable {
		// Cannot scale down.
		return 0, nil
	}

	log.V(4).Info("Found available machines in deployment, scaling down old MSes", "count", availableMachineCount)

	sort.Sort(mdutil.MachineSetsByCreationTimestamp(oldMSs))

	totalScaledDown := int32(0)
	totalScaleDownCount := availableMachineCount - minAvailable
	for _, targetMS := range oldMSs {
		if targetMS.Spec.Replicas == nil {
			return 0, errors.Errorf("spec.replicas for MachineSet %v is nil, this is unexpected", client.ObjectKeyFromObject(targetMS))
		}

		if totalScaledDown >= totalScaleDownCount {
			// No further scaling required.
			break
		}

		if *(targetMS.Spec.Replicas) == 0 {
			// cannot scale down this MachineSet.
			continue
		}

		// Scale down.
		scaleDownCount := integer.Int32Min(*(targetMS.Spec.Replicas), totalScaleDownCount-totalScaledDown)
		newReplicasCount := *(targetMS.Spec.Replicas) - scaleDownCount
		if newReplicasCount > *(targetMS.Spec.Replicas) {
			return totalScaledDown, errors.Errorf("when scaling down old MachineSet, got invalid request to scale down %v: %d -> %d",
				client.ObjectKeyFromObject(targetMS), *(targetMS.Spec.Replicas), newReplicasCount)
		}

		if err := r.scaleMachineSet(ctx, targetMS, newReplicasCount, deployment); err != nil {
			return totalScaledDown, err
		}

		totalScaledDown += scaleDownCount
	}

	return totalScaledDown, nil
}
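
// For example (illustrative values only): with desired=10 and maxUnavailable=2,
// minAvailable is 8; if 11 machines are currently available, the total budget
// is 11 - 8 = 3, consumed in creation-timestamp order (oldest first): an old
// MachineSet with 2 replicas is scaled 2 -> 0, and the next old MachineSet
// gives up the remaining 1 replica.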