sigs.k8s.io/cluster-api@v1.7.1/controlplane/kubeadm/internal/controllers/scale.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
	"context"
	"strings"

	"github.com/blang/semver/v4"
	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	kerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/klog/v2"
	ctrl "sigs.k8s.io/controller-runtime"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
	"sigs.k8s.io/cluster-api/util/collections"
	"sigs.k8s.io/cluster-api/util/conditions"
)

func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
	logger := ctrl.LoggerFrom(ctx)

	bootstrapSpec := controlPlane.InitialControlPlaneConfig()
	fd := controlPlane.NextFailureDomainForScaleUp(ctx)
	if err := r.cloneConfigsAndGenerateMachine(ctx, controlPlane.Cluster, controlPlane.KCP, bootstrapSpec, fd); err != nil {
		logger.Error(err, "Failed to create initial control plane Machine")
		r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "FailedInitialization", "Failed to create initial control plane Machine for cluster %s control plane: %v", klog.KObj(controlPlane.Cluster), err)
		return ctrl.Result{}, err
	}

	// Requeue the control plane, in case there are additional operations to perform
	return ctrl.Result{Requeue: true}, nil
}

func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
	logger := ctrl.LoggerFrom(ctx)

	// Run preflight checks to ensure that the control plane is stable before proceeding with a scale up/scale down operation; if not, wait.
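	// Note: a non-zero Result from the preflight checks means the control plane is still converging;
	// it is returned as-is so the request is requeued rather than treated as an error.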
	if result, err := r.preflightChecks(ctx, controlPlane); err != nil || !result.IsZero() {
		return result, err
	}

	// Create the bootstrap configuration
	bootstrapSpec := controlPlane.JoinControlPlaneConfig()
	fd := controlPlane.NextFailureDomainForScaleUp(ctx)
	if err := r.cloneConfigsAndGenerateMachine(ctx, controlPlane.Cluster, controlPlane.KCP, bootstrapSpec, fd); err != nil {
		logger.Error(err, "Failed to create additional control plane Machine")
		r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "FailedScaleUp", "Failed to create additional control plane Machine for cluster %s control plane: %v", klog.KObj(controlPlane.Cluster), err)
		return ctrl.Result{}, err
	}

	// Requeue the control plane, in case there are other operations to perform
	return ctrl.Result{Requeue: true}, nil
}

func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
	ctx context.Context,
	controlPlane *internal.ControlPlane,
	outdatedMachines collections.Machines,
) (ctrl.Result, error) {
	logger := ctrl.LoggerFrom(ctx)

	// Pick the Machine that we should scale down.
	machineToDelete, err := selectMachineForScaleDown(ctx, controlPlane, outdatedMachines)
	if err != nil {
		return ctrl.Result{}, errors.Wrap(err, "failed to select machine for scale down")
	}

	// Run preflight checks ensuring the control plane is stable before proceeding with a scale up/scale down operation; if not, wait.
	// Given that we're scaling down, we can exclude the machineToDelete from the preflight checks.
	if result, err := r.preflightChecks(ctx, controlPlane, machineToDelete); err != nil || !result.IsZero() {
		return result, err
	}

	workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
	if err != nil {
		logger.Error(err, "Failed to create client to workload cluster")
		return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
	}

	if machineToDelete == nil {
		logger.Info("Failed to pick control plane Machine to delete")
		return ctrl.Result{}, errors.New("failed to pick control plane Machine to delete")
	}

	// If KCP should manage etcd and etcd leadership is on the machine that is about to be deleted, move it to the newest member available.
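	// Forwarding leadership and removing the etcd member before the Machine itself is deleted
	// avoids leaving the remaining etcd cluster with a member whose machine no longer exists.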
	if controlPlane.IsEtcdManaged() {
		etcdLeaderCandidate := controlPlane.Machines.Newest()
		if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToDelete, etcdLeaderCandidate); err != nil {
			logger.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name)
			return ctrl.Result{}, err
		}
		if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToDelete); err != nil {
			logger.Error(err, "Failed to remove etcd member for machine")
			return ctrl.Result{}, err
		}
	}

	parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version)
	if err != nil {
		return ctrl.Result{}, errors.Wrapf(err, "failed to parse kubernetes version %q", controlPlane.KCP.Spec.Version)
	}

	if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToDelete, parsedVersion); err != nil {
		logger.Error(err, "Failed to remove machine from kubeadm ConfigMap")
		return ctrl.Result{}, err
	}

	logger = logger.WithValues("Machine", klog.KObj(machineToDelete))
	if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) {
		logger.Error(err, "Failed to delete control plane machine")
		r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "FailedScaleDown",
			"Failed to delete control plane Machine %s for cluster %s control plane: %v", machineToDelete.Name, klog.KObj(controlPlane.Cluster), err)
		return ctrl.Result{}, err
	}

	// Requeue the control plane, in case there are additional operations to perform
	return ctrl.Result{Requeue: true}, nil
}

// preflightChecks checks if the control plane is stable before proceeding with a scale up/scale down operation,
// where stable means that:
// - There are no machine deletions in progress.
// - All the health conditions on KCP are true.
// - All the health conditions on the control plane machines are true.
// If the control plane is not passing preflight checks, it requeues.
//
// NOTE: this func uses KCP conditions, so reconcileControlPlaneConditions must be called before this.
func (r *KubeadmControlPlaneReconciler) preflightChecks(ctx context.Context, controlPlane *internal.ControlPlane, excludeFor ...*clusterv1.Machine) (ctrl.Result, error) { //nolint:unparam
	logger := ctrl.LoggerFrom(ctx)

	// If there are no KCP-owned control-plane machines, then the control plane has not been initialized yet,
	// so it is considered ok to proceed.
	if controlPlane.Machines.Len() == 0 {
		return ctrl.Result{}, nil
	}

	// If there are deleting machines, wait for the operation to complete.
	if controlPlane.HasDeletingMachine() {
		logger.Info("Waiting for machines to be deleted", "Machines", strings.Join(controlPlane.Machines.Filter(collections.HasDeletionTimestamp).Names(), ", "))
		return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil
	}

	// Check machine health conditions; if there are conditions with False or Unknown, then wait.
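	// The etcd-specific conditions below are only checked when KCP manages etcd; clusters using
	// an external etcd skip them.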
	allMachineHealthConditions := []clusterv1.ConditionType{
		controlplanev1.MachineAPIServerPodHealthyCondition,
		controlplanev1.MachineControllerManagerPodHealthyCondition,
		controlplanev1.MachineSchedulerPodHealthyCondition,
	}
	if controlPlane.IsEtcdManaged() {
		allMachineHealthConditions = append(allMachineHealthConditions,
			controlplanev1.MachineEtcdPodHealthyCondition,
			controlplanev1.MachineEtcdMemberHealthyCondition,
		)
	}
	machineErrors := []error{}

loopmachines:
	for _, machine := range controlPlane.Machines {
		for _, excluded := range excludeFor {
			// If this machine should be excluded from the individual
			// health check, continue with the outer loop.
			if machine.Name == excluded.Name {
				continue loopmachines
			}
		}

		if machine.Status.NodeRef == nil {
			// The conditions will only ever be set on a Machine if we're able to correlate a Machine to a Node.
			// Correlating Machines to Nodes requires the nodeRef to be set.
			// Instead of confusing users with errors about the conditions not being set, let's point them
			// towards the unset nodeRef (which is the root cause of the conditions not being there).
			machineErrors = append(machineErrors, errors.Errorf("Machine %s does not have a corresponding Node yet (Machine.status.nodeRef not set)", machine.Name))
		} else {
			for _, condition := range allMachineHealthConditions {
				if err := preflightCheckCondition("Machine", machine, condition); err != nil {
					machineErrors = append(machineErrors, err)
				}
			}
		}
	}
	if len(machineErrors) > 0 {
		aggregatedError := kerrors.NewAggregate(machineErrors)
		r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
			"Waiting for control plane to pass preflight checks to continue reconciliation: %v", aggregatedError)
		logger.Info("Waiting for control plane to pass preflight checks", "failures", aggregatedError.Error())

		return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, nil
	}

	return ctrl.Result{}, nil
}

func preflightCheckCondition(kind string, obj conditions.Getter, condition clusterv1.ConditionType) error {
	c := conditions.Get(obj, condition)
	if c == nil {
		return errors.Errorf("%s %s does not have %s condition", kind, obj.GetName(), condition)
	}
	if c.Status == corev1.ConditionFalse {
		return errors.Errorf("%s %s reports %s condition is false (%s, %s)", kind, obj.GetName(), condition, c.Severity, c.Message)
	}
	if c.Status == corev1.ConditionUnknown {
		return errors.Errorf("%s %s reports %s condition is unknown (%s)", kind, obj.GetName(), condition, c.Message)
	}
	return nil
}

func selectMachineForScaleDown(ctx context.Context, controlPlane *internal.ControlPlane, outdatedMachines collections.Machines) (*clusterv1.Machine, error) {
	machines := controlPlane.Machines
	switch {
	case controlPlane.MachineWithDeleteAnnotation(outdatedMachines).Len() > 0:
		machines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines)
	case controlPlane.MachineWithDeleteAnnotation(machines).Len() > 0:
		machines = controlPlane.MachineWithDeleteAnnotation(machines)
	case controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines).Len() > 0:
		machines = controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines)
	case outdatedMachines.Len() > 0:
		machines = outdatedMachines
	}
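	// Among the selected candidates, pick a Machine from the failure domain that currently holds
	// the most control plane machines, so the scale down keeps the spread across failure domains balanced.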
	return controlPlane.MachineInFailureDomainWithMostMachines(ctx, machines)
}