sigs.k8s.io/cluster-api@v1.7.1/controlplane/kubeadm/internal/controllers/scale.go (about)

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package controllers
    18  
    19  import (
    20  	"context"
    21  	"strings"
    22  
    23  	"github.com/blang/semver/v4"
    24  	"github.com/pkg/errors"
    25  	corev1 "k8s.io/api/core/v1"
    26  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    27  	kerrors "k8s.io/apimachinery/pkg/util/errors"
    28  	"k8s.io/klog/v2"
    29  	ctrl "sigs.k8s.io/controller-runtime"
    30  
    31  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    32  	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
    33  	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
    34  	"sigs.k8s.io/cluster-api/util/collections"
    35  	"sigs.k8s.io/cluster-api/util/conditions"
    36  )
    37  
    38  func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
    39  	logger := ctrl.LoggerFrom(ctx)
    40  
    41  	bootstrapSpec := controlPlane.InitialControlPlaneConfig()
    42  	fd := controlPlane.NextFailureDomainForScaleUp(ctx)
    43  	if err := r.cloneConfigsAndGenerateMachine(ctx, controlPlane.Cluster, controlPlane.KCP, bootstrapSpec, fd); err != nil {
    44  		logger.Error(err, "Failed to create initial control plane Machine")
    45  		r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "FailedInitialization", "Failed to create initial control plane Machine for cluster %s control plane: %v", klog.KObj(controlPlane.Cluster), err)
    46  		return ctrl.Result{}, err
    47  	}
    48  
    49  	// Requeue the control plane, in case there are additional operations to perform
    50  	return ctrl.Result{Requeue: true}, nil
    51  }
    52  
    53  func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
    54  	logger := ctrl.LoggerFrom(ctx)
    55  
    56  	// Run preflight checks to ensure that the control plane is stable before proceeding with a scale up/scale down operation; if not, wait.
    57  	if result, err := r.preflightChecks(ctx, controlPlane); err != nil || !result.IsZero() {
    58  		return result, err
    59  	}
    60  
    61  	// Create the bootstrap configuration
    62  	bootstrapSpec := controlPlane.JoinControlPlaneConfig()
    63  	fd := controlPlane.NextFailureDomainForScaleUp(ctx)
    64  	if err := r.cloneConfigsAndGenerateMachine(ctx, controlPlane.Cluster, controlPlane.KCP, bootstrapSpec, fd); err != nil {
    65  		logger.Error(err, "Failed to create additional control plane Machine")
    66  		r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "FailedScaleUp", "Failed to create additional control plane Machine for cluster % control plane: %v", klog.KObj(controlPlane.Cluster), err)
    67  		return ctrl.Result{}, err
    68  	}
    69  
    70  	// Requeue the control plane, in case there are other operations to perform
    71  	return ctrl.Result{Requeue: true}, nil
    72  }
    73  
    74  func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
    75  	ctx context.Context,
    76  	controlPlane *internal.ControlPlane,
    77  	outdatedMachines collections.Machines,
    78  ) (ctrl.Result, error) {
    79  	logger := ctrl.LoggerFrom(ctx)
    80  
    81  	// Pick the Machine that we should scale down.
    82  	machineToDelete, err := selectMachineForScaleDown(ctx, controlPlane, outdatedMachines)
    83  	if err != nil {
    84  		return ctrl.Result{}, errors.Wrap(err, "failed to select machine for scale down")
    85  	}
    86  
    87  	// Run preflight checks ensuring the control plane is stable before proceeding with a scale up/scale down operation; if not, wait.
    88  	// Given that we're scaling down, we can exclude the machineToDelete from the preflight checks.
    89  	if result, err := r.preflightChecks(ctx, controlPlane, machineToDelete); err != nil || !result.IsZero() {
    90  		return result, err
    91  	}
    92  
    93  	workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
    94  	if err != nil {
    95  		logger.Error(err, "Failed to create client to workload cluster")
    96  		return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
    97  	}
    98  
    99  	if machineToDelete == nil {
   100  		logger.Info("Failed to pick control plane Machine to delete")
   101  		return ctrl.Result{}, errors.New("failed to pick control plane Machine to delete")
   102  	}
   103  
   104  	// If KCP should manage etcd, If etcd leadership is on machine that is about to be deleted, move it to the newest member available.
   105  	if controlPlane.IsEtcdManaged() {
   106  		etcdLeaderCandidate := controlPlane.Machines.Newest()
   107  		if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToDelete, etcdLeaderCandidate); err != nil {
   108  			logger.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name)
   109  			return ctrl.Result{}, err
   110  		}
   111  		if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToDelete); err != nil {
   112  			logger.Error(err, "Failed to remove etcd member for machine")
   113  			return ctrl.Result{}, err
   114  		}
   115  	}
   116  
   117  	parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version)
   118  	if err != nil {
   119  		return ctrl.Result{}, errors.Wrapf(err, "failed to parse kubernetes version %q", controlPlane.KCP.Spec.Version)
   120  	}
   121  
   122  	if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToDelete, parsedVersion); err != nil {
   123  		logger.Error(err, "Failed to remove machine from kubeadm ConfigMap")
   124  		return ctrl.Result{}, err
   125  	}
   126  
   127  	logger = logger.WithValues("Machine", klog.KObj(machineToDelete))
   128  	if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) {
   129  		logger.Error(err, "Failed to delete control plane machine")
   130  		r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "FailedScaleDown",
   131  			"Failed to delete control plane Machine %s for cluster %s control plane: %v", machineToDelete.Name, klog.KObj(controlPlane.Cluster), err)
   132  		return ctrl.Result{}, err
   133  	}
   134  
   135  	// Requeue the control plane, in case there are additional operations to perform
   136  	return ctrl.Result{Requeue: true}, nil
   137  }
   138  
   139  // preflightChecks checks if the control plane is stable before proceeding with a scale up/scale down operation,
   140  // where stable means that:
   141  // - There are no machine deletion in progress
   142  // - All the health conditions on KCP are true.
   143  // - All the health conditions on the control plane machines are true.
   144  // If the control plane is not passing preflight checks, it requeue.
   145  //
   146  // NOTE: this func uses KCP conditions, it is required to call reconcileControlPlaneConditions before this.
   147  func (r *KubeadmControlPlaneReconciler) preflightChecks(ctx context.Context, controlPlane *internal.ControlPlane, excludeFor ...*clusterv1.Machine) (ctrl.Result, error) { //nolint:unparam
   148  	logger := ctrl.LoggerFrom(ctx)
   149  
   150  	// If there is no KCP-owned control-plane machines, then control-plane has not been initialized yet,
   151  	// so it is considered ok to proceed.
   152  	if controlPlane.Machines.Len() == 0 {
   153  		return ctrl.Result{}, nil
   154  	}
   155  
   156  	// If there are deleting machines, wait for the operation to complete.
   157  	if controlPlane.HasDeletingMachine() {
   158  		logger.Info("Waiting for machines to be deleted", "Machines", strings.Join(controlPlane.Machines.Filter(collections.HasDeletionTimestamp).Names(), ", "))
   159  		return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil
   160  	}
   161  
   162  	// Check machine health conditions; if there are conditions with False or Unknown, then wait.
   163  	allMachineHealthConditions := []clusterv1.ConditionType{
   164  		controlplanev1.MachineAPIServerPodHealthyCondition,
   165  		controlplanev1.MachineControllerManagerPodHealthyCondition,
   166  		controlplanev1.MachineSchedulerPodHealthyCondition,
   167  	}
   168  	if controlPlane.IsEtcdManaged() {
   169  		allMachineHealthConditions = append(allMachineHealthConditions,
   170  			controlplanev1.MachineEtcdPodHealthyCondition,
   171  			controlplanev1.MachineEtcdMemberHealthyCondition,
   172  		)
   173  	}
   174  	machineErrors := []error{}
   175  
   176  loopmachines:
   177  	for _, machine := range controlPlane.Machines {
   178  		for _, excluded := range excludeFor {
   179  			// If this machine should be excluded from the individual
   180  			// health check, continue the out loop.
   181  			if machine.Name == excluded.Name {
   182  				continue loopmachines
   183  			}
   184  		}
   185  
   186  		if machine.Status.NodeRef == nil {
   187  			// The conditions will only ever be set on a Machine if we're able to correlate a Machine to a Node.
   188  			// Correlating Machines to Nodes requires the nodeRef to be set.
   189  			// Instead of confusing users with errors about that the conditions are not set, let's point them
   190  			// towards the unset nodeRef (which is the root cause of the conditions not being there).
   191  			machineErrors = append(machineErrors, errors.Errorf("Machine %s does not have a corresponding Node yet (Machine.status.nodeRef not set)", machine.Name))
   192  		} else {
   193  			for _, condition := range allMachineHealthConditions {
   194  				if err := preflightCheckCondition("Machine", machine, condition); err != nil {
   195  					machineErrors = append(machineErrors, err)
   196  				}
   197  			}
   198  		}
   199  	}
   200  	if len(machineErrors) > 0 {
   201  		aggregatedError := kerrors.NewAggregate(machineErrors)
   202  		r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
   203  			"Waiting for control plane to pass preflight checks to continue reconciliation: %v", aggregatedError)
   204  		logger.Info("Waiting for control plane to pass preflight checks", "failures", aggregatedError.Error())
   205  
   206  		return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, nil
   207  	}
   208  
   209  	return ctrl.Result{}, nil
   210  }
   211  
   212  func preflightCheckCondition(kind string, obj conditions.Getter, condition clusterv1.ConditionType) error {
   213  	c := conditions.Get(obj, condition)
   214  	if c == nil {
   215  		return errors.Errorf("%s %s does not have %s condition", kind, obj.GetName(), condition)
   216  	}
   217  	if c.Status == corev1.ConditionFalse {
   218  		return errors.Errorf("%s %s reports %s condition is false (%s, %s)", kind, obj.GetName(), condition, c.Severity, c.Message)
   219  	}
   220  	if c.Status == corev1.ConditionUnknown {
   221  		return errors.Errorf("%s %s reports %s condition is unknown (%s)", kind, obj.GetName(), condition, c.Message)
   222  	}
   223  	return nil
   224  }
   225  
   226  func selectMachineForScaleDown(ctx context.Context, controlPlane *internal.ControlPlane, outdatedMachines collections.Machines) (*clusterv1.Machine, error) {
   227  	machines := controlPlane.Machines
   228  	switch {
   229  	case controlPlane.MachineWithDeleteAnnotation(outdatedMachines).Len() > 0:
   230  		machines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines)
   231  	case controlPlane.MachineWithDeleteAnnotation(machines).Len() > 0:
   232  		machines = controlPlane.MachineWithDeleteAnnotation(machines)
   233  	case controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines).Len() > 0:
   234  		machines = controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines)
   235  	case outdatedMachines.Len() > 0:
   236  		machines = outdatedMachines
   237  	}
   238  	return controlPlane.MachineInFailureDomainWithMostMachines(ctx, machines)
   239  }