sigs.k8s.io/cluster-api@v1.7.1/controlplane/kubeadm/internal/controllers/remediation_test.go

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package controllers
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"strings"
    23  	"testing"
    24  	"time"
    25  
    26  	. "github.com/onsi/gomega"
    27  	"github.com/pkg/errors"
    28  	corev1 "k8s.io/api/core/v1"
    29  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    30  	"k8s.io/apimachinery/pkg/util/intstr"
    31  	"k8s.io/client-go/tools/record"
    32  	utilptr "k8s.io/utils/ptr"
    33  	"sigs.k8s.io/controller-runtime/pkg/client"
    34  
    35  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    36  	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
    37  	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
    38  	"sigs.k8s.io/cluster-api/util/collections"
    39  	"sigs.k8s.io/cluster-api/util/conditions"
    40  	"sigs.k8s.io/cluster-api/util/patch"
    41  )
    42  
    43  func TestGetMachineToBeRemediated(t *testing.T) {
    44  	t.Run("returns the oldest machine if there are no provisioning machines", func(t *testing.T) {
    45  		g := NewWithT(t)
    46  
    47  		ns, err := env.CreateNamespace(ctx, "ns1")
    48  		g.Expect(err).ToNot(HaveOccurred())
    49  		defer func() {
    50  			g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
    51  		}()
    52  
    53  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
    54  		m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed())
    55  
    56  		unhealthyMachines := collections.FromMachines(m1, m2)
    57  
    58  		g.Expect(getMachineToBeRemediated(unhealthyMachines).Name).To(HavePrefix("m1-unhealthy-"))
    59  	})
    60  
    61  	t.Run("returns the oldest of the provisioning machines", func(t *testing.T) {
    62  		g := NewWithT(t)
    63  
    64  		ns, err := env.CreateNamespace(ctx, "ns1")
    65  		g.Expect(err).ToNot(HaveOccurred())
    66  		defer func() {
    67  			g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
    68  		}()
    69  
    70  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
    71  		m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef())
    72  		m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef())
    73  
    74  		unhealthyMachines := collections.FromMachines(m1, m2, m3)
    75  
    76  		g.Expect(getMachineToBeRemediated(unhealthyMachines).Name).To(HavePrefix("m2-unhealthy-"))
    77  	})
    78  }
    79  
    80  func TestReconcileUnhealthyMachines(t *testing.T) {
    81  	g := NewWithT(t)
    82  
    83  	r := &KubeadmControlPlaneReconciler{
    84  		Client:   env.GetClient(),
    85  		recorder: record.NewFakeRecorder(32),
    86  	}
    87  	ns, err := env.CreateNamespace(ctx, "ns1")
    88  	g.Expect(err).ToNot(HaveOccurred())
    89  	defer func() {
    90  		g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
    91  	}()
    92  
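        	// Note (added comment): removeFinalizer strips the finalizers set via withWaitBeforeDeleteFinalizer,
        	// so that machine deletion can complete and env.Cleanup can succeed at the end of each test.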
    93  	var removeFinalizer = func(g *WithT, m *clusterv1.Machine) {
    94  		patchHelper, err := patch.NewHelper(m, env.GetClient())
    95  		g.Expect(err).ToNot(HaveOccurred())
    96  		m.ObjectMeta.Finalizers = nil
    97  		g.Expect(patchHelper.Patch(ctx, m)).To(Succeed())
    98  	}
    99  
   100  	t.Run("It cleans up stuck remediation on previously unhealthy machines", func(t *testing.T) {
   101  		g := NewWithT(t)
   102  
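        		// Note (added comment): the machine is no longer unhealthy but still carries a stale
        		// MachineOwnerRemediated condition from a previous remediation attempt;
        		// reconcileUnhealthyMachines is expected to clean it up.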
   103  		m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation())
   104  
   105  		controlPlane := &internal.ControlPlane{
   106  			KCP:      &controlplanev1.KubeadmControlPlane{},
   107  			Cluster:  &clusterv1.Cluster{},
   108  			Machines: collections.FromMachines(m),
   109  		}
   110  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   111  
   112  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   113  		g.Expect(err).ToNot(HaveOccurred())
   114  
   115  		g.Eventually(func() error {
   116  			if err := env.Get(ctx, client.ObjectKey{Namespace: m.Namespace, Name: m.Name}, m); err != nil {
   117  				return err
   118  			}
   119  			c := conditions.Get(m, clusterv1.MachineOwnerRemediatedCondition)
   120  			if c == nil {
   121  				return nil
   122  			}
   123  			return errors.Errorf("condition %s still exists", clusterv1.MachineOwnerRemediatedCondition)
   124  		}, 10*time.Second).Should(Succeed())
   125  	})
   126  
   127  	// Generic preflight checks
   128  	// These are preflight checks that happen regardless of whether the control plane has already been initialized.
   129  
   130  	t.Run("Remediation does not happen if there are no unhealthy machines", func(t *testing.T) {
   131  		g := NewWithT(t)
   132  
   133  		controlPlane := &internal.ControlPlane{
   134  			KCP:      &controlplanev1.KubeadmControlPlane{},
   135  			Cluster:  &clusterv1.Cluster{},
   136  			Machines: collections.New(),
   137  		}
   138  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   139  
   140  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   141  		g.Expect(err).ToNot(HaveOccurred())
   142  	})
   143  	t.Run("reconcileUnhealthyMachines returns early if another remediation is in progress", func(t *testing.T) {
   144  		g := NewWithT(t)
   145  
   146  		m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation())
   147  		conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "")
   148  		conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
   149  		controlPlane := &internal.ControlPlane{
   150  			KCP: &controlplanev1.KubeadmControlPlane{
   151  				ObjectMeta: metav1.ObjectMeta{
   152  					Annotations: map[string]string{
   153  						controlplanev1.RemediationInProgressAnnotation: MustMarshalRemediationData(&RemediationData{
   154  							Machine:    "foo",
   155  							Timestamp:  metav1.Time{Time: time.Now().UTC()},
   156  							RetryCount: 0,
   157  						}),
   158  					},
   159  				},
   160  			},
   161  			Cluster:  &clusterv1.Cluster{},
   162  			Machines: collections.FromMachines(m),
   163  		}
   164  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   165  
   166  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   167  		g.Expect(err).ToNot(HaveOccurred())
   168  	})
   169  	t.Run("reconcileUnhealthyMachines returns early if the machine to be remediated is already being deleted", func(t *testing.T) {
   170  		g := NewWithT(t)
   171  
   172  		m := getDeletingMachine(ns.Name, "m1-unhealthy-deleting-", withMachineHealthCheckFailed())
   173  		conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "")
   174  		conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
   175  		controlPlane := &internal.ControlPlane{
   176  			KCP:      &controlplanev1.KubeadmControlPlane{},
   177  			Cluster:  &clusterv1.Cluster{},
   178  			Machines: collections.FromMachines(m),
   179  		}
   180  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   181  
   182  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   183  
   184  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   185  		g.Expect(err).ToNot(HaveOccurred())
   186  	})
   187  	t.Run("Remediation does not happen if MaxRetry is reached", func(t *testing.T) {
   188  		g := NewWithT(t)
   189  
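        		// Note (added comment): m1 carries a remediation annotation recording that it replaced "m0" and that
        		// remediation was already retried 3 times within the current minHealthyPeriod; with MaxRetry set to 3
        		// below, KCP must refuse to remediate it again.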
   190  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{
   191  			Machine:    "m0",
   192  			Timestamp:  metav1.Time{Time: time.Now().Add(-controlplanev1.DefaultMinHealthyPeriod / 2).UTC()}, // minHealthyPeriod not expired yet.
   193  			RetryCount: 3,
   194  		})))
   195  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   196  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   197  
   198  		controlPlane := &internal.ControlPlane{
   199  			KCP: &controlplanev1.KubeadmControlPlane{
   200  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   201  					Replicas: utilptr.To[int32](3),
   202  					Version:  "v1.19.1",
   203  					RemediationStrategy: &controlplanev1.RemediationStrategy{
   204  						MaxRetry: utilptr.To[int32](3),
   205  					},
   206  				},
   207  			},
   208  			Cluster:  &clusterv1.Cluster{},
   209  			Machines: collections.FromMachines(m1, m2, m3),
   210  		}
   211  
   212  		r := &KubeadmControlPlaneReconciler{
   213  			Client:   env.GetClient(),
   214  			recorder: record.NewFakeRecorder(32),
   215  			managementCluster: &fakeManagementCluster{
   216  				Workload: fakeWorkloadCluster{
   217  					EtcdMembersResult: nodes(controlPlane.Machines),
   218  				},
   219  			},
   220  		}
   221  
   222  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   223  
   224  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   225  		g.Expect(err).ToNot(HaveOccurred())
   226  
   227  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   228  
   229  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed 3 times (MaxRetry)")
   230  
   231  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   232  		g.Expect(err).ToNot(HaveOccurred())
   233  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeTrue())
   234  
   235  		removeFinalizer(g, m1)
   236  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
   237  	})
   238  	t.Run("Retry history is ignored if min healthy period is expired, default min healthy period", func(t *testing.T) {
   239  		g := NewWithT(t)
   240  
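        		// Note (added comment): same retry history as above, but the previous remediation happened more than
        		// DefaultMinHealthyPeriod ago, so the retry count restarts from 0 instead of hitting MaxRetry.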
   241  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{
   242  			Machine:    "m0",
   243  			Timestamp:  metav1.Time{Time: time.Now().Add(-2 * controlplanev1.DefaultMinHealthyPeriod).UTC()}, // minHealthyPeriod already expired.
   244  			RetryCount: 3,
   245  		})))
   246  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   247  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   248  
   249  		controlPlane := &internal.ControlPlane{
   250  			KCP: &controlplanev1.KubeadmControlPlane{
   251  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   252  					Replicas: utilptr.To[int32](3),
   253  					Version:  "v1.19.1",
   254  					RemediationStrategy: &controlplanev1.RemediationStrategy{
   255  						MaxRetry: utilptr.To[int32](3),
   256  					},
   257  				},
   258  			},
   259  			Cluster:  &clusterv1.Cluster{},
   260  			Machines: collections.FromMachines(m1, m2, m3),
   261  		}
   262  
   263  		r := &KubeadmControlPlaneReconciler{
   264  			Client:   env.GetClient(),
   265  			recorder: record.NewFakeRecorder(32),
   266  			managementCluster: &fakeManagementCluster{
   267  				Workload: fakeWorkloadCluster{
   268  					EtcdMembersResult: nodes(controlPlane.Machines),
   269  				},
   270  			},
   271  		}
   272  
   273  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   274  
   275  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   276  		g.Expect(err).ToNot(HaveOccurred())
   277  
   278  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   279  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   280  		g.Expect(err).ToNot(HaveOccurred())
   281  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   282  		g.Expect(remediationData.RetryCount).To(Equal(0))
   283  
   284  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   285  
   286  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   287  		g.Expect(err).ToNot(HaveOccurred())
   288  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   289  
   290  		removeFinalizer(g, m1)
   291  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
   292  	})
   293  	t.Run("Retry history is ignored if min healthy period is expired", func(t *testing.T) {
   294  		g := NewWithT(t)
   295  
   296  		minHealthyPeriod := 4 * controlplanev1.DefaultMinHealthyPeriod // big min healthy period, so we are sure that we are not using DefaultMinHealthyPeriod.
   297  
   298  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{
   299  			Machine:    "m0",
   300  			Timestamp:  metav1.Time{Time: time.Now().Add(-2 * minHealthyPeriod).UTC()}, // minHealthyPeriod already expired.
   301  			RetryCount: 3,
   302  		})))
   303  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   304  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   305  
   306  		controlPlane := &internal.ControlPlane{
   307  			KCP: &controlplanev1.KubeadmControlPlane{
   308  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   309  					Replicas: utilptr.To[int32](3),
   310  					Version:  "v1.19.1",
   311  					RemediationStrategy: &controlplanev1.RemediationStrategy{
   312  						MaxRetry:         utilptr.To[int32](3),
   313  						MinHealthyPeriod: &metav1.Duration{Duration: minHealthyPeriod},
   314  					},
   315  				},
   316  			},
   317  			Cluster:  &clusterv1.Cluster{},
   318  			Machines: collections.FromMachines(m1, m2, m3),
   319  		}
   320  
   321  		r := &KubeadmControlPlaneReconciler{
   322  			Client:   env.GetClient(),
   323  			recorder: record.NewFakeRecorder(32),
   324  			managementCluster: &fakeManagementCluster{
   325  				Workload: fakeWorkloadCluster{
   326  					EtcdMembersResult: nodes(controlPlane.Machines),
   327  				},
   328  			},
   329  		}
   330  
   331  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   332  
   333  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   334  		g.Expect(err).ToNot(HaveOccurred())
   335  
   336  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   337  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   338  		g.Expect(err).ToNot(HaveOccurred())
   339  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   340  		g.Expect(remediationData.RetryCount).To(Equal(0))
   341  
   342  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   343  
   344  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   345  		g.Expect(err).ToNot(HaveOccurred())
   346  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   347  
   348  		removeFinalizer(g, m1)
   349  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
   350  	})
   351  	t.Run("Remediation does not happen if RetryPeriod is not yet passed", func(t *testing.T) {
   352  		g := NewWithT(t)
   353  
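        		// Note (added comment): the previous remediation is more recent than RetryPeriod (set below to
        		// DefaultMinHealthyPeriod), so KCP must wait before remediating the replacement machine again.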
   354  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{
   355  			Machine:    "m0",
   356  			Timestamp:  metav1.Time{Time: time.Now().Add(-controlplanev1.DefaultMinHealthyPeriod / 2).UTC()}, // minHealthyPeriod not yet expired.
   357  			RetryCount: 2,
   358  		})))
   359  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   360  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   361  
   362  		controlPlane := &internal.ControlPlane{
   363  			KCP: &controlplanev1.KubeadmControlPlane{
   364  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   365  					Replicas: utilptr.To[int32](3),
   366  					Version:  "v1.19.1",
   367  					RemediationStrategy: &controlplanev1.RemediationStrategy{
   368  						MaxRetry:    utilptr.To[int32](3),
   369  						RetryPeriod: metav1.Duration{Duration: controlplanev1.DefaultMinHealthyPeriod}, // RetryPeriod not yet expired.
   370  					},
   371  				},
   372  			},
   373  			Cluster:  &clusterv1.Cluster{},
   374  			Machines: collections.FromMachines(m1, m2, m3),
   375  		}
   376  
   377  		r := &KubeadmControlPlaneReconciler{
   378  			Client:   env.GetClient(),
   379  			recorder: record.NewFakeRecorder(32),
   380  			managementCluster: &fakeManagementCluster{
   381  				Workload: fakeWorkloadCluster{
   382  					EtcdMembersResult: nodes(controlPlane.Machines),
   383  				},
   384  			},
   385  		}
   386  
   387  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   388  
   389  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   390  		g.Expect(err).ToNot(HaveOccurred())
   391  
   392  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   393  
   394  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest 1h0m0s (RetryPeriod)")
   395  
   396  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   397  		g.Expect(err).ToNot(HaveOccurred())
   398  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeTrue())
   399  
   400  		removeFinalizer(g, m1)
   401  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
   402  	})
   403  
   404  	// There are no preflight checks for when the control plane is not yet initialized
   405  	// (it is the first CP; we can nuke it).
   406  
   407  	// Preflight checks for when the control plane is already initialized.
   408  
   409  	t.Run("Remediation does not happen if desired replicas <= 1", func(t *testing.T) {
   410  		g := NewWithT(t)
   411  
   412  		m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
   413  		controlPlane := &internal.ControlPlane{
   414  			KCP: &controlplanev1.KubeadmControlPlane{
   415  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   416  					Replicas: utilptr.To[int32](1),
   417  					RolloutStrategy: &controlplanev1.RolloutStrategy{
   418  						RollingUpdate: &controlplanev1.RollingUpdate{
   419  							MaxSurge: &intstr.IntOrString{
   420  								IntVal: 1,
   421  							},
   422  						},
   423  					},
   424  				},
   425  				Status: controlplanev1.KubeadmControlPlaneStatus{
   426  					Initialized: true,
   427  				},
   428  			},
   429  			Cluster:  &clusterv1.Cluster{},
   430  			Machines: collections.FromMachines(m),
   431  		}
   432  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   433  
   434  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   435  		g.Expect(err).ToNot(HaveOccurred())
   436  
   437  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   438  
   439  		assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1")
   440  
   441  		g.Expect(env.Cleanup(ctx, m)).To(Succeed())
   442  	})
   443  	t.Run("Remediation does not happen if there is another machine being deleted (not the one to be remediated)", func(t *testing.T) {
   444  		g := NewWithT(t)
   445  
   446  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
   447  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-")
   448  		m3 := getDeletingMachine(ns.Name, "m3-deleting") // NB. This machine is not created; it is only added to the control plane.
   449  		controlPlane := &internal.ControlPlane{
   450  			KCP: &controlplanev1.KubeadmControlPlane{
   451  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   452  					Replicas: utilptr.To[int32](3),
   453  				},
   454  				Status: controlplanev1.KubeadmControlPlaneStatus{
   455  					Initialized: true,
   456  				},
   457  			},
   458  			Cluster:  &clusterv1.Cluster{},
   459  			Machines: collections.FromMachines(m1, m2, m3),
   460  		}
   461  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   462  
   463  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   464  		g.Expect(err).ToNot(HaveOccurred())
   465  
   466  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   467  
   468  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation")
   469  
   470  		g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed())
   471  	})
   472  	t.Run("Remediation does not happen if there is a healthy machine being provisioned", func(t *testing.T) {
   473  		g := NewWithT(t)
   474  
   475  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
   476  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-")
   477  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withoutNodeRef()) // Provisioning
   478  		controlPlane := &internal.ControlPlane{
   479  			KCP: &controlplanev1.KubeadmControlPlane{
   480  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   481  					Replicas: utilptr.To(int32(3)),
   482  				},
   483  				Status: controlplanev1.KubeadmControlPlaneStatus{
   484  					Initialized: true,
   485  				},
   486  			},
   487  			Cluster:  &clusterv1.Cluster{},
   488  			Machines: collections.FromMachines(m1, m2, m3),
   489  		}
   490  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   491  
   492  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   493  		g.Expect(err).ToNot(HaveOccurred())
   494  
   495  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   496  
   497  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine provisioning to complete before triggering remediation")
   498  
   499  		g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed())
   500  	})
   501  	t.Run("Remediation does not happen if there is a healthy machine being provisioned - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) {
   502  		g := NewWithT(t)
   503  
   504  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
   505  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-")
   506  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-")
   507  		m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withoutNodeRef()) // Provisioning
   508  		controlPlane := &internal.ControlPlane{
   509  			KCP: &controlplanev1.KubeadmControlPlane{
   510  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   511  					Replicas: utilptr.To(int32(3)),
   512  				},
   513  				Status: controlplanev1.KubeadmControlPlaneStatus{
   514  					Initialized: true,
   515  				},
   516  			},
   517  			Cluster:  &clusterv1.Cluster{},
   518  			Machines: collections.FromMachines(m1, m2, m3, m4),
   519  		}
   520  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   521  
   522  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   523  		g.Expect(err).ToNot(HaveOccurred())
   524  
   525  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   526  
   527  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine provisioning to complete before triggering remediation")
   528  
   529  		g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed())
   530  	})
   531  	t.Run("Remediation does not happen if there is at least one additional unhealthy etcd member on a 3 machine CP", func(t *testing.T) {
   532  		g := NewWithT(t)
   533  
   534  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
   535  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
   536  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember())
   537  
   538  		controlPlane := &internal.ControlPlane{
   539  			KCP: &controlplanev1.KubeadmControlPlane{
   540  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   541  					Replicas: utilptr.To[int32](3),
   542  				},
   543  				Status: controlplanev1.KubeadmControlPlaneStatus{
   544  					Initialized: true,
   545  				},
   546  			},
   547  			Cluster:  &clusterv1.Cluster{},
   548  			Machines: collections.FromMachines(m1, m2, m3),
   549  		}
   550  
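        		// Note (added comment): the fake workload cluster reports all current machines as etcd members;
        		// the per-machine etcd health set via withHealthyEtcdMember/withUnhealthyEtcdMember then drives
        		// the quorum preflight check.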
   551  		r := &KubeadmControlPlaneReconciler{
   552  			Client:   env.GetClient(),
   553  			recorder: record.NewFakeRecorder(32),
   554  			managementCluster: &fakeManagementCluster{
   555  				Workload: fakeWorkloadCluster{
   556  					EtcdMembersResult: nodes(controlPlane.Machines),
   557  				},
   558  			},
   559  		}
   560  		controlPlane.InjectTestManagementCluster(r.managementCluster)
   561  
   562  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   563  
   564  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   565  		g.Expect(err).ToNot(HaveOccurred())
   566  
   567  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   568  
   569  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum")
   570  
   571  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
   572  	})
   573  	t.Run("Remediation does not happen if there are at least two additional unhealthy etcd members on a 5 machine CP", func(t *testing.T) {
   574  		g := NewWithT(t)
   575  
   576  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
   577  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
   578  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember())
   579  		m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember())
   580  		m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember())
   581  
   582  		controlPlane := &internal.ControlPlane{
   583  			KCP: &controlplanev1.KubeadmControlPlane{
   584  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   585  					Replicas: utilptr.To[int32](5),
   586  				},
   587  				Status: controlplanev1.KubeadmControlPlaneStatus{
   588  					Initialized: true,
   589  				},
   590  			},
   591  			Cluster:  &clusterv1.Cluster{},
   592  			Machines: collections.FromMachines(m1, m2, m3, m4, m5),
   593  		}
   594  
   595  		r := &KubeadmControlPlaneReconciler{
   596  			Client:   env.GetClient(),
   597  			recorder: record.NewFakeRecorder(32),
   598  			managementCluster: &fakeManagementCluster{
   599  				Workload: fakeWorkloadCluster{
   600  					EtcdMembersResult: nodes(controlPlane.Machines),
   601  				},
   602  			},
   603  		}
   604  		controlPlane.InjectTestManagementCluster(r.managementCluster)
   605  
   606  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   607  
   608  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   609  		g.Expect(err).ToNot(HaveOccurred())
   610  
   611  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   612  
   613  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum")
   614  
   615  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed())
   616  	})
   617  
   618  	// Remediation for when the control plane is not yet initialized
   619  
   620  	t.Run("Remediation deletes unhealthy machine - 1 CP not initialized", func(t *testing.T) {
   621  		g := NewWithT(t)
   622  
   623  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
   624  
   625  		controlPlane := &internal.ControlPlane{
   626  			KCP: &controlplanev1.KubeadmControlPlane{
   627  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   628  					Replicas: utilptr.To[int32](1),
   629  					Version:  "v1.19.1",
   630  				},
   631  				Status: controlplanev1.KubeadmControlPlaneStatus{
   632  					Initialized: false,
   633  				},
   634  			},
   635  			Cluster:  &clusterv1.Cluster{},
   636  			Machines: collections.FromMachines(m1),
   637  		}
   638  
   639  		r := &KubeadmControlPlaneReconciler{
   640  			Client:   env.GetClient(),
   641  			recorder: record.NewFakeRecorder(32),
   642  			managementCluster: &fakeManagementCluster{
   643  				Workload: fakeWorkloadCluster{
   644  					EtcdMembersResult: nodes(controlPlane.Machines),
   645  				},
   646  			},
   647  		}
   648  
   649  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   650  
   651  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   652  		g.Expect(err).ToNot(HaveOccurred())
   653  
   654  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   655  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   656  		g.Expect(err).ToNot(HaveOccurred())
   657  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   658  		g.Expect(remediationData.RetryCount).To(Equal(0))
   659  
   660  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   661  
   662  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   663  		g.Expect(err).ToNot(HaveOccurred())
   664  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   665  
   666  		removeFinalizer(g, m1)
   667  		g.Expect(env.Cleanup(ctx, m1)).To(Succeed())
   668  	})
   669  	t.Run("Subsequent remediation of the same machine increases retry count - 1 CP not initialized", func(t *testing.T) {
   670  		g := NewWithT(t)
   671  
   672  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
   673  
   674  		controlPlane := &internal.ControlPlane{
   675  			KCP: &controlplanev1.KubeadmControlPlane{
   676  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   677  					Replicas: utilptr.To[int32](1),
   678  					Version:  "v1.19.1",
   679  				},
   680  				Status: controlplanev1.KubeadmControlPlaneStatus{
   681  					Initialized: false,
   682  				},
   683  			},
   684  			Cluster:  &clusterv1.Cluster{},
   685  			Machines: collections.FromMachines(m1),
   686  		}
   687  
   688  		// First reconcile, remediate machine m1 for the first time
   689  		r := &KubeadmControlPlaneReconciler{
   690  			Client:   env.GetClient(),
   691  			recorder: record.NewFakeRecorder(32),
   692  			managementCluster: &fakeManagementCluster{
   693  				Workload: fakeWorkloadCluster{
   694  					EtcdMembersResult: nodes(controlPlane.Machines),
   695  				},
   696  			},
   697  		}
   698  
   699  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   700  
   701  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   702  		g.Expect(err).ToNot(HaveOccurred())
   703  
   704  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   705  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   706  		g.Expect(err).ToNot(HaveOccurred())
   707  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   708  		g.Expect(remediationData.RetryCount).To(Equal(0))
   709  
   710  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   711  
   712  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   713  		g.Expect(err).ToNot(HaveOccurred())
   714  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   715  
   716  		removeFinalizer(g, m1)
   717  		g.Expect(env.CleanupAndWait(ctx, m1)).To(Succeed())
   718  
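        		// Note (added comment): each iteration simulates a replacement machine that turns out to be unhealthy
        		// as well; the previous RemediationData is carried over via the remediate-for annotation, so KCP
        		// increments RetryCount instead of starting a new remediation history.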
   719  		for i := 2; i < 4; i++ {
   720  			// Simulate the creation of a replacement for m1.
   721  			mi := createMachine(ctx, g, ns.Name, fmt.Sprintf("m%d-unhealthy-", i), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData)))
   722  
   723  			// Simulate KCP dropping RemediationInProgressAnnotation after creating the replacement machine.
   724  			delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
   725  
   726  			controlPlane.Machines = collections.FromMachines(mi)
   727  
   728  			// Reconcile unhealthy replacements for m1.
   729  			r.managementCluster = &fakeManagementCluster{
   730  				Workload: fakeWorkloadCluster{
   731  					EtcdMembersResult: nodes(collections.FromMachines(mi)),
   732  				},
   733  			}
   734  			ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   735  
   736  			g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   737  			g.Expect(err).ToNot(HaveOccurred())
   738  
   739  			g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   740  			remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   741  			g.Expect(err).ToNot(HaveOccurred())
   742  			g.Expect(remediationData.Machine).To(Equal(mi.Name))
   743  			g.Expect(remediationData.RetryCount).To(Equal(i - 1))
   744  
   745  			assertMachineCondition(ctx, g, mi, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   746  
   747  			err = env.Get(ctx, client.ObjectKey{Namespace: mi.Namespace, Name: mi.Name}, mi)
   748  			g.Expect(err).ToNot(HaveOccurred())
   749  			g.Expect(mi.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   750  
   751  			removeFinalizer(g, mi)
   752  			g.Expect(env.CleanupAndWait(ctx, mi)).To(Succeed())
   753  		}
   754  	})
   755  
   756  	// Remediation for when the control plane is already initialized
   757  
   758  	t.Run("Remediation deletes unhealthy machine - 2 CP (during 1 CP rolling upgrade)", func(t *testing.T) {
   759  		g := NewWithT(t)
   760  
   761  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
   762  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   763  
   764  		controlPlane := &internal.ControlPlane{
   765  			KCP: &controlplanev1.KubeadmControlPlane{
   766  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   767  					Replicas: utilptr.To[int32](2),
   768  					Version:  "v1.19.1",
   769  				},
   770  				Status: controlplanev1.KubeadmControlPlaneStatus{
   771  					Initialized: true,
   772  				},
   773  			},
   774  			Cluster:  &clusterv1.Cluster{},
   775  			Machines: collections.FromMachines(m1, m2),
   776  		}
   777  
   778  		r := &KubeadmControlPlaneReconciler{
   779  			Client:   env.GetClient(),
   780  			recorder: record.NewFakeRecorder(32),
   781  			managementCluster: &fakeManagementCluster{
   782  				Workload: fakeWorkloadCluster{
   783  					EtcdMembersResult: nodes(controlPlane.Machines),
   784  				},
   785  			},
   786  		}
   787  		controlPlane.InjectTestManagementCluster(r.managementCluster)
   788  
   789  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   790  
   791  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   792  		g.Expect(err).ToNot(HaveOccurred())
   793  
   794  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   795  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   796  		g.Expect(err).ToNot(HaveOccurred())
   797  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   798  		g.Expect(remediationData.RetryCount).To(Equal(0))
   799  
   800  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   801  
   802  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   803  		g.Expect(err).ToNot(HaveOccurred())
   804  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   805  
   806  		removeFinalizer(g, m1)
   807  		g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed())
   808  	})
   809  	t.Run("Remediation deletes unhealthy machine - 3 CP", func(t *testing.T) {
   810  		g := NewWithT(t)
   811  
   812  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
   813  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   814  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   815  
   816  		controlPlane := &internal.ControlPlane{
   817  			KCP: &controlplanev1.KubeadmControlPlane{
   818  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   819  					Replicas: utilptr.To[int32](3),
   820  					Version:  "v1.19.1",
   821  				},
   822  				Status: controlplanev1.KubeadmControlPlaneStatus{
   823  					Initialized: true,
   824  				},
   825  			},
   826  			Cluster:  &clusterv1.Cluster{},
   827  			Machines: collections.FromMachines(m1, m2, m3),
   828  		}
   829  
   830  		r := &KubeadmControlPlaneReconciler{
   831  			Client:   env.GetClient(),
   832  			recorder: record.NewFakeRecorder(32),
   833  			managementCluster: &fakeManagementCluster{
   834  				Workload: fakeWorkloadCluster{
   835  					EtcdMembersResult: nodes(controlPlane.Machines),
   836  				},
   837  			},
   838  		}
   839  		controlPlane.InjectTestManagementCluster(r.managementCluster)
   840  
   841  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   842  
   843  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   844  		g.Expect(err).ToNot(HaveOccurred())
   845  
   846  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   847  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   848  		g.Expect(err).ToNot(HaveOccurred())
   849  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   850  		g.Expect(remediationData.RetryCount).To(Equal(0))
   851  
   852  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   853  
   854  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   855  		g.Expect(err).ToNot(HaveOccurred())
   856  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   857  
   858  		removeFinalizer(g, m1)
   859  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
   860  	})
   861  	t.Run("Remediation deletes unhealthy machine failed to provision - 3 CP", func(t *testing.T) {
   862  		g := NewWithT(t)
   863  
   864  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withoutNodeRef())
   865  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   866  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   867  
   868  		controlPlane := &internal.ControlPlane{
   869  			KCP: &controlplanev1.KubeadmControlPlane{
   870  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   871  					Replicas: utilptr.To(int32(3)),
   872  					Version:  "v1.19.1",
   873  				},
   874  				Status: controlplanev1.KubeadmControlPlaneStatus{
   875  					Initialized: true,
   876  				},
   877  			},
   878  			Cluster:  &clusterv1.Cluster{},
   879  			Machines: collections.FromMachines(m1, m2, m3),
   880  		}
   881  
   882  		r := &KubeadmControlPlaneReconciler{
   883  			Client:   env.GetClient(),
   884  			recorder: record.NewFakeRecorder(32),
   885  			managementCluster: &fakeManagementCluster{
   886  				Workload: fakeWorkloadCluster{
   887  					EtcdMembersResult: nodes(controlPlane.Machines),
   888  				},
   889  			},
   890  		}
   891  		controlPlane.InjectTestManagementCluster(r.managementCluster)
   892  
   893  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   894  
   895  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   896  		g.Expect(err).ToNot(HaveOccurred())
   897  
   898  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   899  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   900  		g.Expect(err).ToNot(HaveOccurred())
   901  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   902  		g.Expect(remediationData.RetryCount).To(Equal(0))
   903  
   904  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   905  
   906  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   907  		g.Expect(err).ToNot(HaveOccurred())
   908  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   909  
   910  		removeFinalizer(g, m1)
   911  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
   912  	})
   913  	t.Run("Remediation deletes unhealthy machine - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) {
   914  		g := NewWithT(t)
   915  
   916  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
   917  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   918  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   919  		m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember())
   920  
   921  		controlPlane := &internal.ControlPlane{
   922  			KCP: &controlplanev1.KubeadmControlPlane{
   923  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   924  					Replicas: utilptr.To[int32](4),
   925  					Version:  "v1.19.1",
   926  				},
   927  				Status: controlplanev1.KubeadmControlPlaneStatus{
   928  					Initialized: true,
   929  				},
   930  			},
   931  			Cluster:  &clusterv1.Cluster{},
   932  			Machines: collections.FromMachines(m1, m2, m3, m4),
   933  		}
   934  
   935  		r := &KubeadmControlPlaneReconciler{
   936  			Client:   env.GetClient(),
   937  			recorder: record.NewFakeRecorder(32),
   938  			managementCluster: &fakeManagementCluster{
   939  				Workload: fakeWorkloadCluster{
   940  					EtcdMembersResult: nodes(controlPlane.Machines),
   941  				},
   942  			},
   943  		}
   944  		controlPlane.InjectTestManagementCluster(r.managementCluster)
   945  
   946  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   947  
   948  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   949  		g.Expect(err).ToNot(HaveOccurred())
   950  
   951  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   952  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   953  		g.Expect(err).ToNot(HaveOccurred())
   954  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   955  		g.Expect(remediationData.RetryCount).To(Equal(0))
   956  
   957  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   958  
   959  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   960  		g.Expect(err).ToNot(HaveOccurred())
   961  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   962  
   963  		removeFinalizer(g, m1)
   964  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed())
   965  	})
   966  	t.Run("Remediation deletes unhealthy machine failed to provision - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) {
   967  		g := NewWithT(t)
   968  
   969  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withoutNodeRef())
   970  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   971  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   972  		m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember())
   973  
   974  		controlPlane := &internal.ControlPlane{
   975  			KCP: &controlplanev1.KubeadmControlPlane{
   976  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   977  					Replicas: utilptr.To(int32(4)),
   978  					Version:  "v1.19.1",
   979  				},
   980  				Status: controlplanev1.KubeadmControlPlaneStatus{
   981  					Initialized: true,
   982  				},
   983  			},
   984  			Cluster:  &clusterv1.Cluster{},
   985  			Machines: collections.FromMachines(m1, m2, m3, m4),
   986  		}
   987  
   988  		r := &KubeadmControlPlaneReconciler{
   989  			Client:   env.GetClient(),
   990  			recorder: record.NewFakeRecorder(32),
   991  			managementCluster: &fakeManagementCluster{
   992  				Workload: fakeWorkloadCluster{
   993  					EtcdMembersResult: nodes(controlPlane.Machines),
   994  				},
   995  			},
   996  		}
   997  		controlPlane.InjectTestManagementCluster(r.managementCluster)
   998  
   999  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
  1000  
  1001  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
  1002  		g.Expect(err).ToNot(HaveOccurred())
  1003  
  1004  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1005  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
  1006  		g.Expect(err).ToNot(HaveOccurred())
  1007  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
  1008  		g.Expect(remediationData.RetryCount).To(Equal(0))
  1009  
  1010  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
  1011  
  1012  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
  1013  		g.Expect(err).ToNot(HaveOccurred())
  1014  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
  1015  
  1016  		removeFinalizer(g, m1)
  1017  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed())
  1018  	})
  1019  	t.Run("Remediation fails gracefully if no healthy Control Planes are available to become etcd leader", func(t *testing.T) {
  1020  		g := NewWithT(t)
  1021  
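        		// Note (added comment): all machines fail their machine health check, so there is no healthy machine
        		// left to forward etcd leadership to; remediation must be skipped with a RemediationFailed condition.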
  1022  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
  1023  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember())
  1024  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember())
  1025  		m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember())
  1026  
  1027  		controlPlane := &internal.ControlPlane{
  1028  			KCP: &controlplanev1.KubeadmControlPlane{
  1029  				Spec: controlplanev1.KubeadmControlPlaneSpec{
  1030  					Replicas: utilptr.To[int32](4),
  1031  					Version:  "v1.19.1",
  1032  				},
  1033  				Status: controlplanev1.KubeadmControlPlaneStatus{
  1034  					Initialized: true,
  1035  				},
  1036  			},
  1037  			Cluster:  &clusterv1.Cluster{},
  1038  			Machines: collections.FromMachines(m1, m2, m3, m4),
  1039  		}
  1040  
  1041  		r := &KubeadmControlPlaneReconciler{
  1042  			Client:   env.GetClient(),
  1043  			recorder: record.NewFakeRecorder(32),
  1044  			managementCluster: &fakeManagementCluster{
  1045  				Workload: fakeWorkloadCluster{
  1046  					EtcdMembersResult: nodes(controlPlane.Machines),
  1047  				},
  1048  			},
  1049  		}
  1050  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1051  
  1052  		_, err = r.reconcileUnhealthyMachines(ctx, controlPlane)
  1053  		g.Expect(err).ToNot(HaveOccurred())
  1054  
  1055  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1056  
  1057  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning,
  1058  			"A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation")
  1059  
  1060  		removeFinalizer(g, m1)
  1061  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed())
  1062  	})
  1063  	t.Run("Subsequent remediation of the same machine increases retry count - 3 CP", func(t *testing.T) {
  1064  		g := NewWithT(t)
  1065  
  1066  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
  1067  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
  1068  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
  1069  
  1070  		controlPlane := &internal.ControlPlane{
  1071  			KCP: &controlplanev1.KubeadmControlPlane{
  1072  				Spec: controlplanev1.KubeadmControlPlaneSpec{
  1073  					Replicas: utilptr.To[int32](1),
  1074  					Version:  "v1.19.1",
  1075  				},
  1076  				Status: controlplanev1.KubeadmControlPlaneStatus{
  1077  					Initialized: false,
  1078  				},
  1079  			},
  1080  			Cluster:  &clusterv1.Cluster{},
  1081  			Machines: collections.FromMachines(m1, m2, m3),
  1082  		}
  1083  
  1084  		// First reconcile, remediate machine m1 for the first time
  1085  		r := &KubeadmControlPlaneReconciler{
  1086  			Client:   env.GetClient(),
  1087  			recorder: record.NewFakeRecorder(32),
  1088  			managementCluster: &fakeManagementCluster{
  1089  				Workload: fakeWorkloadCluster{
  1090  					EtcdMembersResult: nodes(controlPlane.Machines),
  1091  				},
  1092  			},
  1093  		}
  1094  
  1095  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
  1096  
  1097  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
  1098  		g.Expect(err).ToNot(HaveOccurred())
  1099  
  1100  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1101  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
  1102  		g.Expect(err).ToNot(HaveOccurred())
  1103  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
  1104  		g.Expect(remediationData.RetryCount).To(Equal(0))
  1105  
  1106  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
  1107  
  1108  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
  1109  		g.Expect(err).ToNot(HaveOccurred())
  1110  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
  1111  
  1112  		removeFinalizer(g, m1)
  1113  		g.Expect(env.CleanupAndWait(ctx, m1)).To(Succeed())
  1114  
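        		// Note (added comment): as above, the replacement machine inherits the remediation history via the
        		// annotation, so its RetryCount keeps increasing across subsequent remediations.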
  1115  		for i := 5; i < 6; i++ {
  1116  			// Simulate the creation of a replacement for m1.
  1117  			mi := createMachine(ctx, g, ns.Name, fmt.Sprintf("m%d-unhealthy-", i), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData)))
  1118  
  1119  			// Simulate KCP dropping RemediationInProgressAnnotation after creating the replacement machine.
  1120  			delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
  1121  			controlPlane.Machines = collections.FromMachines(mi, m2, m3)
  1122  
  1123  			// Reconcile unhealthy replacements for m1.
  1124  			r.managementCluster = &fakeManagementCluster{
  1125  				Workload: fakeWorkloadCluster{
  1126  					EtcdMembersResult: nodes(collections.FromMachines(mi, m2, m3)),
  1127  				},
  1128  			}
  1129  
  1130  			ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
  1131  
  1132  			g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
  1133  			g.Expect(err).ToNot(HaveOccurred())
  1134  
  1135  			g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1136  			remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
  1137  			g.Expect(err).ToNot(HaveOccurred())
  1138  			g.Expect(remediationData.Machine).To(Equal(mi.Name))
  1139  			g.Expect(remediationData.RetryCount).To(Equal(i - 4))
  1140  
  1141  			assertMachineCondition(ctx, g, mi, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
  1142  
  1143  			err = env.Get(ctx, client.ObjectKey{Namespace: mi.Namespace, Name: mi.Name}, mi)
  1144  			g.Expect(err).ToNot(HaveOccurred())
  1145  			g.Expect(mi.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
  1146  
  1147  			removeFinalizer(g, mi)
  1148  			g.Expect(env.CleanupAndWait(ctx, mi)).To(Succeed())
  1149  		}
  1150  
  1151  		g.Expect(env.CleanupAndWait(ctx, m2, m3)).To(Succeed())
  1152  	})
  1153  }
  1154  
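        // TestReconcileUnhealthyMachinesSequences exercises full remediation sequences against the test
        // environment: the first CP machine failing to come up, a later CP machine failing to come up,
        // and multiple simultaneous failures where only one machine is remediated per reconcile.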
  1155  func TestReconcileUnhealthyMachinesSequences(t *testing.T) {
  1156  	var removeFinalizer = func(g *WithT, m *clusterv1.Machine) {
  1157  		patchHelper, err := patch.NewHelper(m, env.GetClient())
  1158  		g.Expect(err).ToNot(HaveOccurred())
  1159  		m.ObjectMeta.Finalizers = nil
  1160  		g.Expect(patchHelper.Patch(ctx, m)).To(Succeed())
  1161  	}
  1162  
  1163  	t.Run("Remediates the first CP machine having problems to come up", func(t *testing.T) {
  1164  		g := NewWithT(t)
  1165  
  1166  		ns, err := env.CreateNamespace(ctx, "ns1")
  1167  		g.Expect(err).ToNot(HaveOccurred())
  1168  		defer func() {
  1169  			g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
  1170  		}()
  1171  
  1172  		// Control plane not initialized yet, the first CP is unhealthy and gets remediated:
  1173  
  1174  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
  1175  
  1176  		controlPlane := &internal.ControlPlane{
  1177  			KCP: &controlplanev1.KubeadmControlPlane{
  1178  				Spec: controlplanev1.KubeadmControlPlaneSpec{
  1179  					Replicas: utilptr.To[int32](3),
  1180  					Version:  "v1.19.1",
  1181  				},
  1182  				Status: controlplanev1.KubeadmControlPlaneStatus{
  1183  					Initialized: false,
  1184  				},
  1185  			},
  1186  			Cluster:  &clusterv1.Cluster{},
  1187  			Machines: collections.FromMachines(m1),
  1188  		}
  1189  
  1190  		r := &KubeadmControlPlaneReconciler{
  1191  			Client:   env.GetClient(),
  1192  			recorder: record.NewFakeRecorder(32),
  1193  			managementCluster: &fakeManagementCluster{
  1194  				Workload: fakeWorkloadCluster{
  1195  					EtcdMembersResult: nodes(controlPlane.Machines),
  1196  				},
  1197  			},
  1198  		}
  1199  
  1200  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
  1201  
  1202  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
  1203  		g.Expect(err).ToNot(HaveOccurred())
  1204  
  1205  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1206  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
  1207  		g.Expect(err).ToNot(HaveOccurred())
  1208  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
  1209  		g.Expect(remediationData.RetryCount).To(Equal(0))
  1210  
  1211  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
  1212  
  1213  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
  1214  		g.Expect(err).ToNot(HaveOccurred())
  1215  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
  1216  
  1217  		removeFinalizer(g, m1)
  1218  		g.Expect(env.Cleanup(ctx, m1)).To(Succeed())
  1219  
  1220  		// Fake a scale up that creates a replacement machine, then fast forward to when the replacement machine is also marked unhealthy.
  1221  		// NOTE: scale up also resets remediation in progress and remediation counts.
  1222  
  1223  		m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData)))
  1224  		delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
  1225  
  1226  		// Control plane not initialized yet, the replacement CP (m2) is also unhealthy and gets remediated (retry 1)
  1227  
  1228  		controlPlane.Machines = collections.FromMachines(m2)
  1229  		r.managementCluster = &fakeManagementCluster{
  1230  			Workload: fakeWorkloadCluster{
  1231  				EtcdMembersResult: nodes(controlPlane.Machines),
  1232  			},
  1233  		}
  1234  
  1235  		ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane)
  1236  
  1237  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
  1238  		g.Expect(err).ToNot(HaveOccurred())
  1239  
  1240  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1241  		remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
  1242  		g.Expect(err).ToNot(HaveOccurred())
  1243  		g.Expect(remediationData.Machine).To(Equal(m2.Name))
  1244  		g.Expect(remediationData.RetryCount).To(Equal(1))
  1245  
  1246  		assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
  1247  
  1248  		err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m2)
  1249  		g.Expect(err).ToNot(HaveOccurred())
  1250  		g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
  1251  
  1252  		removeFinalizer(g, m2)
  1253  		g.Expect(env.Cleanup(ctx, m2)).To(Succeed())
  1254  
  1255  		// Fake a scale up that creates a replacement machine, which this time is healthy.
  1256  		// NOTE: scale up also resets remediation in progress and remediation counts.
  1257  
  1258  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData)))
  1259  		delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
  1260  
  1261  		g.Expect(env.Cleanup(ctx, m3)).To(Succeed())
  1262  	})
  1263  
  1264  	t.Run("Remediates the second CP machine having problems to come up", func(t *testing.T) {
  1265  		g := NewWithT(t)
  1266  
  1267  		ns, err := env.CreateNamespace(ctx, "ns1")
  1268  		g.Expect(err).ToNot(HaveOccurred())
  1269  		defer func() {
  1270  			g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
  1271  		}()
  1272  
  1273  		// Control plane initialized, first CP healthy, second CP is unhealthy and gets remediated:
  1274  
  1275  		m1 := createMachine(ctx, g, ns.Name, "m1-healthy-", withHealthyEtcdMember())
  1276  		m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
  1277  
  1278  		controlPlane := &internal.ControlPlane{
  1279  			KCP: &controlplanev1.KubeadmControlPlane{
  1280  				Spec: controlplanev1.KubeadmControlPlaneSpec{
  1281  					Replicas: utilptr.To[int32](3),
  1282  					Version:  "v1.19.1",
  1283  					RolloutStrategy: &controlplanev1.RolloutStrategy{
  1284  						RollingUpdate: &controlplanev1.RollingUpdate{
  1285  							MaxSurge: &intstr.IntOrString{
  1286  								IntVal: 1,
  1287  							},
  1288  						},
  1289  					},
  1290  				},
  1291  				Status: controlplanev1.KubeadmControlPlaneStatus{
  1292  					Initialized: true,
  1293  				},
  1294  			},
  1295  			Cluster:  &clusterv1.Cluster{},
  1296  			Machines: collections.FromMachines(m1, m2),
  1297  		}
  1298  
  1299  		r := &KubeadmControlPlaneReconciler{
  1300  			Client:   env.GetClient(),
  1301  			recorder: record.NewFakeRecorder(32),
  1302  			managementCluster: &fakeManagementCluster{
  1303  				Workload: fakeWorkloadCluster{
  1304  					EtcdMembersResult: nodes(controlPlane.Machines),
  1305  				},
  1306  			},
  1307  		}
  1308  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1309  
  1310  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
  1311  
  1312  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
  1313  		g.Expect(err).ToNot(HaveOccurred())
  1314  
  1315  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1316  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
  1317  		g.Expect(err).ToNot(HaveOccurred())
  1318  		g.Expect(remediationData.Machine).To(Equal(m2.Name))
  1319  		g.Expect(remediationData.RetryCount).To(Equal(0))
  1320  
  1321  		assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
  1322  
  1323  		err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m2)
  1324  		g.Expect(err).ToNot(HaveOccurred())
  1325  		g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
  1326  
  1327  		removeFinalizer(g, m2)
  1328  		g.Expect(env.Cleanup(ctx, m2)).To(Succeed())
  1329  
  1330  		// Fake a scale up that creates a replacement machine, then fast forward to when the replacement machine is also marked unhealthy.
  1331  		// NOTE: scale up also resets remediation in progress and remediation counts.
  1332  
  1333  		m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData)))
  1334  		delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
  1335  
  1336  		// Control plane initialized, the replacement CP (m3) is also unhealthy and gets remediated (retry 1)
  1337  
  1338  		controlPlane.Machines = collections.FromMachines(m1, m3)
  1339  		r.managementCluster = &fakeManagementCluster{
  1340  			Workload: fakeWorkloadCluster{
  1341  				EtcdMembersResult: nodes(controlPlane.Machines),
  1342  			},
  1343  		}
  1344  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1345  
  1346  		ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane)
  1347  
  1348  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
  1349  		g.Expect(err).ToNot(HaveOccurred())
  1350  
  1351  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1352  		remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
  1353  		g.Expect(err).ToNot(HaveOccurred())
  1354  		g.Expect(remediationData.Machine).To(Equal(m3.Name))
  1355  		g.Expect(remediationData.RetryCount).To(Equal(1))
  1356  
  1357  		assertMachineCondition(ctx, g, m3, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
  1358  
  1359  		err = env.Get(ctx, client.ObjectKey{Namespace: m3.Namespace, Name: m3.Name}, m3)
  1360  		g.Expect(err).ToNot(HaveOccurred())
  1361  		g.Expect(m3.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
  1362  
  1363  		removeFinalizer(g, m3)
  1364  		g.Expect(env.Cleanup(ctx, m3)).To(Succeed())
  1365  
  1366  		// Fake a scale up that creates a replacement machine, which this time is healthy.
  1367  		// NOTE: scale up also resets remediation in progress and remediation counts.
  1368  
  1369  		m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData)))
  1370  		delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
  1371  
  1372  		g.Expect(env.Cleanup(ctx, m1, m4)).To(Succeed())
  1373  	})
  1374  
  1375  	t.Run("Remediates only one CP machine in case of multiple failures", func(t *testing.T) {
  1376  		g := NewWithT(t)
  1377  
  1378  		ns, err := env.CreateNamespace(ctx, "ns1")
  1379  		g.Expect(err).ToNot(HaveOccurred())
  1380  		defer func() {
  1381  			g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
  1382  		}()
  1383  
  1384  		// Control plane initialized, first CP healthy, second and third CP are unhealthy; the second gets remediated:
  1385  
  1386  		m1 := createMachine(ctx, g, ns.Name, "m1-healthy-", withHealthyEtcdMember())
  1387  		m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withHealthyEtcdMember(), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
  1388  		m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withHealthyEtcdMember(), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
  1389  
  1390  		controlPlane := &internal.ControlPlane{
  1391  			KCP: &controlplanev1.KubeadmControlPlane{
  1392  				Spec: controlplanev1.KubeadmControlPlaneSpec{
  1393  					Replicas: utilptr.To[int32](3),
  1394  					Version:  "v1.19.1",
  1395  					RolloutStrategy: &controlplanev1.RolloutStrategy{
  1396  						RollingUpdate: &controlplanev1.RollingUpdate{
  1397  							MaxSurge: &intstr.IntOrString{
  1398  								IntVal: 1,
  1399  							},
  1400  						},
  1401  					},
  1402  				},
  1403  				Status: controlplanev1.KubeadmControlPlaneStatus{
  1404  					Initialized: true,
  1405  				},
  1406  			},
  1407  			Cluster:  &clusterv1.Cluster{},
  1408  			Machines: collections.FromMachines(m1, m2, m3),
  1409  		}
  1410  
  1411  		r := &KubeadmControlPlaneReconciler{
  1412  			Client:   env.GetClient(),
  1413  			recorder: record.NewFakeRecorder(32),
  1414  			managementCluster: &fakeManagementCluster{
  1415  				Workload: fakeWorkloadCluster{
  1416  					EtcdMembersResult: nodes(controlPlane.Machines),
  1417  				},
  1418  			},
  1419  		}
  1420  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1421  
  1422  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
  1423  
  1424  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
  1425  		g.Expect(err).ToNot(HaveOccurred())
  1426  
  1427  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1428  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
  1429  		g.Expect(err).ToNot(HaveOccurred())
  1430  		g.Expect(remediationData.Machine).To(Equal(m2.Name))
  1431  		g.Expect(remediationData.RetryCount).To(Equal(0))
  1432  
  1433  		assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
  1434  		assertMachineCondition(ctx, g, m3, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
  1435  
  1436  		err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m2)
  1437  		g.Expect(err).ToNot(HaveOccurred())
  1438  		g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
  1439  
  1440  		removeFinalizer(g, m2)
  1441  		g.Expect(env.Cleanup(ctx, m2)).To(Succeed())
  1442  
  1443  		// Check that the next reconcile does not remediate again
  1444  
  1445  		controlPlane.Machines = collections.FromMachines(m1, m3)
  1446  		r.managementCluster = &fakeManagementCluster{
  1447  			Workload: fakeWorkloadCluster{
  1448  				EtcdMembersResult: nodes(controlPlane.Machines),
  1449  			},
  1450  		}
  1451  
  1452  		ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane)
  1453  
  1454  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
  1455  		g.Expect(err).ToNot(HaveOccurred())
  1456  
  1457  		g.Expect(env.Cleanup(ctx, m1)).To(Succeed())
  1458  	})
  1459  }
  1460  
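        // TestCanSafelyRemoveEtcdMember verifies the etcd safety check performed before remediation:
        // a member is removed only if the remaining members can still tolerate the additional etcd
        // failures, covering 1, 2, 3, 5, and 7 machine control planes.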
  1461  func TestCanSafelyRemoveEtcdMember(t *testing.T) {
  1462  	g := NewWithT(t)
  1463  
  1464  	ns, err := env.CreateNamespace(ctx, "ns1")
  1465  	g.Expect(err).ToNot(HaveOccurred())
  1466  	defer func() {
  1467  		g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
  1468  	}()
  1469  
  1470  	t.Run("Can't safely remediate 1 machine CP", func(t *testing.T) {
  1471  		g := NewWithT(t)
  1472  
  1473  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1474  
  1475  		controlPlane := &internal.ControlPlane{
  1476  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1477  				Replicas: utilptr.To[int32](1),
  1478  			}},
  1479  			Cluster:  &clusterv1.Cluster{},
  1480  			Machines: collections.FromMachines(m1),
  1481  		}
  1482  
  1483  		r := &KubeadmControlPlaneReconciler{
  1484  			Client:   env.GetClient(),
  1485  			recorder: record.NewFakeRecorder(32),
  1486  			managementCluster: &fakeManagementCluster{
  1487  				Workload: fakeWorkloadCluster{
  1488  					EtcdMembersResult: nodes(controlPlane.Machines),
  1489  				},
  1490  			},
  1491  		}
  1492  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1493  
  1494  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1495  		g.Expect(ret).To(BeFalse())
  1496  		g.Expect(err).ToNot(HaveOccurred())
  1497  
  1498  		g.Expect(env.Cleanup(ctx, m1)).To(Succeed())
  1499  	})
  1500  
  1501  	t.Run("Can safely remediate 2 machine CP without additional etcd member failures", func(t *testing.T) {
  1502  		g := NewWithT(t)
  1503  
  1504  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1505  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember())
  1506  
  1507  		controlPlane := &internal.ControlPlane{
  1508  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1509  				Replicas: utilptr.To[int32](3),
  1510  			}},
  1511  			Cluster:  &clusterv1.Cluster{},
  1512  			Machines: collections.FromMachines(m1, m2),
  1513  		}
  1514  
  1515  		r := &KubeadmControlPlaneReconciler{
  1516  			Client:   env.GetClient(),
  1517  			recorder: record.NewFakeRecorder(32),
  1518  			managementCluster: &fakeManagementCluster{
  1519  				Workload: fakeWorkloadCluster{
  1520  					EtcdMembersResult: nodes(controlPlane.Machines),
  1521  				},
  1522  			},
  1523  		}
  1524  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1525  
  1526  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1527  		g.Expect(ret).To(BeTrue())
  1528  		g.Expect(err).ToNot(HaveOccurred())
  1529  
  1530  		g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed())
  1531  	})
  1532  	t.Run("Can safely remediate 2 machines CP when the etcd member being remediated is missing", func(t *testing.T) {
  1533  		g := NewWithT(t)
  1534  
  1535  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1536  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember())
  1537  
  1538  		controlPlane := &internal.ControlPlane{
  1539  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1540  				Replicas: utilptr.To[int32](3),
  1541  			}},
  1542  			Cluster:  &clusterv1.Cluster{},
  1543  			Machines: collections.FromMachines(m1, m2),
  1544  		}
  1545  
  1546  		members := make([]string, 0, len(controlPlane.Machines)-1)
  1547  		for _, n := range nodes(controlPlane.Machines) {
  1548  			if !strings.Contains(n, "m1-mhc-unhealthy-") {
  1549  				members = append(members, n)
  1550  			}
  1551  		}
  1552  
  1553  		r := &KubeadmControlPlaneReconciler{
  1554  			Client:   env.GetClient(),
  1555  			recorder: record.NewFakeRecorder(32),
  1556  			managementCluster: &fakeManagementCluster{
  1557  				Workload: fakeWorkloadCluster{
  1558  					EtcdMembersResult: members,
  1559  				},
  1560  			},
  1561  		}
  1562  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1563  
  1564  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1565  		g.Expect(ret).To(BeTrue())
  1566  		g.Expect(err).ToNot(HaveOccurred())
  1567  
  1568  		g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed())
  1569  	})
  1570  	t.Run("Can't safely remediate 2 machines CP with one additional etcd member failure", func(t *testing.T) {
  1571  		g := NewWithT(t)
  1572  
  1573  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1574  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
  1575  
  1576  		controlPlane := &internal.ControlPlane{
  1577  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1578  				Replicas: utilptr.To[int32](3),
  1579  			}},
  1580  			Cluster:  &clusterv1.Cluster{},
  1581  			Machines: collections.FromMachines(m1, m2),
  1582  		}
  1583  
  1584  		r := &KubeadmControlPlaneReconciler{
  1585  			Client:   env.GetClient(),
  1586  			recorder: record.NewFakeRecorder(32),
  1587  			managementCluster: &fakeManagementCluster{
  1588  				Workload: fakeWorkloadCluster{
  1589  					EtcdMembersResult: nodes(controlPlane.Machines),
  1590  				},
  1591  			},
  1592  		}
  1593  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1594  
  1595  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1596  		g.Expect(ret).To(BeFalse())
  1597  		g.Expect(err).ToNot(HaveOccurred())
  1598  
  1599  		g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed())
  1600  	})
  1601  	t.Run("Can safely remediate 3 machines CP without additional etcd member failures", func(t *testing.T) {
  1602  		g := NewWithT(t)
  1603  
  1604  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1605  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember())
  1606  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember())
  1607  
  1608  		controlPlane := &internal.ControlPlane{
  1609  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1610  				Replicas: utilptr.To[int32](3),
  1611  			}},
  1612  			Cluster:  &clusterv1.Cluster{},
  1613  			Machines: collections.FromMachines(m1, m2, m3),
  1614  		}
  1615  
  1616  		r := &KubeadmControlPlaneReconciler{
  1617  			Client:   env.GetClient(),
  1618  			recorder: record.NewFakeRecorder(32),
  1619  			managementCluster: &fakeManagementCluster{
  1620  				Workload: fakeWorkloadCluster{
  1621  					EtcdMembersResult: nodes(controlPlane.Machines),
  1622  				},
  1623  			},
  1624  		}
  1625  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1626  
  1627  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1628  		g.Expect(ret).To(BeTrue())
  1629  		g.Expect(err).ToNot(HaveOccurred())
  1630  
  1631  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
  1632  	})
  1633  	t.Run("Can safely remediate 3 machines CP when the etcd member being remediated is missing", func(t *testing.T) {
  1634  		g := NewWithT(t)
  1635  
  1636  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1637  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember())
  1638  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember())
  1639  
  1640  		controlPlane := &internal.ControlPlane{
  1641  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1642  				Replicas: utilptr.To[int32](3),
  1643  			}},
  1644  			Cluster:  &clusterv1.Cluster{},
  1645  			Machines: collections.FromMachines(m1, m2, m3),
  1646  		}
  1647  
  1648  		members := make([]string, 0, len(controlPlane.Machines)-1)
  1649  		for _, n := range nodes(controlPlane.Machines) {
  1650  			if !strings.Contains(n, "m1-mhc-unhealthy-") {
  1651  				members = append(members, n)
  1652  			}
  1653  		}
  1654  
  1655  		r := &KubeadmControlPlaneReconciler{
  1656  			Client:   env.GetClient(),
  1657  			recorder: record.NewFakeRecorder(32),
  1658  			managementCluster: &fakeManagementCluster{
  1659  				Workload: fakeWorkloadCluster{
  1660  					EtcdMembersResult: members,
  1661  				},
  1662  			},
  1663  		}
  1664  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1665  
  1666  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1667  		g.Expect(ret).To(BeTrue())
  1668  		g.Expect(err).ToNot(HaveOccurred())
  1669  
  1670  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
  1671  	})
  1672  	t.Run("Can't safely remediate 3 machines CP with one additional etcd member failure", func(t *testing.T) {
  1673  		g := NewWithT(t)
  1674  
  1675  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1676  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
  1677  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember())
  1678  
  1679  		controlPlane := &internal.ControlPlane{
  1680  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1681  				Replicas: utilptr.To[int32](3),
  1682  			}},
  1683  			Cluster:  &clusterv1.Cluster{},
  1684  			Machines: collections.FromMachines(m1, m2, m3),
  1685  		}
  1686  
  1687  		r := &KubeadmControlPlaneReconciler{
  1688  			Client:   env.GetClient(),
  1689  			recorder: record.NewFakeRecorder(32),
  1690  			managementCluster: &fakeManagementCluster{
  1691  				Workload: fakeWorkloadCluster{
  1692  					EtcdMembersResult: nodes(controlPlane.Machines),
  1693  				},
  1694  			},
  1695  		}
  1696  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1697  
  1698  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1699  		g.Expect(ret).To(BeFalse())
  1700  		g.Expect(err).ToNot(HaveOccurred())
  1701  
  1702  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
  1703  	})
  1704  	t.Run("Can safely remediate 5 machines CP with less than 2 additional etcd member failures", func(t *testing.T) {
  1705  		g := NewWithT(t)
  1706  
  1707  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1708  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
  1709  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember())
  1710  		m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember())
  1711  		m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember())
  1712  
  1713  		controlPlane := &internal.ControlPlane{
  1714  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1715  				Replicas: utilptr.To[int32](5),
  1716  			}},
  1717  			Cluster:  &clusterv1.Cluster{},
  1718  			Machines: collections.FromMachines(m1, m2, m3, m4, m5),
  1719  		}
  1720  
  1721  		r := &KubeadmControlPlaneReconciler{
  1722  			Client:   env.GetClient(),
  1723  			recorder: record.NewFakeRecorder(32),
  1724  			managementCluster: &fakeManagementCluster{
  1725  				Workload: fakeWorkloadCluster{
  1726  					EtcdMembersResult: nodes(controlPlane.Machines),
  1727  				},
  1728  			},
  1729  		}
  1730  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1731  
  1732  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1733  		g.Expect(ret).To(BeTrue())
  1734  		g.Expect(err).ToNot(HaveOccurred())
  1735  
  1736  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed())
  1737  	})
  1738  	t.Run("Can't safely remediate 5 machines CP with 2 additional etcd member failures", func(t *testing.T) {
  1739  		g := NewWithT(t)
  1740  
  1741  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1742  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
  1743  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember())
  1744  		m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember())
  1745  		m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember())
  1746  
  1747  		controlPlane := &internal.ControlPlane{
  1748  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1749  				Replicas: utilptr.To[int32](7),
  1750  			}},
  1751  			Cluster:  &clusterv1.Cluster{},
  1752  			Machines: collections.FromMachines(m1, m2, m3, m4, m5),
  1753  		}
  1754  
  1755  		r := &KubeadmControlPlaneReconciler{
  1756  			Client:   env.GetClient(),
  1757  			recorder: record.NewFakeRecorder(32),
  1758  			managementCluster: &fakeManagementCluster{
  1759  				Workload: fakeWorkloadCluster{
  1760  					EtcdMembersResult: nodes(controlPlane.Machines),
  1761  				},
  1762  			},
  1763  		}
  1764  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1765  
  1766  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1767  		g.Expect(ret).To(BeFalse())
  1768  		g.Expect(err).ToNot(HaveOccurred())
  1769  
  1770  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed())
  1771  	})
  1772  	t.Run("Can safely remediate 7 machines CP with less than 3 additional etcd member failures", func(t *testing.T) {
  1773  		g := NewWithT(t)
  1774  
  1775  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1776  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
  1777  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember())
  1778  		m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember())
  1779  		m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember())
  1780  		m6 := createMachine(ctx, g, ns.Name, "m6-etcd-healthy-", withHealthyEtcdMember())
  1781  		m7 := createMachine(ctx, g, ns.Name, "m7-etcd-healthy-", withHealthyEtcdMember())
  1782  
  1783  		controlPlane := &internal.ControlPlane{
  1784  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1785  				Replicas: utilptr.To[int32](7),
  1786  			}},
  1787  			Cluster:  &clusterv1.Cluster{},
  1788  			Machines: collections.FromMachines(m1, m2, m3, m4, m5, m6, m7),
  1789  		}
  1790  
  1791  		r := &KubeadmControlPlaneReconciler{
  1792  			Client:   env.GetClient(),
  1793  			recorder: record.NewFakeRecorder(32),
  1794  			managementCluster: &fakeManagementCluster{
  1795  				Workload: fakeWorkloadCluster{
  1796  					EtcdMembersResult: nodes(controlPlane.Machines),
  1797  				},
  1798  			},
  1799  		}
  1800  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1801  
  1802  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1803  		g.Expect(ret).To(BeTrue())
  1804  		g.Expect(err).ToNot(HaveOccurred())
  1805  
  1806  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5, m6, m7)).To(Succeed())
  1807  	})
  1808  	t.Run("Can't safely remediate 7 machines CP with 3 additional etcd member failures", func(t *testing.T) {
  1809  		g := NewWithT(t)
  1810  
  1811  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1812  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
  1813  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember())
  1814  		m4 := createMachine(ctx, g, ns.Name, "m4-etcd-unhealthy-", withUnhealthyEtcdMember())
  1815  		m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember())
  1816  		m6 := createMachine(ctx, g, ns.Name, "m6-etcd-healthy-", withHealthyEtcdMember())
  1817  		m7 := createMachine(ctx, g, ns.Name, "m7-etcd-healthy-", withHealthyEtcdMember())
  1818  
  1819  		controlPlane := &internal.ControlPlane{
  1820  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1821  				Replicas: utilptr.To[int32](5),
  1822  			}},
  1823  			Cluster:  &clusterv1.Cluster{},
  1824  			Machines: collections.FromMachines(m1, m2, m3, m4, m5, m6, m7),
  1825  		}
  1826  
  1827  		r := &KubeadmControlPlaneReconciler{
  1828  			Client:   env.GetClient(),
  1829  			recorder: record.NewFakeRecorder(32),
  1830  			managementCluster: &fakeManagementCluster{
  1831  				Workload: fakeWorkloadCluster{
  1832  					EtcdMembersResult: nodes(controlPlane.Machines),
  1833  				},
  1834  			},
  1835  		}
  1836  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1837  
  1838  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1839  		g.Expect(ret).To(BeFalse())
  1840  		g.Expect(err).ToNot(HaveOccurred())
  1841  
  1842  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5, m6, m7)).To(Succeed())
  1843  	})
  1844  }
  1845  
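        // nodes returns the names of the nodes referenced by the given machines' status.nodeRef,
        // skipping machines without a NodeRef; the tests use it to seed fakeWorkloadCluster.EtcdMembersResult.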
  1846  func nodes(machines collections.Machines) []string {
  1847  	nodes := make([]string, 0, machines.Len())
  1848  	for _, m := range machines {
  1849  		if m.Status.NodeRef != nil {
  1850  			nodes = append(nodes, m.Status.NodeRef.Name)
  1851  		}
  1852  	}
  1853  	return nodes
  1854  }
  1855  
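        // machineOption mutates a Machine in place; options are applied by createMachine and
        // getDeletingMachine to shape test fixtures, e.g.
        //
        //	createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())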
  1856  type machineOption func(*clusterv1.Machine)
  1857  
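        // withMachineHealthCheckFailed marks the machine as having failed its MachineHealthCheck
        // and as waiting to be remediated by its owner controller.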
  1858  func withMachineHealthCheckFailed() machineOption {
  1859  	return func(machine *clusterv1.Machine) {
  1860  		conditions.MarkFalse(machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "")
  1861  		conditions.MarkFalse(machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
  1862  	}
  1863  }
  1864  
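        // withStuckRemediation marks the machine as passing its MachineHealthCheck while still
        // waiting for owner remediation, simulating a remediation request that never completed.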
  1865  func withStuckRemediation() machineOption {
  1866  	return func(machine *clusterv1.Machine) {
  1867  		conditions.MarkTrue(machine, clusterv1.MachineHealthCheckSucceededCondition)
  1868  		conditions.MarkFalse(machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
  1869  	}
  1870  }
  1871  
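        // withHealthyEtcdMember marks the machine's etcd member as healthy.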
  1872  func withHealthyEtcdMember() machineOption {
  1873  	return func(machine *clusterv1.Machine) {
  1874  		conditions.MarkTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition)
  1875  	}
  1876  }
  1877  
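        // withUnhealthyEtcdMember marks the machine's etcd member as unhealthy.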
  1878  func withUnhealthyEtcdMember() machineOption {
  1879  	return func(machine *clusterv1.Machine) {
  1880  		conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "")
  1881  	}
  1882  }
  1883  
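        // withUnhealthyAPIServerPod marks the machine's API server pod as unhealthy.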
  1884  func withUnhealthyAPIServerPod() machineOption {
  1885  	return func(machine *clusterv1.Machine) {
  1886  		conditions.MarkFalse(machine, controlplanev1.MachineAPIServerPodHealthyCondition, controlplanev1.ControlPlaneComponentsUnhealthyReason, clusterv1.ConditionSeverityError, "")
  1887  	}
  1888  }
  1889  
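        // withNodeRef sets the machine's status.nodeRef to a Node with the given name.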
  1890  func withNodeRef(ref string) machineOption {
  1891  	return func(machine *clusterv1.Machine) {
  1892  		machine.Status.NodeRef = &corev1.ObjectReference{
  1893  			Kind: "Node",
  1894  			Name: ref,
  1895  		}
  1896  	}
  1897  }
  1898  
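        // withoutNodeRef clears the machine's status.nodeRef, simulating a machine that is still provisioning.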
  1899  func withoutNodeRef() machineOption {
  1900  	return func(machine *clusterv1.Machine) {
  1901  		machine.Status.NodeRef = nil
  1902  	}
  1903  }
  1904  
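        // withRemediateForAnnotation sets the RemediationForAnnotation to the given marshalled
        // RemediationData, marking the machine as the replacement created by a previous remediation.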
  1905  func withRemediateForAnnotation(remediatedFor string) machineOption {
  1906  	return func(machine *clusterv1.Machine) {
  1907  		if machine.Annotations == nil {
  1908  			machine.Annotations = map[string]string{}
  1909  		}
  1910  		machine.Annotations[controlplanev1.RemediationForAnnotation] = remediatedFor
  1911  	}
  1912  }
  1913  
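        // withWaitBeforeDeleteFinalizer adds a finalizer so the machine is not removed from the API
        // server until the test explicitly drops the finalizer.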
  1914  func withWaitBeforeDeleteFinalizer() machineOption {
  1915  	return func(machine *clusterv1.Machine) {
  1916  		machine.Finalizers = []string{"wait-before-delete"}
  1917  	}
  1918  }
  1919  
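        // createMachine creates a Machine with a generated name in the given namespace, waits for it
        // to exist, and then patches it with a default NodeRef plus any supplied options.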
  1920  func createMachine(ctx context.Context, g *WithT, namespace, name string, options ...machineOption) *clusterv1.Machine {
  1921  	m := &clusterv1.Machine{
  1922  		ObjectMeta: metav1.ObjectMeta{
  1923  			Namespace:    namespace,
  1924  			GenerateName: name,
  1925  		},
  1926  		Spec: clusterv1.MachineSpec{
  1927  			ClusterName: "cluster",
  1928  			Bootstrap: clusterv1.Bootstrap{
  1929  				DataSecretName: utilptr.To("secret"),
  1930  			},
  1931  		},
  1932  	}
  1933  	g.Expect(env.CreateAndWait(ctx, m)).To(Succeed())
  1934  
  1935  	patchHelper, err := patch.NewHelper(m, env.GetClient())
  1936  	g.Expect(err).ToNot(HaveOccurred())
  1937  
  1938  	for _, opt := range append([]machineOption{withNodeRef(fmt.Sprintf("node-%s", m.Name))}, options...) {
  1939  		opt(m)
  1940  	}
  1941  
  1942  	g.Expect(patchHelper.Patch(ctx, m)).To(Succeed())
  1943  	return m
  1944  }
  1945  
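        // getDeletingMachine returns an in-memory Machine with the deletion timestamp already set;
        // it is not created in the API server. A default NodeRef and any supplied options are applied.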
  1946  func getDeletingMachine(namespace, name string, options ...machineOption) *clusterv1.Machine {
  1947  	deletionTime := metav1.Now()
  1948  	m := &clusterv1.Machine{
  1949  		ObjectMeta: metav1.ObjectMeta{
  1950  			Namespace:         namespace,
  1951  			Name:              name,
  1952  			DeletionTimestamp: &deletionTime,
  1953  		},
  1954  		Spec: clusterv1.MachineSpec{
  1955  			ClusterName: "cluster",
  1956  			Bootstrap: clusterv1.Bootstrap{
  1957  				DataSecretName: utilptr.To("secret"),
  1958  			},
  1959  		},
  1960  	}
  1961  
  1962  	for _, opt := range append([]machineOption{withNodeRef(fmt.Sprintf("node-%s", m.Name))}, options...) {
  1963  		opt(m)
  1964  	}
  1965  	return m
  1966  }
  1967  
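        // assertMachineCondition polls the API server for up to 10 seconds until the given condition
        // on the machine matches the expected status, reason, severity, and message.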
  1968  func assertMachineCondition(ctx context.Context, g *WithT, m *clusterv1.Machine, t clusterv1.ConditionType, status corev1.ConditionStatus, reason string, severity clusterv1.ConditionSeverity, message string) {
  1969  	g.Eventually(func() error {
  1970  		if err := env.Get(ctx, client.ObjectKey{Namespace: m.Namespace, Name: m.Name}, m); err != nil {
  1971  			return err
  1972  		}
  1973  		c := conditions.Get(m, t)
  1974  		if c == nil {
  1975  			return errors.Errorf("condition %q was nil", t)
  1976  		}
  1977  		if c.Status != status {
  1978  			return errors.Errorf("condition %q status %q did not match %q", t, c.Status, status)
  1979  		}
  1980  		if c.Reason != reason {
  1981  			return errors.Errorf("condition %q reason %q did not match %q", t, c.Reason, reason)
  1982  		}
  1983  		if c.Severity != severity {
  1984  			return errors.Errorf("condition %q severity %q did not match %q", t, c.Severity, severity)
  1985  		}
  1986  		if c.Message != message {
  1987  			return errors.Errorf("condition %q message %q did not match %q", t, c.Message, message)
  1988  		}
  1989  		return nil
  1990  	}, 10*time.Second).Should(Succeed())
  1991  }
  1992  
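        // MustMarshalRemediationData marshals the given RemediationData for use in annotations,
        // panicking on error; tests use it where marshalling is not expected to fail.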
  1993  func MustMarshalRemediationData(r *RemediationData) string {
  1994  	s, err := r.Marshal()
  1995  	if err != nil {
  1996  		panic("failed to marshal remediation data")
  1997  	}
  1998  	return s
  1999  }