sigs.k8s.io/cluster-api@v1.6.3/controlplane/kubeadm/internal/controllers/remediation_test.go (about)

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package controllers
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"strings"
    23  	"testing"
    24  	"time"
    25  
    26  	. "github.com/onsi/gomega"
    27  	"github.com/pkg/errors"
    28  	corev1 "k8s.io/api/core/v1"
    29  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    30  	"k8s.io/apimachinery/pkg/util/intstr"
    31  	"k8s.io/client-go/tools/record"
    32  	utilpointer "k8s.io/utils/pointer"
    33  	"sigs.k8s.io/controller-runtime/pkg/client"
    34  
    35  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    36  	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
    37  	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
    38  	"sigs.k8s.io/cluster-api/util/collections"
    39  	"sigs.k8s.io/cluster-api/util/conditions"
    40  	"sigs.k8s.io/cluster-api/util/patch"
    41  )
    42  
    43  func TestReconcileUnhealthyMachines(t *testing.T) {
    44  	g := NewWithT(t)
    45  
    46  	r := &KubeadmControlPlaneReconciler{
    47  		Client:   env.GetClient(),
    48  		recorder: record.NewFakeRecorder(32),
    49  	}
    50  	ns, err := env.CreateNamespace(ctx, "ns1")
    51  	g.Expect(err).ToNot(HaveOccurred())
    52  	defer func() {
    53  		g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
    54  	}()
    55  
    56  	var removeFinalizer = func(g *WithT, m *clusterv1.Machine) {
    57  		patchHelper, err := patch.NewHelper(m, env.GetClient())
    58  		g.Expect(err).ToNot(HaveOccurred())
    59  		m.ObjectMeta.Finalizers = nil
    60  		g.Expect(patchHelper.Patch(ctx, m)).To(Succeed())
    61  	}
    62  
    63  	t.Run("It cleans up stuck remediation on previously unhealthy machines", func(t *testing.T) {
    64  		g := NewWithT(t)
    65  
    66  		m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation())
    67  
    68  		controlPlane := &internal.ControlPlane{
    69  			KCP:      &controlplanev1.KubeadmControlPlane{},
    70  			Cluster:  &clusterv1.Cluster{},
    71  			Machines: collections.FromMachines(m),
    72  		}
    73  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
    74  
    75  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
    76  		g.Expect(err).ToNot(HaveOccurred())
    77  
    78  		g.Eventually(func() error {
    79  			if err := env.Get(ctx, client.ObjectKey{Namespace: m.Namespace, Name: m.Name}, m); err != nil {
    80  				return err
    81  			}
    82  			c := conditions.Get(m, clusterv1.MachineOwnerRemediatedCondition)
    83  			if c == nil {
    84  				return nil
    85  			}
    86  			return errors.Errorf("condition %s still exists", clusterv1.MachineOwnerRemediatedCondition)
    87  		}, 10*time.Second).Should(Succeed())
    88  	})
    89  
    90  	// Generic preflight checks
     91  	// Those are preflight checks that happen no matter if the control plane has been already initialized or not.
    92  
    93  	t.Run("Remediation does not happen if there are no unhealthy machines", func(t *testing.T) {
    94  		g := NewWithT(t)
    95  
    96  		controlPlane := &internal.ControlPlane{
    97  			KCP:      &controlplanev1.KubeadmControlPlane{},
    98  			Cluster:  &clusterv1.Cluster{},
    99  			Machines: collections.New(),
   100  		}
   101  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   102  
   103  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   104  		g.Expect(err).ToNot(HaveOccurred())
   105  	})
   106  	t.Run("reconcileUnhealthyMachines return early if another remediation is in progress", func(t *testing.T) {
   107  		g := NewWithT(t)
   108  
   109  		m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation())
   110  		conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "")
   111  		conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
   112  		controlPlane := &internal.ControlPlane{
   113  			KCP: &controlplanev1.KubeadmControlPlane{
   114  				ObjectMeta: metav1.ObjectMeta{
   115  					Annotations: map[string]string{
   116  						controlplanev1.RemediationInProgressAnnotation: MustMarshalRemediationData(&RemediationData{
   117  							Machine:    "foo",
   118  							Timestamp:  metav1.Time{Time: time.Now().UTC()},
   119  							RetryCount: 0,
   120  						}),
   121  					},
   122  				},
   123  			},
   124  			Cluster:  &clusterv1.Cluster{},
   125  			Machines: collections.FromMachines(m),
   126  		}
   127  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   128  
   129  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   130  		g.Expect(err).ToNot(HaveOccurred())
   131  	})
   132  	t.Run("reconcileUnhealthyMachines return early if the machine to be remediated is already being deleted", func(t *testing.T) {
   133  		g := NewWithT(t)
   134  
   135  		m := getDeletingMachine(ns.Name, "m1-unhealthy-deleting-", withMachineHealthCheckFailed())
   136  		conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "")
   137  		conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
   138  		controlPlane := &internal.ControlPlane{
   139  			KCP:      &controlplanev1.KubeadmControlPlane{},
   140  			Cluster:  &clusterv1.Cluster{},
   141  			Machines: collections.FromMachines(m),
   142  		}
   143  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   144  
   145  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   146  
   147  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   148  		g.Expect(err).ToNot(HaveOccurred())
   149  	})
   150  	t.Run("Remediation does not happen if MaxRetry is reached", func(t *testing.T) {
   151  		g := NewWithT(t)
   152  
   153  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{
   154  			Machine:    "m0",
    155  			Timestamp:  metav1.Time{Time: time.Now().Add(-controlplanev1.DefaultMinHealthyPeriod / 2).UTC()}, // minHealthyPeriod not yet expired.
   156  			RetryCount: 3,
   157  		})))
   158  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   159  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   160  
   161  		controlPlane := &internal.ControlPlane{
   162  			KCP: &controlplanev1.KubeadmControlPlane{
   163  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   164  					Replicas: utilpointer.Int32(3),
   165  					Version:  "v1.19.1",
   166  					RemediationStrategy: &controlplanev1.RemediationStrategy{
   167  						MaxRetry: utilpointer.Int32(3),
   168  					},
   169  				},
   170  			},
   171  			Cluster:  &clusterv1.Cluster{},
   172  			Machines: collections.FromMachines(m1, m2, m3),
   173  		}
   174  
   175  		r := &KubeadmControlPlaneReconciler{
   176  			Client:   env.GetClient(),
   177  			recorder: record.NewFakeRecorder(32),
   178  			managementCluster: &fakeManagementCluster{
   179  				Workload: fakeWorkloadCluster{
   180  					EtcdMembersResult: nodes(controlPlane.Machines),
   181  				},
   182  			},
   183  		}
   184  
   185  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   186  
   187  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   188  		g.Expect(err).ToNot(HaveOccurred())
   189  
   190  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   191  
   192  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed 3 times (MaxRetry)")
   193  
   194  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   195  		g.Expect(err).ToNot(HaveOccurred())
   196  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeTrue())
   197  
   198  		removeFinalizer(g, m1)
   199  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
   200  	})
   201  	t.Run("Retry history is ignored if min healthy period is expired, default min healthy period", func(t *testing.T) {
   202  		g := NewWithT(t)
   203  
   204  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{
   205  			Machine:    "m0",
   206  			Timestamp:  metav1.Time{Time: time.Now().Add(-2 * controlplanev1.DefaultMinHealthyPeriod).UTC()}, // minHealthyPeriod already expired.
   207  			RetryCount: 3,
   208  		})))
   209  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   210  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   211  
   212  		controlPlane := &internal.ControlPlane{
   213  			KCP: &controlplanev1.KubeadmControlPlane{
   214  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   215  					Replicas: utilpointer.Int32(3),
   216  					Version:  "v1.19.1",
   217  					RemediationStrategy: &controlplanev1.RemediationStrategy{
   218  						MaxRetry: utilpointer.Int32(3),
   219  					},
   220  				},
   221  			},
   222  			Cluster:  &clusterv1.Cluster{},
   223  			Machines: collections.FromMachines(m1, m2, m3),
   224  		}
   225  
   226  		r := &KubeadmControlPlaneReconciler{
   227  			Client:   env.GetClient(),
   228  			recorder: record.NewFakeRecorder(32),
   229  			managementCluster: &fakeManagementCluster{
   230  				Workload: fakeWorkloadCluster{
   231  					EtcdMembersResult: nodes(controlPlane.Machines),
   232  				},
   233  			},
   234  		}
   235  
   236  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   237  
   238  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   239  		g.Expect(err).ToNot(HaveOccurred())
   240  
   241  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   242  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   243  		g.Expect(err).ToNot(HaveOccurred())
   244  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   245  		g.Expect(remediationData.RetryCount).To(Equal(0))
   246  
   247  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   248  
   249  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   250  		g.Expect(err).ToNot(HaveOccurred())
   251  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   252  
   253  		removeFinalizer(g, m1)
   254  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
   255  	})
   256  	t.Run("Retry history is ignored if min healthy period is expired", func(t *testing.T) {
   257  		g := NewWithT(t)
   258  
    259  		minHealthyPeriod := 4 * controlplanev1.DefaultMinHealthyPeriod // big min healthy period, so we are sure that we are not using DefaultMinHealthyPeriod.
   260  
   261  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{
   262  			Machine:    "m0",
   263  			Timestamp:  metav1.Time{Time: time.Now().Add(-2 * minHealthyPeriod).UTC()}, // minHealthyPeriod already expired.
   264  			RetryCount: 3,
   265  		})))
   266  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   267  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   268  
   269  		controlPlane := &internal.ControlPlane{
   270  			KCP: &controlplanev1.KubeadmControlPlane{
   271  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   272  					Replicas: utilpointer.Int32(3),
   273  					Version:  "v1.19.1",
   274  					RemediationStrategy: &controlplanev1.RemediationStrategy{
   275  						MaxRetry:         utilpointer.Int32(3),
   276  						MinHealthyPeriod: &metav1.Duration{Duration: minHealthyPeriod},
   277  					},
   278  				},
   279  			},
   280  			Cluster:  &clusterv1.Cluster{},
   281  			Machines: collections.FromMachines(m1, m2, m3),
   282  		}
   283  
   284  		r := &KubeadmControlPlaneReconciler{
   285  			Client:   env.GetClient(),
   286  			recorder: record.NewFakeRecorder(32),
   287  			managementCluster: &fakeManagementCluster{
   288  				Workload: fakeWorkloadCluster{
   289  					EtcdMembersResult: nodes(controlPlane.Machines),
   290  				},
   291  			},
   292  		}
   293  
   294  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   295  
   296  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   297  		g.Expect(err).ToNot(HaveOccurred())
   298  
   299  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   300  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   301  		g.Expect(err).ToNot(HaveOccurred())
   302  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   303  		g.Expect(remediationData.RetryCount).To(Equal(0))
   304  
   305  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   306  
   307  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   308  		g.Expect(err).ToNot(HaveOccurred())
   309  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   310  
   311  		removeFinalizer(g, m1)
   312  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
   313  	})
   314  	t.Run("Remediation does not happen if RetryPeriod is not yet passed", func(t *testing.T) {
   315  		g := NewWithT(t)
   316  
   317  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(&RemediationData{
   318  			Machine:    "m0",
   319  			Timestamp:  metav1.Time{Time: time.Now().Add(-controlplanev1.DefaultMinHealthyPeriod / 2).UTC()}, // minHealthyPeriod not yet expired.
   320  			RetryCount: 2,
   321  		})))
   322  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   323  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   324  
   325  		controlPlane := &internal.ControlPlane{
   326  			KCP: &controlplanev1.KubeadmControlPlane{
   327  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   328  					Replicas: utilpointer.Int32(3),
   329  					Version:  "v1.19.1",
   330  					RemediationStrategy: &controlplanev1.RemediationStrategy{
   331  						MaxRetry:    utilpointer.Int32(3),
   332  						RetryPeriod: metav1.Duration{Duration: controlplanev1.DefaultMinHealthyPeriod}, // RetryPeriod not yet expired.
   333  					},
   334  				},
   335  			},
   336  			Cluster:  &clusterv1.Cluster{},
   337  			Machines: collections.FromMachines(m1, m2, m3),
   338  		}
   339  
   340  		r := &KubeadmControlPlaneReconciler{
   341  			Client:   env.GetClient(),
   342  			recorder: record.NewFakeRecorder(32),
   343  			managementCluster: &fakeManagementCluster{
   344  				Workload: fakeWorkloadCluster{
   345  					EtcdMembersResult: nodes(controlPlane.Machines),
   346  				},
   347  			},
   348  		}
   349  
   350  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   351  
   352  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   353  		g.Expect(err).ToNot(HaveOccurred())
   354  
   355  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   356  
   357  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because the operation already failed in the latest 1h0m0s (RetryPeriod)")
   358  
   359  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   360  		g.Expect(err).ToNot(HaveOccurred())
   361  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeTrue())
   362  
   363  		removeFinalizer(g, m1)
   364  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
   365  	})
   366  
   367  	// There are no preflight checks for when control plane is not yet initialized
   368  	// (it is the first CP, we can nuke it).
   369  
   370  	// Preflight checks for when control plane is already initialized.
   371  
   372  	t.Run("Remediation does not happen if desired replicas <= 1", func(t *testing.T) {
   373  		g := NewWithT(t)
   374  
   375  		m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
   376  		controlPlane := &internal.ControlPlane{
   377  			KCP: &controlplanev1.KubeadmControlPlane{
   378  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   379  					Replicas: utilpointer.Int32(1),
   380  					RolloutStrategy: &controlplanev1.RolloutStrategy{
   381  						RollingUpdate: &controlplanev1.RollingUpdate{
   382  							MaxSurge: &intstr.IntOrString{
   383  								IntVal: 1,
   384  							},
   385  						},
   386  					},
   387  				},
   388  				Status: controlplanev1.KubeadmControlPlaneStatus{
   389  					Initialized: true,
   390  				},
   391  			},
   392  			Cluster:  &clusterv1.Cluster{},
   393  			Machines: collections.FromMachines(m),
   394  		}
   395  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   396  
   397  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   398  		g.Expect(err).ToNot(HaveOccurred())
   399  
   400  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   401  
   402  		assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1")
   403  
   404  		g.Expect(env.Cleanup(ctx, m)).To(Succeed())
   405  	})
   406  	t.Run("Remediation does not happen if there is another machine being deleted (not the one to be remediated)", func(t *testing.T) {
   407  		g := NewWithT(t)
   408  
   409  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
   410  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-")
   411  		m3 := getDeletingMachine(ns.Name, "m3-deleting") // NB. This machine is not created, it gets only added to control plane
   412  		controlPlane := &internal.ControlPlane{
   413  			KCP: &controlplanev1.KubeadmControlPlane{
   414  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   415  					Replicas: utilpointer.Int32(3),
   416  				},
   417  				Status: controlplanev1.KubeadmControlPlaneStatus{
   418  					Initialized: true,
   419  				},
   420  			},
   421  			Cluster:  &clusterv1.Cluster{},
   422  			Machines: collections.FromMachines(m1, m2, m3),
   423  		}
   424  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   425  
   426  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   427  		g.Expect(err).ToNot(HaveOccurred())
   428  
   429  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   430  
   431  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation")
   432  
   433  		g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed())
   434  	})
   435  	t.Run("Remediation does not happen if there is at least one additional unhealthy etcd member on a 3 machine CP", func(t *testing.T) {
   436  		g := NewWithT(t)
   437  
   438  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
   439  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
   440  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember())
   441  
   442  		controlPlane := &internal.ControlPlane{
   443  			KCP: &controlplanev1.KubeadmControlPlane{
   444  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   445  					Replicas: utilpointer.Int32(3),
   446  				},
   447  				Status: controlplanev1.KubeadmControlPlaneStatus{
   448  					Initialized: true,
   449  				},
   450  			},
   451  			Cluster:  &clusterv1.Cluster{},
   452  			Machines: collections.FromMachines(m1, m2, m3),
   453  		}
   454  
   455  		r := &KubeadmControlPlaneReconciler{
   456  			Client:   env.GetClient(),
   457  			recorder: record.NewFakeRecorder(32),
   458  			managementCluster: &fakeManagementCluster{
   459  				Workload: fakeWorkloadCluster{
   460  					EtcdMembersResult: nodes(controlPlane.Machines),
   461  				},
   462  			},
   463  		}
   464  		controlPlane.InjectTestManagementCluster(r.managementCluster)
   465  
   466  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   467  
   468  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   469  		g.Expect(err).ToNot(HaveOccurred())
   470  
   471  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   472  
   473  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum")
   474  
   475  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
   476  	})
   477  	t.Run("Remediation does not happen if there are at least two additional unhealthy etcd member on a 5 machine CP", func(t *testing.T) {
   478  		g := NewWithT(t)
   479  
   480  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
   481  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
   482  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember())
   483  		m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember())
   484  		m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember())
   485  
   486  		controlPlane := &internal.ControlPlane{
   487  			KCP: &controlplanev1.KubeadmControlPlane{
   488  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   489  					Replicas: utilpointer.Int32(5),
   490  				},
   491  				Status: controlplanev1.KubeadmControlPlaneStatus{
   492  					Initialized: true,
   493  				},
   494  			},
   495  			Cluster:  &clusterv1.Cluster{},
   496  			Machines: collections.FromMachines(m1, m2, m3, m4, m5),
   497  		}
   498  
   499  		r := &KubeadmControlPlaneReconciler{
   500  			Client:   env.GetClient(),
   501  			recorder: record.NewFakeRecorder(32),
   502  			managementCluster: &fakeManagementCluster{
   503  				Workload: fakeWorkloadCluster{
   504  					EtcdMembersResult: nodes(controlPlane.Machines),
   505  				},
   506  			},
   507  		}
   508  		controlPlane.InjectTestManagementCluster(r.managementCluster)
   509  
   510  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   511  
   512  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
   513  		g.Expect(err).ToNot(HaveOccurred())
   514  
   515  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   516  
   517  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum")
   518  
   519  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed())
   520  	})
   521  
   522  	// Remediation for when control plane is not yet initialized
   523  
   524  	t.Run("Remediation deletes unhealthy machine - 1 CP not initialized", func(t *testing.T) {
   525  		g := NewWithT(t)
   526  
   527  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
   528  
   529  		controlPlane := &internal.ControlPlane{
   530  			KCP: &controlplanev1.KubeadmControlPlane{
   531  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   532  					Replicas: utilpointer.Int32(1),
   533  					Version:  "v1.19.1",
   534  				},
   535  				Status: controlplanev1.KubeadmControlPlaneStatus{
   536  					Initialized: false,
   537  				},
   538  			},
   539  			Cluster:  &clusterv1.Cluster{},
   540  			Machines: collections.FromMachines(m1),
   541  		}
   542  
   543  		r := &KubeadmControlPlaneReconciler{
   544  			Client:   env.GetClient(),
   545  			recorder: record.NewFakeRecorder(32),
   546  			managementCluster: &fakeManagementCluster{
   547  				Workload: fakeWorkloadCluster{
   548  					EtcdMembersResult: nodes(controlPlane.Machines),
   549  				},
   550  			},
   551  		}
   552  
   553  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   554  
   555  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   556  		g.Expect(err).ToNot(HaveOccurred())
   557  
   558  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   559  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   560  		g.Expect(err).ToNot(HaveOccurred())
   561  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   562  		g.Expect(remediationData.RetryCount).To(Equal(0))
   563  
   564  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   565  
   566  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   567  		g.Expect(err).ToNot(HaveOccurred())
   568  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   569  
   570  		removeFinalizer(g, m1)
   571  		g.Expect(env.Cleanup(ctx, m1)).To(Succeed())
   572  	})
   573  	t.Run("Subsequent remediation of the same machine increase retry count - 1 CP not initialized", func(t *testing.T) {
   574  		g := NewWithT(t)
   575  
   576  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
   577  
   578  		controlPlane := &internal.ControlPlane{
   579  			KCP: &controlplanev1.KubeadmControlPlane{
   580  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   581  					Replicas: utilpointer.Int32(1),
   582  					Version:  "v1.19.1",
   583  				},
   584  				Status: controlplanev1.KubeadmControlPlaneStatus{
   585  					Initialized: false,
   586  				},
   587  			},
   588  			Cluster:  &clusterv1.Cluster{},
   589  			Machines: collections.FromMachines(m1),
   590  		}
   591  
   592  		// First reconcile, remediate machine m1 for the first time
   593  		r := &KubeadmControlPlaneReconciler{
   594  			Client:   env.GetClient(),
   595  			recorder: record.NewFakeRecorder(32),
   596  			managementCluster: &fakeManagementCluster{
   597  				Workload: fakeWorkloadCluster{
   598  					EtcdMembersResult: nodes(controlPlane.Machines),
   599  				},
   600  			},
   601  		}
   602  
   603  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   604  
   605  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   606  		g.Expect(err).ToNot(HaveOccurred())
   607  
   608  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   609  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   610  		g.Expect(err).ToNot(HaveOccurred())
   611  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   612  		g.Expect(remediationData.RetryCount).To(Equal(0))
   613  
   614  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   615  
   616  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   617  		g.Expect(err).ToNot(HaveOccurred())
   618  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   619  
   620  		removeFinalizer(g, m1)
   621  		g.Expect(env.CleanupAndWait(ctx, m1)).To(Succeed())
   622  
   623  		for i := 2; i < 4; i++ {
    624  			// Simulate the creation of a replacement for the machine just remediated.
   625  			mi := createMachine(ctx, g, ns.Name, fmt.Sprintf("m%d-unhealthy-", i), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData)))
   626  
   627  			// Simulate KCP dropping RemediationInProgressAnnotation after creating the replacement machine.
   628  			delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
   629  
   630  			controlPlane.Machines = collections.FromMachines(mi)
   631  
   632  			// Reconcile unhealthy replacements for m1.
   633  			r.managementCluster = &fakeManagementCluster{
   634  				Workload: fakeWorkloadCluster{
   635  					EtcdMembersResult: nodes(collections.FromMachines(mi)),
   636  				},
   637  			}
   638  			ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   639  
   640  			g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   641  			g.Expect(err).ToNot(HaveOccurred())
   642  
   643  			g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   644  			remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   645  			g.Expect(err).ToNot(HaveOccurred())
   646  			g.Expect(remediationData.Machine).To(Equal(mi.Name))
   647  			g.Expect(remediationData.RetryCount).To(Equal(i - 1))
   648  
   649  			assertMachineCondition(ctx, g, mi, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   650  
   651  			err = env.Get(ctx, client.ObjectKey{Namespace: mi.Namespace, Name: mi.Name}, mi)
   652  			g.Expect(err).ToNot(HaveOccurred())
   653  			g.Expect(mi.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   654  
   655  			removeFinalizer(g, mi)
   656  			g.Expect(env.CleanupAndWait(ctx, mi)).To(Succeed())
   657  		}
   658  	})
   659  
   660  	// Remediation for when control plane is already initialized
   661  
   662  	t.Run("Remediation deletes unhealthy machine - 2 CP (during 1 CP rolling upgrade)", func(t *testing.T) {
   663  		g := NewWithT(t)
   664  
   665  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
   666  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   667  
   668  		controlPlane := &internal.ControlPlane{
   669  			KCP: &controlplanev1.KubeadmControlPlane{
   670  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   671  					Replicas: utilpointer.Int32(2),
   672  					Version:  "v1.19.1",
   673  				},
   674  				Status: controlplanev1.KubeadmControlPlaneStatus{
   675  					Initialized: true,
   676  				},
   677  			},
   678  			Cluster:  &clusterv1.Cluster{},
   679  			Machines: collections.FromMachines(m1, m2),
   680  		}
   681  
   682  		r := &KubeadmControlPlaneReconciler{
   683  			Client:   env.GetClient(),
   684  			recorder: record.NewFakeRecorder(32),
   685  			managementCluster: &fakeManagementCluster{
   686  				Workload: fakeWorkloadCluster{
   687  					EtcdMembersResult: nodes(controlPlane.Machines),
   688  				},
   689  			},
   690  		}
   691  		controlPlane.InjectTestManagementCluster(r.managementCluster)
   692  
   693  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   694  
   695  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   696  		g.Expect(err).ToNot(HaveOccurred())
   697  
   698  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   699  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   700  		g.Expect(err).ToNot(HaveOccurred())
   701  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   702  		g.Expect(remediationData.RetryCount).To(Equal(0))
   703  
   704  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   705  
   706  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   707  		g.Expect(err).ToNot(HaveOccurred())
   708  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   709  
   710  		removeFinalizer(g, m1)
   711  		g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed())
   712  	})
   713  	t.Run("Remediation deletes unhealthy machine - 3 CP", func(t *testing.T) {
   714  		g := NewWithT(t)
   715  
   716  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
   717  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   718  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   719  
   720  		controlPlane := &internal.ControlPlane{
   721  			KCP: &controlplanev1.KubeadmControlPlane{
   722  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   723  					Replicas: utilpointer.Int32(3),
   724  					Version:  "v1.19.1",
   725  				},
   726  				Status: controlplanev1.KubeadmControlPlaneStatus{
   727  					Initialized: true,
   728  				},
   729  			},
   730  			Cluster:  &clusterv1.Cluster{},
   731  			Machines: collections.FromMachines(m1, m2, m3),
   732  		}
   733  
   734  		r := &KubeadmControlPlaneReconciler{
   735  			Client:   env.GetClient(),
   736  			recorder: record.NewFakeRecorder(32),
   737  			managementCluster: &fakeManagementCluster{
   738  				Workload: fakeWorkloadCluster{
   739  					EtcdMembersResult: nodes(controlPlane.Machines),
   740  				},
   741  			},
   742  		}
   743  		controlPlane.InjectTestManagementCluster(r.managementCluster)
   744  
   745  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   746  
   747  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   748  		g.Expect(err).ToNot(HaveOccurred())
   749  
   750  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   751  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   752  		g.Expect(err).ToNot(HaveOccurred())
   753  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   754  		g.Expect(remediationData.RetryCount).To(Equal(0))
   755  
   756  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   757  
   758  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   759  		g.Expect(err).ToNot(HaveOccurred())
   760  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   761  
   762  		removeFinalizer(g, m1)
   763  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
   764  	})
   765  	t.Run("Remediation deletes unhealthy machine - 4 CP (during 3 CP rolling upgrade)", func(t *testing.T) {
   766  		g := NewWithT(t)
   767  
   768  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
   769  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   770  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   771  		m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember())
   772  
   773  		controlPlane := &internal.ControlPlane{
   774  			KCP: &controlplanev1.KubeadmControlPlane{
   775  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   776  					Replicas: utilpointer.Int32(4),
   777  					Version:  "v1.19.1",
   778  				},
   779  				Status: controlplanev1.KubeadmControlPlaneStatus{
   780  					Initialized: true,
   781  				},
   782  			},
   783  			Cluster:  &clusterv1.Cluster{},
   784  			Machines: collections.FromMachines(m1, m2, m3, m4),
   785  		}
   786  
   787  		r := &KubeadmControlPlaneReconciler{
   788  			Client:   env.GetClient(),
   789  			recorder: record.NewFakeRecorder(32),
   790  			managementCluster: &fakeManagementCluster{
   791  				Workload: fakeWorkloadCluster{
   792  					EtcdMembersResult: nodes(controlPlane.Machines),
   793  				},
   794  			},
   795  		}
   796  		controlPlane.InjectTestManagementCluster(r.managementCluster)
   797  
   798  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   799  
   800  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   801  		g.Expect(err).ToNot(HaveOccurred())
   802  
   803  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   804  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   805  		g.Expect(err).ToNot(HaveOccurred())
   806  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   807  		g.Expect(remediationData.RetryCount).To(Equal(0))
   808  
   809  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   810  
   811  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   812  		g.Expect(err).ToNot(HaveOccurred())
   813  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   814  
   815  		removeFinalizer(g, m1)
   816  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed())
   817  	})
   818  	t.Run("Remediation fails gracefully if no healthy Control Planes are available to become etcd leader", func(t *testing.T) {
   819  		g := NewWithT(t)
   820  
   821  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
   822  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember())
   823  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember())
   824  		m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember())
   825  
   826  		controlPlane := &internal.ControlPlane{
   827  			KCP: &controlplanev1.KubeadmControlPlane{
   828  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   829  					Replicas: utilpointer.Int32(4),
   830  					Version:  "v1.19.1",
   831  				},
   832  				Status: controlplanev1.KubeadmControlPlaneStatus{
   833  					Initialized: true,
   834  				},
   835  			},
   836  			Cluster:  &clusterv1.Cluster{},
   837  			Machines: collections.FromMachines(m1, m2, m3, m4),
   838  		}
   839  
   840  		r := &KubeadmControlPlaneReconciler{
   841  			Client:   env.GetClient(),
   842  			recorder: record.NewFakeRecorder(32),
   843  			managementCluster: &fakeManagementCluster{
   844  				Workload: fakeWorkloadCluster{
   845  					EtcdMembersResult: nodes(controlPlane.Machines),
   846  				},
   847  			},
   848  		}
   849  		controlPlane.InjectTestManagementCluster(r.managementCluster)
   850  
   851  		_, err = r.reconcileUnhealthyMachines(ctx, controlPlane)
   852  		g.Expect(err).ToNot(HaveOccurred())
   853  
   854  		g.Expect(controlPlane.KCP.Annotations).ToNot(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   855  
   856  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning,
   857  			"A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation")
   858  
   859  		removeFinalizer(g, m1)
   860  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed())
   861  	})
   862  	t.Run("Subsequent remediation of the same machine increase retry count - 3 CP", func(t *testing.T) {
   863  		g := NewWithT(t)
   864  
   865  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
   866  		m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withHealthyEtcdMember())
   867  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember())
   868  
   869  		controlPlane := &internal.ControlPlane{
   870  			KCP: &controlplanev1.KubeadmControlPlane{
   871  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   872  					Replicas: utilpointer.Int32(1),
   873  					Version:  "v1.19.1",
   874  				},
   875  				Status: controlplanev1.KubeadmControlPlaneStatus{
   876  					Initialized: false,
   877  				},
   878  			},
   879  			Cluster:  &clusterv1.Cluster{},
   880  			Machines: collections.FromMachines(m1, m2, m3),
   881  		}
   882  
   883  		// First reconcile, remediate machine m1 for the first time
   884  		r := &KubeadmControlPlaneReconciler{
   885  			Client:   env.GetClient(),
   886  			recorder: record.NewFakeRecorder(32),
   887  			managementCluster: &fakeManagementCluster{
   888  				Workload: fakeWorkloadCluster{
   889  					EtcdMembersResult: nodes(controlPlane.Machines),
   890  				},
   891  			},
   892  		}
   893  
   894  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   895  
   896  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   897  		g.Expect(err).ToNot(HaveOccurred())
   898  
   899  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   900  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   901  		g.Expect(err).ToNot(HaveOccurred())
   902  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
   903  		g.Expect(remediationData.RetryCount).To(Equal(0))
   904  
   905  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   906  
   907  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
   908  		g.Expect(err).ToNot(HaveOccurred())
   909  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   910  
   911  		removeFinalizer(g, m1)
   912  		g.Expect(env.CleanupAndWait(ctx, m1)).To(Succeed())
   913  
   914  		for i := 5; i < 6; i++ {
   915  			// Simulate the creation of a replacement for m1.
   916  			mi := createMachine(ctx, g, ns.Name, fmt.Sprintf("m%d-unhealthy-", i), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData)))
   917  
   918  			// Simulate KCP dropping RemediationInProgressAnnotation after creating the replacement machine.
   919  			delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
   920  			controlPlane.Machines = collections.FromMachines(mi, m2, m3)
   921  
   922  			// Reconcile unhealthy replacements for m1.
   923  			r.managementCluster = &fakeManagementCluster{
   924  				Workload: fakeWorkloadCluster{
   925  					EtcdMembersResult: nodes(collections.FromMachines(mi, m2, m3)),
   926  				},
   927  			}
   928  
   929  			ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
   930  
   931  			g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
   932  			g.Expect(err).ToNot(HaveOccurred())
   933  
   934  			g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
   935  			remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
   936  			g.Expect(err).ToNot(HaveOccurred())
   937  			g.Expect(remediationData.Machine).To(Equal(mi.Name))
   938  			g.Expect(remediationData.RetryCount).To(Equal(i - 4))
   939  
   940  			assertMachineCondition(ctx, g, mi, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
   941  
   942  			err = env.Get(ctx, client.ObjectKey{Namespace: mi.Namespace, Name: mi.Name}, mi)
   943  			g.Expect(err).ToNot(HaveOccurred())
   944  			g.Expect(mi.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
   945  
   946  			removeFinalizer(g, mi)
   947  			g.Expect(env.CleanupAndWait(ctx, mi)).To(Succeed())
   948  		}
   949  
   950  		g.Expect(env.CleanupAndWait(ctx, m2, m3)).To(Succeed())
   951  	})
   952  }
   953  
   954  func TestReconcileUnhealthyMachinesSequences(t *testing.T) {
   955  	var removeFinalizer = func(g *WithT, m *clusterv1.Machine) {
   956  		patchHelper, err := patch.NewHelper(m, env.GetClient())
   957  		g.Expect(err).ToNot(HaveOccurred())
   958  		m.ObjectMeta.Finalizers = nil
   959  		g.Expect(patchHelper.Patch(ctx, m)).To(Succeed())
   960  	}
   961  
   962  	t.Run("Remediates the first CP machine having problems to come up", func(t *testing.T) {
   963  		g := NewWithT(t)
   964  
   965  		ns, err := env.CreateNamespace(ctx, "ns1")
   966  		g.Expect(err).ToNot(HaveOccurred())
   967  		defer func() {
   968  			g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
   969  		}()
   970  
   971  		// Control plane not initialized yet, First CP is unhealthy and gets remediated:
   972  
   973  		m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
   974  
   975  		controlPlane := &internal.ControlPlane{
   976  			KCP: &controlplanev1.KubeadmControlPlane{
   977  				Spec: controlplanev1.KubeadmControlPlaneSpec{
   978  					Replicas: utilpointer.Int32(3),
   979  					Version:  "v1.19.1",
   980  				},
   981  				Status: controlplanev1.KubeadmControlPlaneStatus{
   982  					Initialized: false,
   983  				},
   984  			},
   985  			Cluster:  &clusterv1.Cluster{},
   986  			Machines: collections.FromMachines(m1),
   987  		}
   988  
   989  		r := &KubeadmControlPlaneReconciler{
   990  			Client:   env.GetClient(),
   991  			recorder: record.NewFakeRecorder(32),
   992  			managementCluster: &fakeManagementCluster{
   993  				Workload: fakeWorkloadCluster{
   994  					EtcdMembersResult: nodes(controlPlane.Machines),
   995  				},
   996  			},
   997  		}
   998  
   999  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
  1000  
  1001  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
  1002  		g.Expect(err).ToNot(HaveOccurred())
  1003  
  1004  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1005  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
  1006  		g.Expect(err).ToNot(HaveOccurred())
  1007  		g.Expect(remediationData.Machine).To(Equal(m1.Name))
  1008  		g.Expect(remediationData.RetryCount).To(Equal(0))
  1009  
  1010  		assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
  1011  
  1012  		err = env.Get(ctx, client.ObjectKey{Namespace: m1.Namespace, Name: m1.Name}, m1)
  1013  		g.Expect(err).ToNot(HaveOccurred())
  1014  		g.Expect(m1.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
  1015  
  1016  		removeFinalizer(g, m1)
  1017  		g.Expect(env.Cleanup(ctx, m1)).To(Succeed())
  1018  
  1019  		// Fake scaling up, which creates a remediation machine, fast forwards to when also the replacement machine is marked unhealthy.
  1020  		// NOTE: scale up also resets remediation in progress and remediation counts.
  1021  
  1022  		m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData)))
  1023  		delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
  1024  
  1025  		// Control plane not initialized yet, Second CP is unhealthy and gets remediated (retry 2)
  1026  
  1027  		controlPlane.Machines = collections.FromMachines(m2)
  1028  		r.managementCluster = &fakeManagementCluster{
  1029  			Workload: fakeWorkloadCluster{
  1030  				EtcdMembersResult: nodes(controlPlane.Machines),
  1031  			},
  1032  		}
  1033  
  1034  		ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane)
  1035  
  1036  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
  1037  		g.Expect(err).ToNot(HaveOccurred())
  1038  
  1039  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1040  		remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
  1041  		g.Expect(err).ToNot(HaveOccurred())
  1042  		g.Expect(remediationData.Machine).To(Equal(m2.Name))
  1043  		g.Expect(remediationData.RetryCount).To(Equal(1))
  1044  
  1045  		assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
  1046  
  1047  		err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m1)
  1048  		g.Expect(err).ToNot(HaveOccurred())
  1049  		g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
  1050  
  1051  		removeFinalizer(g, m2)
  1052  		g.Expect(env.Cleanup(ctx, m2)).To(Succeed())
  1053  
  1054  		// Fake scaling up, which creates a remediation machine, which is healthy.
  1055  		// NOTE: scale up also resets remediation in progress and remediation counts.
  1056  
  1057  		m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withHealthyEtcdMember(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData)))
  1058  		delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
  1059  
  1060  		g.Expect(env.Cleanup(ctx, m3)).To(Succeed())
  1061  	})
  1062  
  1063  	t.Run("Remediates the second CP machine having problems to come up", func(t *testing.T) {
  1064  		g := NewWithT(t)
  1065  
  1066  		ns, err := env.CreateNamespace(ctx, "ns1")
  1067  		g.Expect(err).ToNot(HaveOccurred())
  1068  		defer func() {
  1069  			g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
  1070  		}()
  1071  
  1072  		// Control plane initialized yet, First CP healthy, second CP is unhealthy and gets remediated:
  1073  
  1074  		m1 := createMachine(ctx, g, ns.Name, "m1-healthy-", withHealthyEtcdMember())
  1075  		m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
  1076  
  1077  		controlPlane := &internal.ControlPlane{
  1078  			KCP: &controlplanev1.KubeadmControlPlane{
  1079  				Spec: controlplanev1.KubeadmControlPlaneSpec{
  1080  					Replicas: utilpointer.Int32(3),
  1081  					Version:  "v1.19.1",
  1082  					RolloutStrategy: &controlplanev1.RolloutStrategy{
  1083  						RollingUpdate: &controlplanev1.RollingUpdate{
  1084  							MaxSurge: &intstr.IntOrString{
  1085  								IntVal: 1,
  1086  							},
  1087  						},
  1088  					},
  1089  				},
  1090  				Status: controlplanev1.KubeadmControlPlaneStatus{
  1091  					Initialized: true,
  1092  				},
  1093  			},
  1094  			Cluster:  &clusterv1.Cluster{},
  1095  			Machines: collections.FromMachines(m1, m2),
  1096  		}
  1097  
  1098  		r := &KubeadmControlPlaneReconciler{
  1099  			Client:   env.GetClient(),
  1100  			recorder: record.NewFakeRecorder(32),
  1101  			managementCluster: &fakeManagementCluster{
  1102  				Workload: fakeWorkloadCluster{
  1103  					EtcdMembersResult: nodes(controlPlane.Machines),
  1104  				},
  1105  			},
  1106  		}
  1107  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1108  
  1109  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
  1110  
  1111  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
  1112  		g.Expect(err).ToNot(HaveOccurred())
  1113  
  1114  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1115  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
  1116  		g.Expect(err).ToNot(HaveOccurred())
  1117  		g.Expect(remediationData.Machine).To(Equal(m2.Name))
  1118  		g.Expect(remediationData.RetryCount).To(Equal(0))
  1119  
  1120  		assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
  1121  
  1122  		err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m2)
  1123  		g.Expect(err).ToNot(HaveOccurred())
  1124  		g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
  1125  
  1126  		removeFinalizer(g, m2)
  1127  		g.Expect(env.Cleanup(ctx, m2)).To(Succeed())
  1128  
  1129  		// Fake scaling up, which creates a remediation machine, fast forwards to when also the replacement machine is marked unhealthy.
  1130  		// NOTE: scale up also resets remediation in progress and remediation counts.
  1131  
  1132  		m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData)))
  1133  		delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
  1134  
  1135  		// Control plane not initialized yet, Second CP is unhealthy and gets remediated (retry 2)
  1136  
  1137  		controlPlane.Machines = collections.FromMachines(m1, m3)
  1138  		r.managementCluster = &fakeManagementCluster{
  1139  			Workload: fakeWorkloadCluster{
  1140  				EtcdMembersResult: nodes(controlPlane.Machines),
  1141  			},
  1142  		}
  1143  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1144  
  1145  		ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane)
  1146  
  1147  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
  1148  		g.Expect(err).ToNot(HaveOccurred())
  1149  
  1150  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1151  		remediationData, err = RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
  1152  		g.Expect(err).ToNot(HaveOccurred())
  1153  		g.Expect(remediationData.Machine).To(Equal(m3.Name))
  1154  		g.Expect(remediationData.RetryCount).To(Equal(1))
  1155  
  1156  		assertMachineCondition(ctx, g, m3, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
  1157  
  1158  		err = env.Get(ctx, client.ObjectKey{Namespace: m3.Namespace, Name: m3.Name}, m3)
  1159  		g.Expect(err).ToNot(HaveOccurred())
  1160  		g.Expect(m3.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
  1161  
  1162  		removeFinalizer(g, m3)
  1163  		g.Expect(env.Cleanup(ctx, m3)).To(Succeed())
  1164  
  1165  		// Fake scaling up, which creates a remediation machine, which is healthy.
  1166  		// NOTE: scale up also resets remediation in progress and remediation counts.
  1167  
  1168  		m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withHealthyEtcdMember(), withRemediateForAnnotation(MustMarshalRemediationData(remediationData)))
  1169  		delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
  1170  
  1171  		g.Expect(env.Cleanup(ctx, m1, m4)).To(Succeed())
  1172  	})
  1173  
  1174  	t.Run("Remediates only one CP machine in case of multiple failures", func(t *testing.T) {
  1175  		g := NewWithT(t)
  1176  
  1177  		ns, err := env.CreateNamespace(ctx, "ns1")
  1178  		g.Expect(err).ToNot(HaveOccurred())
  1179  		defer func() {
  1180  			g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
  1181  		}()
  1182  
  1183  		// Control plane initialized yet, First CP healthy, second and third CP are unhealthy. second gets remediated:
  1184  
  1185  		m1 := createMachine(ctx, g, ns.Name, "m1-healthy-", withHealthyEtcdMember())
  1186  		m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withHealthyEtcdMember(), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
  1187  		m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withHealthyEtcdMember(), withMachineHealthCheckFailed(), withWaitBeforeDeleteFinalizer())
  1188  
  1189  		controlPlane := &internal.ControlPlane{
  1190  			KCP: &controlplanev1.KubeadmControlPlane{
  1191  				Spec: controlplanev1.KubeadmControlPlaneSpec{
  1192  					Replicas: utilpointer.Int32(3),
  1193  					Version:  "v1.19.1",
  1194  					RolloutStrategy: &controlplanev1.RolloutStrategy{
  1195  						RollingUpdate: &controlplanev1.RollingUpdate{
  1196  							MaxSurge: &intstr.IntOrString{
  1197  								IntVal: 1,
  1198  							},
  1199  						},
  1200  					},
  1201  				},
  1202  				Status: controlplanev1.KubeadmControlPlaneStatus{
  1203  					Initialized: true,
  1204  				},
  1205  			},
  1206  			Cluster:  &clusterv1.Cluster{},
  1207  			Machines: collections.FromMachines(m1, m2, m3),
  1208  		}
  1209  
  1210  		r := &KubeadmControlPlaneReconciler{
  1211  			Client:   env.GetClient(),
  1212  			recorder: record.NewFakeRecorder(32),
  1213  			managementCluster: &fakeManagementCluster{
  1214  				Workload: fakeWorkloadCluster{
  1215  					EtcdMembersResult: nodes(controlPlane.Machines),
  1216  				},
  1217  			},
  1218  		}
  1219  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1220  
  1221  		ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
  1222  
  1223  		g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
  1224  		g.Expect(err).ToNot(HaveOccurred())
  1225  
  1226  		g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
  1227  		remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
  1228  		g.Expect(err).ToNot(HaveOccurred())
  1229  		g.Expect(remediationData.Machine).To(Equal(m2.Name))
  1230  		g.Expect(remediationData.RetryCount).To(Equal(0))
  1231  
  1232  		assertMachineCondition(ctx, g, m2, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
  1233  		assertMachineCondition(ctx, g, m3, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
  1234  
  1235  		err = env.Get(ctx, client.ObjectKey{Namespace: m2.Namespace, Name: m2.Name}, m2)
  1236  		g.Expect(err).ToNot(HaveOccurred())
  1237  		g.Expect(m2.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
  1238  
  1239  		removeFinalizer(g, m2)
  1240  		g.Expect(env.Cleanup(ctx, m2)).To(Succeed())
  1241  
  1242  		// Check next reconcile does not further remediate
  1243  
  1244  		controlPlane.Machines = collections.FromMachines(m1, m3)
  1245  		r.managementCluster = &fakeManagementCluster{
  1246  			Workload: fakeWorkloadCluster{
  1247  				EtcdMembersResult: nodes(controlPlane.Machines),
  1248  			},
  1249  		}
  1250  
  1251  		ret, err = r.reconcileUnhealthyMachines(ctx, controlPlane)
  1252  
  1253  		g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
  1254  		g.Expect(err).ToNot(HaveOccurred())
  1255  
  1256  		g.Expect(env.Cleanup(ctx, m1)).To(Succeed())
  1257  	})
  1258  }
  1259  
  1260  func TestCanSafelyRemoveEtcdMember(t *testing.T) {
  1261  	g := NewWithT(t)
  1262  
  1263  	ns, err := env.CreateNamespace(ctx, "ns1")
  1264  	g.Expect(err).ToNot(HaveOccurred())
  1265  	defer func() {
  1266  		g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
  1267  	}()
  1268  
  1269  	t.Run("Can't safely remediate 1 machine CP", func(t *testing.T) {
  1270  		g := NewWithT(t)
  1271  
  1272  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1273  
  1274  		controlPlane := &internal.ControlPlane{
  1275  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1276  				Replicas: utilpointer.Int32(1),
  1277  			}},
  1278  			Cluster:  &clusterv1.Cluster{},
  1279  			Machines: collections.FromMachines(m1),
  1280  		}
  1281  
  1282  		r := &KubeadmControlPlaneReconciler{
  1283  			Client:   env.GetClient(),
  1284  			recorder: record.NewFakeRecorder(32),
  1285  			managementCluster: &fakeManagementCluster{
  1286  				Workload: fakeWorkloadCluster{
  1287  					EtcdMembersResult: nodes(controlPlane.Machines),
  1288  				},
  1289  			},
  1290  		}
  1291  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1292  
  1293  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1294  		g.Expect(ret).To(BeFalse())
  1295  		g.Expect(err).ToNot(HaveOccurred())
  1296  
  1297  		g.Expect(env.Cleanup(ctx, m1)).To(Succeed())
  1298  	})
  1299  
  1300  	t.Run("Can safely remediate 2 machine CP without additional etcd member failures", func(t *testing.T) {
  1301  		g := NewWithT(t)
  1302  
  1303  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1304  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember())
  1305  
  1306  		controlPlane := &internal.ControlPlane{
  1307  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1308  				Replicas: utilpointer.Int32(3),
  1309  			}},
  1310  			Cluster:  &clusterv1.Cluster{},
  1311  			Machines: collections.FromMachines(m1, m2),
  1312  		}
  1313  
  1314  		r := &KubeadmControlPlaneReconciler{
  1315  			Client:   env.GetClient(),
  1316  			recorder: record.NewFakeRecorder(32),
  1317  			managementCluster: &fakeManagementCluster{
  1318  				Workload: fakeWorkloadCluster{
  1319  					EtcdMembersResult: nodes(controlPlane.Machines),
  1320  				},
  1321  			},
  1322  		}
  1323  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1324  
  1325  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1326  		g.Expect(ret).To(BeTrue())
  1327  		g.Expect(err).ToNot(HaveOccurred())
  1328  
  1329  		g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed())
  1330  	})
  1331  	t.Run("Can safely remediate 2 machines CP when the etcd member being remediated is missing", func(t *testing.T) {
  1332  		g := NewWithT(t)
  1333  
  1334  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1335  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember())
  1336  
  1337  		controlPlane := &internal.ControlPlane{
  1338  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1339  				Replicas: utilpointer.Int32(3),
  1340  			}},
  1341  			Cluster:  &clusterv1.Cluster{},
  1342  			Machines: collections.FromMachines(m1, m2),
  1343  		}
  1344  
  1345  		members := make([]string, 0, len(controlPlane.Machines)-1)
  1346  		for _, n := range nodes(controlPlane.Machines) {
  1347  			if !strings.Contains(n, "m1-mhc-unhealthy-") {
  1348  				members = append(members, n)
  1349  			}
  1350  		}
  1351  
  1352  		r := &KubeadmControlPlaneReconciler{
  1353  			Client:   env.GetClient(),
  1354  			recorder: record.NewFakeRecorder(32),
  1355  			managementCluster: &fakeManagementCluster{
  1356  				Workload: fakeWorkloadCluster{
  1357  					EtcdMembersResult: members,
  1358  				},
  1359  			},
  1360  		}
  1361  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1362  
  1363  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1364  		g.Expect(ret).To(BeTrue())
  1365  		g.Expect(err).ToNot(HaveOccurred())
  1366  
  1367  		g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed())
  1368  	})
  1369  	t.Run("Can't safely remediate 2 machines CP with one additional etcd member failure", func(t *testing.T) {
  1370  		g := NewWithT(t)
  1371  
  1372  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1373  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
  1374  
  1375  		controlPlane := &internal.ControlPlane{
  1376  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1377  				Replicas: utilpointer.Int32(3),
  1378  			}},
  1379  			Cluster:  &clusterv1.Cluster{},
  1380  			Machines: collections.FromMachines(m1, m2),
  1381  		}
  1382  
  1383  		r := &KubeadmControlPlaneReconciler{
  1384  			Client:   env.GetClient(),
  1385  			recorder: record.NewFakeRecorder(32),
  1386  			managementCluster: &fakeManagementCluster{
  1387  				Workload: fakeWorkloadCluster{
  1388  					EtcdMembersResult: nodes(controlPlane.Machines),
  1389  				},
  1390  			},
  1391  		}
  1392  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1393  
  1394  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1395  		g.Expect(ret).To(BeFalse())
  1396  		g.Expect(err).ToNot(HaveOccurred())
  1397  
  1398  		g.Expect(env.Cleanup(ctx, m1, m2)).To(Succeed())
  1399  	})
  1400  	t.Run("Can safely remediate 3 machines CP without additional etcd member failures", func(t *testing.T) {
  1401  		g := NewWithT(t)
  1402  
  1403  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1404  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember())
  1405  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember())
  1406  
  1407  		controlPlane := &internal.ControlPlane{
  1408  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1409  				Replicas: utilpointer.Int32(3),
  1410  			}},
  1411  			Cluster:  &clusterv1.Cluster{},
  1412  			Machines: collections.FromMachines(m1, m2, m3),
  1413  		}
  1414  
  1415  		r := &KubeadmControlPlaneReconciler{
  1416  			Client:   env.GetClient(),
  1417  			recorder: record.NewFakeRecorder(32),
  1418  			managementCluster: &fakeManagementCluster{
  1419  				Workload: fakeWorkloadCluster{
  1420  					EtcdMembersResult: nodes(controlPlane.Machines),
  1421  				},
  1422  			},
  1423  		}
  1424  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1425  
  1426  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1427  		g.Expect(ret).To(BeTrue())
  1428  		g.Expect(err).ToNot(HaveOccurred())
  1429  
  1430  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
  1431  	})
  1432  	t.Run("Can safely remediate 3 machines CP when the etcd member being remediated is missing", func(t *testing.T) {
  1433  		g := NewWithT(t)
  1434  
  1435  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1436  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-healthy-", withHealthyEtcdMember())
  1437  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember())
  1438  
  1439  		controlPlane := &internal.ControlPlane{
  1440  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1441  				Replicas: utilpointer.Int32(3),
  1442  			}},
  1443  			Cluster:  &clusterv1.Cluster{},
  1444  			Machines: collections.FromMachines(m1, m2, m3),
  1445  		}
  1446  
  1447  		members := make([]string, 0, len(controlPlane.Machines)-1)
  1448  		for _, n := range nodes(controlPlane.Machines) {
  1449  			if !strings.Contains(n, "m1-mhc-unhealthy-") {
  1450  				members = append(members, n)
  1451  			}
  1452  		}
  1453  
  1454  		r := &KubeadmControlPlaneReconciler{
  1455  			Client:   env.GetClient(),
  1456  			recorder: record.NewFakeRecorder(32),
  1457  			managementCluster: &fakeManagementCluster{
  1458  				Workload: fakeWorkloadCluster{
  1459  					EtcdMembersResult: members,
  1460  				},
  1461  			},
  1462  		}
  1463  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1464  
  1465  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1466  		g.Expect(ret).To(BeTrue())
  1467  		g.Expect(err).ToNot(HaveOccurred())
  1468  
  1469  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
  1470  	})
  1471  	t.Run("Can't safely remediate 3 machines CP with one additional etcd member failure", func(t *testing.T) {
  1472  		g := NewWithT(t)
  1473  
  1474  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1475  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
  1476  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember())
  1477  
  1478  		controlPlane := &internal.ControlPlane{
  1479  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1480  				Replicas: utilpointer.Int32(3),
  1481  			}},
  1482  			Cluster:  &clusterv1.Cluster{},
  1483  			Machines: collections.FromMachines(m1, m2, m3),
  1484  		}
  1485  
  1486  		r := &KubeadmControlPlaneReconciler{
  1487  			Client:   env.GetClient(),
  1488  			recorder: record.NewFakeRecorder(32),
  1489  			managementCluster: &fakeManagementCluster{
  1490  				Workload: fakeWorkloadCluster{
  1491  					EtcdMembersResult: nodes(controlPlane.Machines),
  1492  				},
  1493  			},
  1494  		}
  1495  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1496  
  1497  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1498  		g.Expect(ret).To(BeFalse())
  1499  		g.Expect(err).ToNot(HaveOccurred())
  1500  
  1501  		g.Expect(env.Cleanup(ctx, m1, m2, m3)).To(Succeed())
  1502  	})
  1503  	t.Run("Can safely remediate 5 machines CP less than 2 additional etcd member failures", func(t *testing.T) {
  1504  		g := NewWithT(t)
  1505  
  1506  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1507  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
  1508  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-healthy-", withHealthyEtcdMember())
  1509  		m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember())
  1510  		m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember())
  1511  
  1512  		controlPlane := &internal.ControlPlane{
  1513  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1514  				Replicas: utilpointer.Int32(5),
  1515  			}},
  1516  			Cluster:  &clusterv1.Cluster{},
  1517  			Machines: collections.FromMachines(m1, m2, m3, m4, m5),
  1518  		}
  1519  
  1520  		r := &KubeadmControlPlaneReconciler{
  1521  			Client:   env.GetClient(),
  1522  			recorder: record.NewFakeRecorder(32),
  1523  			managementCluster: &fakeManagementCluster{
  1524  				Workload: fakeWorkloadCluster{
  1525  					EtcdMembersResult: nodes(controlPlane.Machines),
  1526  				},
  1527  			},
  1528  		}
  1529  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1530  
  1531  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1532  		g.Expect(ret).To(BeTrue())
  1533  		g.Expect(err).ToNot(HaveOccurred())
  1534  
  1535  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed())
  1536  	})
  1537  	t.Run("Can't safely remediate 5 machines CP with 2 additional etcd member failures", func(t *testing.T) {
  1538  		g := NewWithT(t)
  1539  
  1540  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1541  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
  1542  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember())
  1543  		m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember())
  1544  		m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember())
  1545  
  1546  		controlPlane := &internal.ControlPlane{
  1547  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1548  				Replicas: utilpointer.Int32(7),
  1549  			}},
  1550  			Cluster:  &clusterv1.Cluster{},
  1551  			Machines: collections.FromMachines(m1, m2, m3, m4, m5),
  1552  		}
  1553  
  1554  		r := &KubeadmControlPlaneReconciler{
  1555  			Client:   env.GetClient(),
  1556  			recorder: record.NewFakeRecorder(32),
  1557  			managementCluster: &fakeManagementCluster{
  1558  				Workload: fakeWorkloadCluster{
  1559  					EtcdMembersResult: nodes(controlPlane.Machines),
  1560  				},
  1561  			},
  1562  		}
  1563  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1564  
  1565  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1566  		g.Expect(ret).To(BeFalse())
  1567  		g.Expect(err).ToNot(HaveOccurred())
  1568  
  1569  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5)).To(Succeed())
  1570  	})
  1571  	t.Run("Can safely remediate 7 machines CP with less than 3 additional etcd member failures", func(t *testing.T) {
  1572  		g := NewWithT(t)
  1573  
  1574  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1575  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
  1576  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember())
  1577  		m4 := createMachine(ctx, g, ns.Name, "m4-etcd-healthy-", withHealthyEtcdMember())
  1578  		m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember())
  1579  		m6 := createMachine(ctx, g, ns.Name, "m6-etcd-healthy-", withHealthyEtcdMember())
  1580  		m7 := createMachine(ctx, g, ns.Name, "m7-etcd-healthy-", withHealthyEtcdMember())
  1581  
  1582  		controlPlane := &internal.ControlPlane{
  1583  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1584  				Replicas: utilpointer.Int32(7),
  1585  			}},
  1586  			Cluster:  &clusterv1.Cluster{},
  1587  			Machines: collections.FromMachines(m1, m2, m3, m4, m5, m6, m7),
  1588  		}
  1589  
  1590  		r := &KubeadmControlPlaneReconciler{
  1591  			Client:   env.GetClient(),
  1592  			recorder: record.NewFakeRecorder(32),
  1593  			managementCluster: &fakeManagementCluster{
  1594  				Workload: fakeWorkloadCluster{
  1595  					EtcdMembersResult: nodes(controlPlane.Machines),
  1596  				},
  1597  			},
  1598  		}
  1599  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1600  
  1601  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1602  		g.Expect(ret).To(BeTrue())
  1603  		g.Expect(err).ToNot(HaveOccurred())
  1604  
  1605  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5, m6, m7)).To(Succeed())
  1606  	})
  1607  	t.Run("Can't safely remediate 7 machines CP with 3 additional etcd member failures", func(t *testing.T) {
  1608  		g := NewWithT(t)
  1609  
  1610  		m1 := createMachine(ctx, g, ns.Name, "m1-mhc-unhealthy-", withMachineHealthCheckFailed())
  1611  		m2 := createMachine(ctx, g, ns.Name, "m2-etcd-unhealthy-", withUnhealthyEtcdMember())
  1612  		m3 := createMachine(ctx, g, ns.Name, "m3-etcd-unhealthy-", withUnhealthyEtcdMember())
  1613  		m4 := createMachine(ctx, g, ns.Name, "m4-etcd-unhealthy-", withUnhealthyEtcdMember())
  1614  		m5 := createMachine(ctx, g, ns.Name, "m5-etcd-healthy-", withHealthyEtcdMember())
  1615  		m6 := createMachine(ctx, g, ns.Name, "m6-etcd-healthy-", withHealthyEtcdMember())
  1616  		m7 := createMachine(ctx, g, ns.Name, "m7-etcd-healthy-", withHealthyEtcdMember())
  1617  
  1618  		controlPlane := &internal.ControlPlane{
  1619  			KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
  1620  				Replicas: utilpointer.Int32(5),
  1621  			}},
  1622  			Cluster:  &clusterv1.Cluster{},
  1623  			Machines: collections.FromMachines(m1, m2, m3, m4, m5, m6, m7),
  1624  		}
  1625  
  1626  		r := &KubeadmControlPlaneReconciler{
  1627  			Client:   env.GetClient(),
  1628  			recorder: record.NewFakeRecorder(32),
  1629  			managementCluster: &fakeManagementCluster{
  1630  				Workload: fakeWorkloadCluster{
  1631  					EtcdMembersResult: nodes(controlPlane.Machines),
  1632  				},
  1633  			},
  1634  		}
  1635  		controlPlane.InjectTestManagementCluster(r.managementCluster)
  1636  
  1637  		ret, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, m1)
  1638  		g.Expect(ret).To(BeFalse())
  1639  		g.Expect(err).ToNot(HaveOccurred())
  1640  
  1641  		g.Expect(env.Cleanup(ctx, m1, m2, m3, m4, m5, m6, m7)).To(Succeed())
  1642  	})
  1643  }
  1644  
  1645  func nodes(machines collections.Machines) []string {
  1646  	nodes := make([]string, 0, machines.Len())
  1647  	for _, m := range machines {
  1648  		if m.Status.NodeRef != nil {
  1649  			nodes = append(nodes, m.Status.NodeRef.Name)
  1650  		}
  1651  	}
  1652  	return nodes
  1653  }
  1654  
// machineOption is a functional option used by the test helpers below to
// mutate a Machine (e.g. set conditions, annotations, finalizers).
type machineOption func(*clusterv1.Machine)
  1656  
  1657  func withMachineHealthCheckFailed() machineOption {
  1658  	return func(machine *clusterv1.Machine) {
  1659  		conditions.MarkFalse(machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "")
  1660  		conditions.MarkFalse(machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
  1661  	}
  1662  }
  1663  
  1664  func withStuckRemediation() machineOption {
  1665  	return func(machine *clusterv1.Machine) {
  1666  		conditions.MarkTrue(machine, clusterv1.MachineHealthCheckSucceededCondition)
  1667  		conditions.MarkFalse(machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
  1668  	}
  1669  }
  1670  
  1671  func withHealthyEtcdMember() machineOption {
  1672  	return func(machine *clusterv1.Machine) {
  1673  		conditions.MarkTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition)
  1674  	}
  1675  }
  1676  
  1677  func withUnhealthyEtcdMember() machineOption {
  1678  	return func(machine *clusterv1.Machine) {
  1679  		conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "")
  1680  	}
  1681  }
  1682  
  1683  func withNodeRef(ref string) machineOption {
  1684  	return func(machine *clusterv1.Machine) {
  1685  		machine.Status.NodeRef = &corev1.ObjectReference{
  1686  			Kind: "Node",
  1687  			Name: ref,
  1688  		}
  1689  	}
  1690  }
  1691  
  1692  func withRemediateForAnnotation(remediatedFor string) machineOption {
  1693  	return func(machine *clusterv1.Machine) {
  1694  		if machine.Annotations == nil {
  1695  			machine.Annotations = map[string]string{}
  1696  		}
  1697  		machine.Annotations[controlplanev1.RemediationForAnnotation] = remediatedFor
  1698  	}
  1699  }
  1700  
  1701  func withWaitBeforeDeleteFinalizer() machineOption {
  1702  	return func(machine *clusterv1.Machine) {
  1703  		machine.Finalizers = []string{"wait-before-delete"}
  1704  	}
  1705  }
  1706  
  1707  func createMachine(ctx context.Context, g *WithT, namespace, name string, options ...machineOption) *clusterv1.Machine {
  1708  	m := &clusterv1.Machine{
  1709  		ObjectMeta: metav1.ObjectMeta{
  1710  			Namespace:    namespace,
  1711  			GenerateName: name,
  1712  		},
  1713  		Spec: clusterv1.MachineSpec{
  1714  			ClusterName: "cluster",
  1715  			Bootstrap: clusterv1.Bootstrap{
  1716  				DataSecretName: utilpointer.String("secret"),
  1717  			},
  1718  		},
  1719  	}
  1720  	g.Expect(env.CreateAndWait(ctx, m)).To(Succeed())
  1721  
  1722  	patchHelper, err := patch.NewHelper(m, env.GetClient())
  1723  	g.Expect(err).ToNot(HaveOccurred())
  1724  
  1725  	for _, opt := range append(options, withNodeRef(fmt.Sprintf("node-%s", m.Name))) {
  1726  		opt(m)
  1727  	}
  1728  
  1729  	g.Expect(patchHelper.Patch(ctx, m)).To(Succeed())
  1730  	return m
  1731  }
  1732  
  1733  func getDeletingMachine(namespace, name string, options ...machineOption) *clusterv1.Machine {
  1734  	deletionTime := metav1.Now()
  1735  	m := &clusterv1.Machine{
  1736  		ObjectMeta: metav1.ObjectMeta{
  1737  			Namespace:         namespace,
  1738  			Name:              name,
  1739  			DeletionTimestamp: &deletionTime,
  1740  		},
  1741  		Spec: clusterv1.MachineSpec{
  1742  			ClusterName: "cluster",
  1743  			Bootstrap: clusterv1.Bootstrap{
  1744  				DataSecretName: utilpointer.String("secret"),
  1745  			},
  1746  		},
  1747  	}
  1748  
  1749  	for _, opt := range append(options, withNodeRef(fmt.Sprintf("node-%s", m.Name))) {
  1750  		opt(m)
  1751  	}
  1752  	return m
  1753  }
  1754  
  1755  func assertMachineCondition(ctx context.Context, g *WithT, m *clusterv1.Machine, t clusterv1.ConditionType, status corev1.ConditionStatus, reason string, severity clusterv1.ConditionSeverity, message string) {
  1756  	g.Eventually(func() error {
  1757  		if err := env.Get(ctx, client.ObjectKey{Namespace: m.Namespace, Name: m.Name}, m); err != nil {
  1758  			return err
  1759  		}
  1760  		c := conditions.Get(m, t)
  1761  		if c == nil {
  1762  			return errors.Errorf("condition %q was nil", t)
  1763  		}
  1764  		if c.Status != status {
  1765  			return errors.Errorf("condition %q status %q did not match %q", t, c.Status, status)
  1766  		}
  1767  		if c.Reason != reason {
  1768  			return errors.Errorf("condition %q reason %q did not match %q", t, c.Reason, reason)
  1769  		}
  1770  		if c.Severity != severity {
  1771  			return errors.Errorf("condition %q severity %q did not match %q", t, c.Status, status)
  1772  		}
  1773  		if c.Message != message {
  1774  			return errors.Errorf("condition %q message %q did not match %q", t, c.Message, message)
  1775  		}
  1776  		return nil
  1777  	}, 10*time.Second).Should(Succeed())
  1778  }
  1779  
  1780  func MustMarshalRemediationData(r *RemediationData) string {
  1781  	s, err := r.Marshal()
  1782  	if err != nil {
  1783  		panic("failed to marshal remediation data")
  1784  	}
  1785  	return s
  1786  }