sigs.k8s.io/cluster-api@v1.7.1/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package machinehealthcheck

import (
	"context"
	"errors"
	"fmt"
	"sort"
	"testing"
	"time"

	"github.com/go-logr/logr"
	. "github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/apimachinery/pkg/util/uuid"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/client-go/tools/record"
	"k8s.io/utils/ptr"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/api/v1beta1/index"
	"sigs.k8s.io/cluster-api/controllers/remote"
	capierrors "sigs.k8s.io/cluster-api/errors"
	"sigs.k8s.io/cluster-api/internal/test/builder"
	"sigs.k8s.io/cluster-api/internal/webhooks"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/patch"
)

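// TestMachineHealthCheck_Reconcile exercises the MachineHealthCheck controller
// against the shared test environment. The package-level env, ctx and timeout
// fixtures are assumed to be initialized by the suite's TestMain (typically in
// suite_test.go); each subtest creates its own Cluster, MachineHealthCheck and
// Machines, then asserts on the eventually-observed MHC status.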
func TestMachineHealthCheck_Reconcile(t *testing.T) {
	ns, err := env.CreateNamespace(ctx, "test-mhc")
	if err != nil {
		t.Fatal(err)
	}
	defer func() {
		if err := env.Delete(ctx, ns); err != nil {
			t.Fatal(err)
		}
	}()

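	// Note: createCluster, newMachineHealthCheck and createMachinesWithNodes are
	// test helpers defined elsewhere in this package; the functional options
	// passed to createMachinesWithNodes (count, nodeStatus, machineLabels, ...)
	// configure how many Machine/Node pairs are created and in what state.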
	t.Run("it should ensure the correct cluster-name label when no existing labels exist", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.Labels = map[string]string{}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		g.Eventually(func() map[string]string {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return mhc.GetLabels()
		}).Should(HaveKeyWithValue(clusterv1.ClusterNameLabel, cluster.Name))
	})

	t.Run("it should ensure the correct cluster-name label when the label has the wrong value", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.Labels = map[string]string{
			clusterv1.ClusterNameLabel: "wrong-cluster",
		}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		g.Eventually(func() map[string]string {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return mhc.GetLabels()
		}).Should(HaveKeyWithValue(clusterv1.ClusterNameLabel, cluster.Name))
	})

	t.Run("it should ensure the correct cluster-name label when other labels are present", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.Labels = map[string]string{
			"extra-label": "1",
		}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		g.Eventually(func() map[string]string {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return mhc.GetLabels()
		}).Should(And(
			HaveKeyWithValue(clusterv1.ClusterNameLabel, cluster.Name),
			HaveKeyWithValue("extra-label", "1"),
			HaveLen(2),
		))
	})

	t.Run("it should ensure an owner reference is present when no existing ones exist", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.OwnerReferences = []metav1.OwnerReference{}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		g.Eventually(func() []metav1.OwnerReference {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				fmt.Printf("error retrieving mhc: %v\n", err)
				return nil
			}
			return mhc.GetOwnerReferences()
		}, timeout, 100*time.Millisecond).Should(And(
			HaveLen(1),
			ContainElement(ownerReferenceForCluster(ctx, g, cluster)),
		))
	})
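	// Note: the explicit (timeout, 100*time.Millisecond) arguments to Eventually
	// above override Gomega's default timeout and polling interval; timeout is
	// assumed to be a package-level constant shared by these tests.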

	t.Run("it should ensure an owner reference is present when modifying existing ones", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.OwnerReferences = []metav1.OwnerReference{
			{Kind: "Foo", APIVersion: "foo.bar.baz/v1", Name: "Bar", UID: "12345"},
		}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		g.Eventually(func() []metav1.OwnerReference {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return mhc.GetOwnerReferences()
		}, timeout, 100*time.Millisecond).Should(And(
			ContainElements(
				metav1.OwnerReference{Kind: "Foo", APIVersion: "foo.bar.baz/v1", Name: "Bar", UID: "12345"},
				ownerReferenceForCluster(ctx, g, cluster)),
			HaveLen(2),
		))
	})

	t.Run("it ignores Machines not matching the label selector", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines matching the MHC's label selector.
		_, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Healthy nodes and machines NOT matching the MHC's label selector.
		_, _, cleanup2 := createMachinesWithNodes(g, cluster,
			count(2),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
		)
		defer cleanup2()

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}, 5*time.Second, 100*time.Millisecond).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    2,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})
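	// Note: MatchMachineHealthCheckStatus is a custom Gomega matcher from this
	// package's test helpers; it is assumed to compare the status fields while
	// tolerating non-deterministic details such as condition timestamps.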

	t.Run("it doesn't mark anything unhealthy when cluster infrastructure is not ready", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		patchHelper, err := patch.NewHelper(cluster, env.Client)
		g.Expect(err).ToNot(HaveOccurred())

		conditions.MarkFalse(cluster, clusterv1.InfrastructureReadyCondition, "SomeReason", clusterv1.ConditionSeverityError, "")
		g.Expect(patchHelper.Patch(ctx, cluster)).To(Succeed())

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    2,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})
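	// Note: the expectation above is that an InfrastructureReady=False cluster
	// suppresses unhealthy-marking; the node-startup timeout presumably only
	// starts counting once the cluster infrastructure is ready, so the Machines
	// stay counted as healthy even without Node status conditions.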

	t.Run("it doesn't mark anything unhealthy when all Machines are healthy", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    2,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})

	t.Run("it marks unhealthy machines for remediation when there is one unhealthy Machine", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Unhealthy nodes and machines.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(1),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionUnknown),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})

	t.Run("it marks unhealthy machines for remediation when a Machine has a failure reason", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Machine with failure reason.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(1),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
			machineFailureReason("some failure"),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})

	t.Run("it marks unhealthy machines for remediation when a Machine has a failure message", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Machine with failure message.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(1),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
			machineFailureMessage("some failure"),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})

	t.Run("it marks machines unhealthy but blocks remediation when the unhealthy Machines exceed MaxUnhealthy", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		maxUnhealthy := intstr.Parse("40%")
		mhc.Spec.MaxUnhealthy = &maxUnhealthy

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(1),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Unhealthy nodes and machines.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(2),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionUnknown),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      1,
			RemediationsAllowed: 0,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:     clusterv1.RemediationAllowedCondition,
					Status:   corev1.ConditionFalse,
					Severity: clusterv1.ConditionSeverityWarning,
					Reason:   clusterv1.TooManyUnhealthyReason,
					Message:  "Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: 3, unhealthy: 2, maxUnhealthy: 40%)",
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(2))

		// Calculate how many Machines have been remediated.
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsTrue(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) {
					remediated++
				}
			}
			return
		}).Should(Equal(0))
	})
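	// Worked example of the maxUnhealthy math above (a sketch of the controller's
	// arithmetic, which rounds the percentage down): with 3 expected machines and
	// maxUnhealthy=40%, at most int(3*0.4)=1 machine may be unhealthy. Two are
	// unhealthy, so remediation is blocked: RemediationsAllowed drops to 0 and
	// the unhealthy Machines get MachineHealthCheckSucceededCondition=False but
	// never MachineOwnerRemediatedCondition.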

	t.Run("it marks unhealthy machines for remediation when the number of unhealthy machines is within unhealthyRange", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		unhealthyRange := "[1-3]"
		mhc.Spec.UnhealthyRange = &unhealthyRange

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Unhealthy nodes and machines.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(1),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionUnknown),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))
	})
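	// Note: when set, unhealthyRange takes precedence over maxUnhealthy; the
	// inclusive range "[1-3]" permits remediation while 1 to 3 machines are
	// unhealthy. RemediationsAllowed=2 above is read here as the remaining
	// headroom within the range (upper bound 3 minus 1 currently unhealthy).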

	t.Run("it marks machines unhealthy but blocks remediation when the number of unhealthy Machines is not within UnhealthyRange", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		unhealthyRange := "[3-5]"
		mhc.Spec.UnhealthyRange = &unhealthyRange

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(1),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Unhealthy nodes and machines.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(2),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionUnknown),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      1,
			RemediationsAllowed: 0,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:     clusterv1.RemediationAllowedCondition,
					Status:   corev1.ConditionFalse,
					Severity: clusterv1.ConditionSeverityWarning,
					Reason:   clusterv1.TooManyUnhealthyReason,
					Message:  "Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: 3, unhealthy: 2, unhealthyRange: [3-5])",
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(2))

		// Calculate how many Machines have been remediated.
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil {
					remediated++
				}
			}
			return
		}).Should(Equal(0))
	})

	t.Run("when a Machine has no Node ref for less than the NodeStartupTimeout", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		// After the cluster exists, we have to set the infrastructure ready condition; otherwise, MachineHealthChecks
		// will never fail when nodeStartupTimeout is exceeded.
		patchHelper, err := patch.NewHelper(cluster, env.GetClient())
		g.Expect(err).ToNot(HaveOccurred())

		conditions.MarkTrue(cluster, clusterv1.InfrastructureReadyCondition)
		g.Expect(patchHelper.Patch(ctx, cluster)).To(Succeed())

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.Spec.NodeStartupTimeout = &metav1.Duration{Duration: 5 * time.Hour}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Unhealthy nodes and machines.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(1),
			createNodeRefForMachine(false),
			nodeStatus(corev1.ConditionUnknown),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(0))

		// Calculate how many Machines have been remediated.
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsTrue(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) {
					remediated++
				}
			}
			return
		}).Should(Equal(0))
	})
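	// Note: the Machine without a NodeRef above is neither healthy nor failed:
	// with a 5h NodeStartupTimeout it simply has not timed out yet, so it counts
	// toward ExpectedMachines but not CurrentHealthy, and nothing is marked
	// unhealthy or remediated.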

	t.Run("when a Machine has no Node ref for longer than the NodeStartupTimeout", func(t *testing.T) {
		// FIXME: Resolve flaky/failing test.
		t.Skip("skipping until made stable")
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		mhc.Spec.NodeStartupTimeout = &metav1.Duration{Duration: time.Second}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		_, machines, cleanup1 := createMachinesWithNodes(g, cluster,
			count(2),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup1()
		// Unhealthy nodes and machines.
		_, unhealthyMachines, cleanup2 := createMachinesWithNodes(g, cluster,
			count(1),
			createNodeRefForMachine(false),
			nodeStatus(corev1.ConditionUnknown),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup2()
		machines = append(machines, unhealthyMachines...)

		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the MHC status matches. We have two healthy machines and
		// one unhealthy.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				fmt.Printf("error retrieving mhc: %v\n", err)
				return nil
			}
			return &mhc.Status
		}, timeout, 100*time.Millisecond).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				fmt.Printf("error retrieving list: %v\n", err)
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}, timeout, 100*time.Millisecond).Should(Equal(1))

		// Calculate how many Machines have been remediated.
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil {
					remediated++
				}
			}
			return
		}, timeout, 100*time.Millisecond).Should(Equal(1))
	})

	t.Run("when a Machine's Node has gone away", func(t *testing.T) {
		// FIXME: Resolve flaky/failing test.
		t.Skip("skipping until made stable")
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		nodes, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(3),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Forcibly remove the last machine's node.
		g.Eventually(func() bool {
			nodeToBeRemoved := nodes[2]
			if err := env.Delete(ctx, nodeToBeRemoved); err != nil {
				return apierrors.IsNotFound(err)
			}
			return apierrors.IsNotFound(env.Get(ctx, util.ObjectKey(nodeToBeRemoved), nodeToBeRemoved))
		}).Should(BeTrue())

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    3,
			CurrentHealthy:      2,
			RemediationsAllowed: 2,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(1))

		// Calculate how many Machines have been remediated.
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil {
					remediated++
				}
			}
			return
		}, timeout, 100*time.Millisecond).Should(Equal(1))
	})

	t.Run("should react when a Node transitions to unhealthy", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		nodes, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(1),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    1,
			CurrentHealthy:      1,
			RemediationsAllowed: 1,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Transition the node to unhealthy.
		node := nodes[0]
		nodePatch := client.MergeFrom(node.DeepCopy())
		node.Status.Conditions = []corev1.NodeCondition{
			{
				Type:               corev1.NodeReady,
				Status:             corev1.ConditionUnknown,
				LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)),
			},
		}
		g.Expect(env.Status().Patch(ctx, node, nodePatch)).To(Succeed())

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:   1,
			CurrentHealthy:     0,
			ObservedGeneration: 1,
			Targets:            targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(1))

		// Calculate how many Machines have been marked for remediation.
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) {
					remediated++
				}
			}
			return
		}).Should(Equal(1))
	})
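	// Note: "marked for remediation" above means MachineOwnerRemediatedCondition
	// is set to False, which is how the MHC asks the owning controller (e.g. a
	// MachineSet) to remediate the Machine; the owner then deletes or otherwise
	// remediates it, as the next subtest demonstrates.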

	t.Run("when in a MachineSet, unhealthy machines should be deleted", func(t *testing.T) {
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		// Create 1 control plane machine so MHC can proceed.
		_, _, cleanup := createMachinesWithNodes(g, cluster,
			count(1),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
		)
		defer cleanup()

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
		// Create infrastructure template resource.
		infraResource := map[string]interface{}{
			"kind":       "GenericInfrastructureMachine",
			"apiVersion": "infrastructure.cluster.x-k8s.io/v1beta1",
			"metadata":   map[string]interface{}{},
			"spec": map[string]interface{}{
				"size": "3xlarge",
			},
		}
		infraTmpl := &unstructured.Unstructured{
			Object: map[string]interface{}{
				"spec": map[string]interface{}{
					"template": infraResource,
				},
			},
		}
		infraTmpl.SetKind("GenericInfrastructureMachineTemplate")
		infraTmpl.SetAPIVersion("infrastructure.cluster.x-k8s.io/v1beta1")
		infraTmpl.SetGenerateName("mhc-ms-template-")
		infraTmpl.SetNamespace(mhc.Namespace)

		g.Expect(env.Create(ctx, infraTmpl)).To(Succeed())

		machineSet := &clusterv1.MachineSet{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "mhc-ms-",
				Namespace:    mhc.Namespace,
			},
			Spec: clusterv1.MachineSetSpec{
				ClusterName: cluster.Name,
				Replicas:    ptr.To[int32](1),
				Selector:    mhc.Spec.Selector,
				Template: clusterv1.MachineTemplateSpec{
					ObjectMeta: clusterv1.ObjectMeta{
						Labels: mhc.Spec.Selector.MatchLabels,
					},
					Spec: clusterv1.MachineSpec{
						ClusterName: cluster.Name,
						Bootstrap: clusterv1.Bootstrap{
							DataSecretName: ptr.To("test-data-secret-name"),
						},
						InfrastructureRef: corev1.ObjectReference{
							APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1",
							Kind:       "GenericInfrastructureMachineTemplate",
							Name:       infraTmpl.GetName(),
						},
					},
				},
			},
		}

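		// Run the MachineSet defaulting logic directly, under a synthetic
		// admission request context, so that defaulted fields are populated
		// before Create; this stands in for what the defaulting webhook would
		// do on admission.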
		reqCtx := admission.NewContextWithRequest(ctx, admission.Request{})
		g.Expect((&webhooks.MachineSet{}).Default(reqCtx, machineSet)).Should(Succeed())
		g.Expect(env.Create(ctx, machineSet)).To(Succeed())

		// Ensure machines have been created.
		g.Eventually(func() int {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}
			return len(machines.Items)
		}, timeout, 100*time.Millisecond).Should(Equal(1))

		// Create the MachineHealthCheck instance.
		mhc.Spec.NodeStartupTimeout = &metav1.Duration{Duration: time.Second}

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		// Defer cleanup for all the objects that have been created.
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc, infraTmpl, machineSet)

		// Pause the MachineSet reconciler to delay the deletion of the
		// Machine, because the MachineSet controller deletes the Machine when
		// it is marked unhealthy by MHC.
		machineSetPatch := client.MergeFrom(machineSet.DeepCopy())
		machineSet.Annotations = map[string]string{
			clusterv1.PausedAnnotation: "",
		}
		g.Expect(env.Patch(ctx, machineSet, machineSetPatch)).To(Succeed())

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}, timeout, 100*time.Millisecond).Should(Equal(1))

		// Calculate how many Machines should be remediated.
		var unhealthyMachine *clusterv1.Machine
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil {
					unhealthyMachine = machines.Items[i].DeepCopy()
					remediated++
				}
			}
			return
		}, timeout, 100*time.Millisecond).Should(Equal(1))

		// Unpause the MachineSet reconciler.
		machineSetPatch = client.MergeFrom(machineSet.DeepCopy())
		delete(machineSet.Annotations, clusterv1.PausedAnnotation)
		g.Expect(env.Patch(ctx, machineSet, machineSetPatch)).To(Succeed())

		// Make sure the Machine gets deleted.
		g.Eventually(func() bool {
			machine := unhealthyMachine.DeepCopy()
			err := env.Get(ctx, util.ObjectKey(unhealthyMachine), machine)
			return apierrors.IsNotFound(err) || !machine.DeletionTimestamp.IsZero()
		}, timeout, 100*time.Millisecond).Should(BeTrue())
	})

	t.Run("when a machine is paused", func(t *testing.T) {
		// FIXME: Resolve flaky/failing test.
		t.Skip("skipping until made stable")
		g := NewWithT(t)
		cluster := createCluster(g, ns.Name)

		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)

		g.Expect(env.Create(ctx, mhc)).To(Succeed())
		defer func(do ...client.Object) {
			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
		}(cluster, mhc)

		// Healthy nodes and machines.
		nodes, machines, cleanup := createMachinesWithNodes(g, cluster,
			count(1),
			firstMachineAsControlPlane(),
			createNodeRefForMachine(true),
			nodeStatus(corev1.ConditionTrue),
			machineLabels(mhc.Spec.Selector.MatchLabels),
		)
		defer cleanup()
		targetMachines := make([]string, len(machines))
		for i, m := range machines {
			targetMachines[i] = m.Name
		}
		sort.Strings(targetMachines)

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:   1,
			CurrentHealthy:     1,
			ObservedGeneration: 1,
			Targets:            targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Pause the machine.
		machinePatch := client.MergeFrom(machines[0].DeepCopy())
		machines[0].Annotations = map[string]string{
			clusterv1.PausedAnnotation: "",
		}
		g.Expect(env.Patch(ctx, machines[0], machinePatch)).To(Succeed())

		// Transition the node to unhealthy.
		node := nodes[0]
		nodePatch := client.MergeFrom(node.DeepCopy())
		node.Status.Conditions = []corev1.NodeCondition{
			{
				Type:               corev1.NodeReady,
				Status:             corev1.ConditionUnknown,
				LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)),
			},
		}
		g.Expect(env.Status().Patch(ctx, node, nodePatch)).To(Succeed())

		// Make sure the status matches.
		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
			if err != nil {
				return nil
			}
			return &mhc.Status
		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
			ExpectedMachines:    1,
			CurrentHealthy:      0,
			RemediationsAllowed: 0,
			ObservedGeneration:  1,
			Targets:             targetMachines,
			Conditions: clusterv1.Conditions{
				{
					Type:   clusterv1.RemediationAllowedCondition,
					Status: corev1.ConditionTrue,
				},
			},
		}))

		// Calculate how many Machines have health check succeeded = false.
		g.Eventually(func() (unhealthy int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
					unhealthy++
				}
			}
			return
		}).Should(Equal(1))

		// Calculate how many Machines have been remediated.
		g.Eventually(func() (remediated int) {
			machines := &clusterv1.MachineList{}
			err := env.List(ctx, machines, client.MatchingLabels{
				"selector": mhc.Spec.Selector.MatchLabels["selector"],
			})
			if err != nil {
				return -1
			}

			for i := range machines.Items {
				if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil {
					remediated++
				}
			}
			return
		}).Should(Equal(0))
	})

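	// External remediation: when spec.remediationTemplate is set, the MHC is
	// expected to create a remediation request from the referenced template for
	// an unhealthy Machine instead of setting MachineOwnerRemediatedCondition.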
  1464  	t.Run("When remediationTemplate is set and node transitions to unhealthy, new Remediation Request should be created", func(t *testing.T) {
  1465  		g := NewWithT(t)
  1466  		cluster := createCluster(g, ns.Name)
  1467  
  1468  		// Create remediation template resource.
  1469  		infraRemediationResource := map[string]interface{}{
  1470  			"kind":       "GenericExternalRemediation",
  1471  			"apiVersion": builder.RemediationGroupVersion.String(),
  1472  			"metadata":   map[string]interface{}{},
  1473  			"spec": map[string]interface{}{
  1474  				"size": "3xlarge",
  1475  			},
  1476  		}
  1477  		infraRemediationTmpl := &unstructured.Unstructured{
  1478  			Object: map[string]interface{}{
  1479  				"spec": map[string]interface{}{
  1480  					"template": infraRemediationResource,
  1481  				},
  1482  			},
  1483  		}
  1484  		infraRemediationTmpl.SetKind("GenericExternalRemediationTemplate")
  1485  		infraRemediationTmpl.SetAPIVersion(builder.RemediationGroupVersion.String())
  1486  		infraRemediationTmpl.SetGenerateName("remediation-template-name-")
  1487  		infraRemediationTmpl.SetNamespace(cluster.Namespace)
  1488  		g.Expect(env.Create(ctx, infraRemediationTmpl)).To(Succeed())
  1489  
  1490  		remediationTemplate := &corev1.ObjectReference{
  1491  			APIVersion: builder.RemediationGroupVersion.String(),
  1492  			Kind:       "GenericExternalRemediationTemplate",
  1493  			Name:       infraRemediationTmpl.GetName(),
  1494  		}
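        		// Setting spec.remediationTemplate switches the MHC to external
        		// remediation: instead of deleting the unhealthy Machine, the
        		// controller is expected to create a remediation request cloned from
        		// this template, which the end of this test verifies.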
  1495  
  1496  		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
  1497  		mhc.Spec.RemediationTemplate = remediationTemplate
  1498  		g.Expect(env.Create(ctx, mhc)).To(Succeed())
  1499  		defer func(do ...client.Object) {
  1500  			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
  1501  		}(cluster, mhc, infraRemediationTmpl)
  1502  
  1503  		// Healthy nodes and machines.
  1504  		nodes, machines, cleanup := createMachinesWithNodes(g, cluster,
  1505  			count(1),
  1506  			firstMachineAsControlPlane(),
  1507  			createNodeRefForMachine(true),
  1508  			nodeStatus(corev1.ConditionTrue),
  1509  			machineLabels(mhc.Spec.Selector.MatchLabels),
  1510  		)
  1511  		defer cleanup()
  1512  		targetMachines := make([]string, len(machines))
  1513  		for i, m := range machines {
  1514  			targetMachines[i] = m.Name
  1515  		}
  1516  		sort.Strings(targetMachines)
  1517  
  1518  		// Make sure the status matches.
  1519  		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
  1520  			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
  1521  			if err != nil {
  1522  				return nil
  1523  			}
  1524  			return &mhc.Status
  1525  		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
  1526  			ExpectedMachines:    1,
  1527  			CurrentHealthy:      1,
  1528  			RemediationsAllowed: 1,
  1529  			ObservedGeneration:  1,
  1530  			Targets:             targetMachines,
  1531  			Conditions: clusterv1.Conditions{
  1532  				{
  1533  					Type:   clusterv1.RemediationAllowedCondition,
  1534  					Status: corev1.ConditionTrue,
  1535  				},
  1536  			},
  1537  		}))
  1538  
  1539  		// Transition the node to unhealthy.
  1540  		node := nodes[0]
  1541  		nodePatch := client.MergeFrom(node.DeepCopy())
  1542  		node.Status.Conditions = []corev1.NodeCondition{
  1543  			{
  1544  				Type:               corev1.NodeReady,
  1545  				Status:             corev1.ConditionUnknown,
  1546  				LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)),
  1547  			},
  1548  		}
  1549  		g.Expect(env.Status().Patch(ctx, node, nodePatch)).To(Succeed())
  1550  
  1551  		// Make sure the status matches.
  1552  		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
  1553  			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
  1554  			if err != nil {
  1555  				return nil
  1556  			}
  1557  			return &mhc.Status
  1558  		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
  1559  			ExpectedMachines:    1,
  1560  			CurrentHealthy:      0,
  1561  			RemediationsAllowed: 0,
  1562  			ObservedGeneration:  1,
  1563  			Targets:             targetMachines,
  1564  			Conditions: clusterv1.Conditions{
  1565  				{
  1566  					Type:   clusterv1.RemediationAllowedCondition,
  1567  					Status: corev1.ConditionTrue,
  1568  				},
  1569  			},
  1570  		}))
  1571  
  1572  		// Count the Machines whose HealthCheckSucceeded condition is False.
  1573  		g.Eventually(func() (unhealthy int) {
  1574  			machines := &clusterv1.MachineList{}
  1575  			err := env.List(ctx, machines, client.MatchingLabels{
  1576  				"selector": mhc.Spec.Selector.MatchLabels["selector"],
  1577  			})
  1578  			if err != nil {
  1579  				return -1
  1580  			}
  1581  
  1582  			for i := range machines.Items {
  1583  				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
  1584  					unhealthy++
  1585  				}
  1586  			}
  1587  			return
  1588  		}).Should(Equal(1))
  1589  
  1590  		ref := corev1.ObjectReference{
  1591  			APIVersion: builder.RemediationGroupVersion.String(),
  1592  			Kind:       "GenericExternalRemediation",
  1593  		}
  1594  
  1595  		obj := util.ObjectReferenceToUnstructured(ref)
  1596  		// Make sure the Remediation Request is created.
  1597  		g.Eventually(func() *unstructured.Unstructured {
  1598  			key := client.ObjectKey{
  1599  				Namespace: machines[0].Namespace,
  1600  				Name:      machines[0].Name,
  1601  			}
  1602  			err := env.Get(ctx, key, obj)
  1603  			if err != nil {
  1604  				return nil
  1605  			}
  1606  			return obj
  1607  		}, timeout, 100*time.Millisecond).ShouldNot(BeNil())
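        		// The remediation request should carry the unhealthy Machine as its
        		// owner, which (per standard Kubernetes garbage collection) ties the
        		// request's lifecycle to the Machine's.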
  1608  		g.Expect(obj.GetOwnerReferences()).To(HaveLen(1))
  1609  		g.Expect(obj.GetOwnerReferences()[0].Name).To(Equal(machines[0].Name))
  1610  	})
  1611  
  1612  	t.Run("When remediationTemplate is set and node transitions back to healthy, the Remediation Request should be deleted", func(t *testing.T) {
  1613  		g := NewWithT(t)
  1614  		cluster := createCluster(g, ns.Name)
  1615  
  1616  		// Create remediation template resource.
  1617  		infraRemediationResource := map[string]interface{}{
  1618  			"kind":       "GenericExternalRemediation",
  1619  			"apiVersion": builder.RemediationGroupVersion.String(),
  1620  			"metadata":   map[string]interface{}{},
  1621  			"spec": map[string]interface{}{
  1622  				"size": "3xlarge",
  1623  			},
  1624  		}
  1625  		infraRemediationTmpl := &unstructured.Unstructured{
  1626  			Object: map[string]interface{}{
  1627  				"spec": map[string]interface{}{
  1628  					"template": infraRemediationResource,
  1629  				},
  1630  			},
  1631  		}
  1632  		infraRemediationTmpl.SetKind("GenericExternalRemediationTemplate")
  1633  		infraRemediationTmpl.SetAPIVersion(builder.RemediationGroupVersion.String())
  1634  		infraRemediationTmpl.SetGenerateName("remediation-template-name-")
  1635  		infraRemediationTmpl.SetNamespace(cluster.Namespace)
  1636  		g.Expect(env.Create(ctx, infraRemediationTmpl)).To(Succeed())
  1637  
  1638  		remediationTemplate := &corev1.ObjectReference{
  1639  			APIVersion: builder.RemediationGroupVersion.String(),
  1640  			Kind:       "GenericExternalRemediationTemplate",
  1641  			Name:       infraRemediationTmpl.GetName(),
  1642  		}
  1643  
  1644  		mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name)
  1645  		mhc.Spec.RemediationTemplate = remediationTemplate
  1646  		g.Expect(env.Create(ctx, mhc)).To(Succeed())
  1647  		defer func(do ...client.Object) {
  1648  			g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
  1649  		}(cluster, mhc, infraRemediationTmpl)
  1650  
  1651  		// Healthy nodes and machines.
  1652  		nodes, machines, cleanup := createMachinesWithNodes(g, cluster,
  1653  			count(1),
  1654  			firstMachineAsControlPlane(),
  1655  			createNodeRefForMachine(true),
  1656  			nodeStatus(corev1.ConditionTrue),
  1657  			machineLabels(mhc.Spec.Selector.MatchLabels),
  1658  		)
  1659  		defer cleanup()
  1660  		targetMachines := make([]string, len(machines))
  1661  		for i, m := range machines {
  1662  			targetMachines[i] = m.Name
  1663  		}
  1664  		sort.Strings(targetMachines)
  1665  
  1666  		// Make sure the status matches.
  1667  		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
  1668  			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
  1669  			if err != nil {
  1670  				return nil
  1671  			}
  1672  			return &mhc.Status
  1673  		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
  1674  			ExpectedMachines:    1,
  1675  			CurrentHealthy:      1,
  1676  			RemediationsAllowed: 1,
  1677  			ObservedGeneration:  1,
  1678  			Targets:             targetMachines,
  1679  			Conditions: clusterv1.Conditions{
  1680  				{
  1681  					Type:   clusterv1.RemediationAllowedCondition,
  1682  					Status: corev1.ConditionTrue,
  1683  				},
  1684  			},
  1685  		}))
  1686  
  1687  		// Transition the node to unhealthy.
  1688  		node := nodes[0]
  1689  		nodePatch := client.MergeFrom(node.DeepCopy())
  1690  		node.Status.Conditions = []corev1.NodeCondition{
  1691  			{
  1692  				Type:               corev1.NodeReady,
  1693  				Status:             corev1.ConditionUnknown,
  1694  				LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)),
  1695  			},
  1696  		}
  1697  		g.Expect(env.Status().Patch(ctx, node, nodePatch)).To(Succeed())
  1698  
  1699  		// Make sure the status matches.
  1700  		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
  1701  			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
  1702  			if err != nil {
  1703  				return nil
  1704  			}
  1705  			return &mhc.Status
  1706  		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
  1707  			ExpectedMachines:    1,
  1708  			CurrentHealthy:      0,
  1709  			RemediationsAllowed: 0,
  1710  			ObservedGeneration:  1,
  1711  			Targets:             targetMachines,
  1712  			Conditions: clusterv1.Conditions{
  1713  				{
  1714  					Type:   clusterv1.RemediationAllowedCondition,
  1715  					Status: corev1.ConditionTrue,
  1716  				},
  1717  			},
  1718  		}))
  1719  
  1720  		// Count the Machines whose HealthCheckSucceeded condition is False.
  1721  		g.Eventually(func() (unhealthy int) {
  1722  			machines := &clusterv1.MachineList{}
  1723  			err := env.List(ctx, machines, client.MatchingLabels{
  1724  				"selector": mhc.Spec.Selector.MatchLabels["selector"],
  1725  			})
  1726  			if err != nil {
  1727  				return -1
  1728  			}
  1729  
  1730  			for i := range machines.Items {
  1731  				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
  1732  					unhealthy++
  1733  				}
  1734  			}
  1735  			return
  1736  		}).Should(Equal(1))
  1737  
  1738  		// Transition the node back to healthy.
  1739  		node = nodes[0]
  1740  		nodePatch = client.MergeFrom(node.DeepCopy())
  1741  		node.Status.Conditions = []corev1.NodeCondition{
  1742  			{
  1743  				Type:               corev1.NodeReady,
  1744  				Status:             corev1.ConditionTrue,
  1745  				LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)),
  1746  			},
  1747  		}
  1748  		g.Expect(env.Status().Patch(ctx, node, nodePatch)).To(Succeed())
  1749  
  1750  		// Make sure the status matches.
  1751  		g.Eventually(func() *clusterv1.MachineHealthCheckStatus {
  1752  			err := env.Get(ctx, util.ObjectKey(mhc), mhc)
  1753  			if err != nil {
  1754  				return nil
  1755  			}
  1756  			return &mhc.Status
  1757  		}).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{
  1758  			ExpectedMachines:    1,
  1759  			CurrentHealthy:      1,
  1760  			RemediationsAllowed: 1,
  1761  			ObservedGeneration:  1,
  1762  			Targets:             targetMachines,
  1763  			Conditions: clusterv1.Conditions{
  1764  				{
  1765  					Type:   clusterv1.RemediationAllowedCondition,
  1766  					Status: corev1.ConditionTrue,
  1767  				},
  1768  			},
  1769  		}))
  1770  
  1771  		// Count the Machines whose HealthCheckSucceeded condition is False.
  1772  		g.Eventually(func() (unhealthy int) {
  1773  			machines := &clusterv1.MachineList{}
  1774  			err := env.List(ctx, machines, client.MatchingLabels{
  1775  				"selector": mhc.Spec.Selector.MatchLabels["selector"],
  1776  			})
  1777  			if err != nil {
  1778  				return -1
  1779  			}
  1780  
  1781  			for i := range machines.Items {
  1782  				if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) {
  1783  					unhealthy++
  1784  				}
  1785  			}
  1786  			return
  1787  		}).Should(Equal(0))
  1788  
  1789  		ref := corev1.ObjectReference{
  1790  			APIVersion: builder.RemediationGroupVersion.String(),
  1791  			Kind:       "GenericExternalRemediation",
  1792  		}
  1793  
  1794  		obj := util.ObjectReferenceToUnstructured(ref)
  1795  		// Make sure the Remediation Request is deleted.
  1796  		g.Eventually(func() *unstructured.Unstructured {
  1797  			key := client.ObjectKey{
  1798  				Namespace: machines[0].Namespace,
  1799  				Name:      machines[0].Name,
  1800  			}
  1801  			err := env.Get(ctx, key, obj)
  1802  			if err != nil {
  1803  				return nil
  1804  			}
  1805  			return obj
  1806  		}, timeout, 100*time.Millisecond).Should(BeNil())
  1807  	})
  1808  }
  1809  
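        // TestClusterToMachineHealthCheck verifies that the clusterToMachineHealthCheck
        // mapper returns one reconcile.Request per MachineHealthCheck targeting the given
        // Cluster in the same namespace, and none for MachineHealthChecks that belong to
        // another Cluster or another namespace.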
  1810  func TestClusterToMachineHealthCheck(t *testing.T) {
  1811  	fakeClient := fake.NewClientBuilder().Build()
  1812  
  1813  	r := &Reconciler{
  1814  		Client: fakeClient,
  1815  	}
  1816  
  1817  	namespace := metav1.NamespaceDefault
  1818  	clusterName := testClusterName
  1819  	labels := make(map[string]string)
  1820  
  1821  	mhc1 := newMachineHealthCheckWithLabels("mhc1", namespace, clusterName, labels)
  1822  	mhc1Req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: mhc1.Namespace, Name: mhc1.Name}}
  1823  	mhc2 := newMachineHealthCheckWithLabels("mhc2", namespace, clusterName, labels)
  1824  	mhc2Req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: mhc2.Namespace, Name: mhc2.Name}}
  1825  	mhc3 := newMachineHealthCheckWithLabels("mhc3", namespace, "othercluster", labels)
  1826  	mhc4 := newMachineHealthCheckWithLabels("mhc4", "othernamespace", clusterName, labels)
  1827  	cluster1 := &clusterv1.Cluster{
  1828  		ObjectMeta: metav1.ObjectMeta{
  1829  			Name:      clusterName,
  1830  			Namespace: namespace,
  1831  		},
  1832  	}
  1833  
  1834  	testCases := []struct {
  1835  		name     string
  1836  		toCreate []clusterv1.MachineHealthCheck
  1837  		object   client.Object
  1838  		expected []reconcile.Request
  1839  	}{
  1840  		{
  1841  			name:     "when a MachineHealthCheck exists for the Cluster in the same namespace",
  1842  			toCreate: []clusterv1.MachineHealthCheck{*mhc1},
  1843  			object:   cluster1,
  1844  			expected: []reconcile.Request{mhc1Req},
  1845  		},
  1846  		{
  1847  			name:     "when 2 MachineHealthChecks exist for the Cluster in the same namespace",
  1848  			toCreate: []clusterv1.MachineHealthCheck{*mhc1, *mhc2},
  1849  			object:   cluster1,
  1850  			expected: []reconcile.Request{mhc1Req, mhc2Req},
  1851  		},
  1852  		{
  1853  			name:     "when a MachineHealthCheck exists for another Cluster in the same namespace",
  1854  			toCreate: []clusterv1.MachineHealthCheck{*mhc3},
  1855  			object:   cluster1,
  1856  			expected: []reconcile.Request{},
  1857  		},
  1858  		{
  1859  			name:     "when a MachineHealthCheck exists for another Cluster in another namespace",
  1860  			toCreate: []clusterv1.MachineHealthCheck{*mhc4},
  1861  			object:   cluster1,
  1862  			expected: []reconcile.Request{},
  1863  		},
  1864  	}
  1865  
  1866  	for _, tc := range testCases {
  1867  		t.Run(tc.name, func(t *testing.T) {
  1868  			gs := NewWithT(t)
  1869  
  1870  			for _, obj := range tc.toCreate {
  1871  				o := obj
  1872  				gs.Expect(r.Client.Create(ctx, &o)).To(Succeed())
  1873  				defer func() {
  1874  					gs.Expect(r.Client.Delete(ctx, &o)).To(Succeed())
  1875  				}()
  1876  				// Check the cache is populated
  1877  				getObj := func() error {
  1878  					return r.Client.Get(ctx, util.ObjectKey(&o), &clusterv1.MachineHealthCheck{})
  1879  				}
  1880  				gs.Eventually(getObj).Should(Succeed())
  1881  			}
  1882  
  1883  			got := r.clusterToMachineHealthCheck(ctx, tc.object)
  1884  			gs.Expect(got).To(ConsistOf(tc.expected))
  1885  		})
  1886  	}
  1887  }
  1888  
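        // TestMachineToMachineHealthCheck verifies that the machineToMachineHealthCheck
        // mapper returns one reconcile.Request per MachineHealthCheck in the Machine's
        // namespace whose selector matches the Machine's labels, and none otherwise.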
  1889  func TestMachineToMachineHealthCheck(t *testing.T) {
  1890  	fakeClient := fake.NewClientBuilder().Build()
  1891  
  1892  	r := &Reconciler{
  1893  		Client: fakeClient,
  1894  	}
  1895  
  1896  	namespace := metav1.NamespaceDefault
  1897  	clusterName := testClusterName
  1898  	nodeName := "node1"
  1899  	labels := map[string]string{"cluster": "foo", "nodepool": "bar"}
  1900  
  1901  	mhc1 := newMachineHealthCheckWithLabels("mhc1", namespace, clusterName, labels)
  1902  	mhc1Req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: mhc1.Namespace, Name: mhc1.Name}}
  1903  	mhc2 := newMachineHealthCheckWithLabels("mhc2", namespace, clusterName, labels)
  1904  	mhc2Req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: mhc2.Namespace, Name: mhc2.Name}}
  1905  	mhc3 := newMachineHealthCheckWithLabels("mhc3", namespace, clusterName, map[string]string{"cluster": "foo", "nodepool": "other"})
  1906  	mhc4 := newMachineHealthCheckWithLabels("mhc4", "othernamespace", clusterName, labels)
  1907  	machine1 := newTestMachine("machine1", namespace, clusterName, nodeName, labels)
  1908  
  1909  	testCases := []struct {
  1910  		name     string
  1911  		toCreate []clusterv1.MachineHealthCheck
  1912  		object   client.Object
  1913  		expected []reconcile.Request
  1914  	}{
  1915  		{
  1916  			name:     "when a MachineHealthCheck matches labels for the Machine in the same namespace",
  1917  			toCreate: []clusterv1.MachineHealthCheck{*mhc1},
  1918  			object:   machine1,
  1919  			expected: []reconcile.Request{mhc1Req},
  1920  		},
  1921  		{
  1922  			name:     "when 2 MachineHealthChecks match labels for the Machine in the same namespace",
  1923  			toCreate: []clusterv1.MachineHealthCheck{*mhc1, *mhc2},
  1924  			object:   machine1,
  1925  			expected: []reconcile.Request{mhc1Req, mhc2Req},
  1926  		},
  1927  		{
  1928  			name:     "when a MachineHealthCheck does not match labels for the Machine in the same namespace",
  1929  			toCreate: []clusterv1.MachineHealthCheck{*mhc3},
  1930  			object:   machine1,
  1931  			expected: []reconcile.Request{},
  1932  		},
  1933  		{
  1934  			name:     "when a MachineHealthCheck matches labels for the Machine in another namespace",
  1935  			toCreate: []clusterv1.MachineHealthCheck{*mhc4},
  1936  			object:   machine1,
  1937  			expected: []reconcile.Request{},
  1938  		},
  1939  	}
  1940  
  1941  	for _, tc := range testCases {
  1942  		t.Run(tc.name, func(t *testing.T) {
  1943  			gs := NewWithT(t)
  1944  
  1945  			for _, obj := range tc.toCreate {
  1946  				o := obj
  1947  				gs.Expect(r.Client.Create(ctx, &o)).To(Succeed())
  1948  				defer func() {
  1949  					gs.Expect(r.Client.Delete(ctx, &o)).To(Succeed())
  1950  				}()
  1951  				// Check the cache is populated
  1952  				getObj := func() error {
  1953  					return r.Client.Get(ctx, util.ObjectKey(&o), &clusterv1.MachineHealthCheck{})
  1954  				}
  1955  				gs.Eventually(getObj).Should(Succeed())
  1956  			}
  1957  
  1958  			got := r.machineToMachineHealthCheck(ctx, tc.object)
  1959  			gs.Expect(got).To(ConsistOf(tc.expected))
  1960  		})
  1961  	}
  1962  }
  1963  
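        // TestNodeToMachineHealthCheck verifies that the nodeToMachineHealthCheck mapper
        // resolves a Node to its Machine via the MachineNodeNameField index and then to
        // the MachineHealthChecks selecting that Machine; no requests are expected when
        // no Machine, or more than one Machine, claims the Node.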
  1964  func TestNodeToMachineHealthCheck(t *testing.T) {
  1965  	fakeClient := fake.NewClientBuilder().
  1966  		WithIndex(&clusterv1.Machine{}, index.MachineNodeNameField, index.MachineByNodeName).
  1967  		WithStatusSubresource(&clusterv1.MachineHealthCheck{}, &clusterv1.Machine{}).
  1968  		Build()
  1969  
  1970  	r := &Reconciler{
  1971  		Client: fakeClient,
  1972  	}
  1973  
  1974  	namespace := metav1.NamespaceDefault
  1975  	clusterName := testClusterName
  1976  	nodeName := "node1"
  1977  	labels := map[string]string{"cluster": "foo", "nodepool": "bar"}
  1978  
  1979  	mhc1 := newMachineHealthCheckWithLabels("mhc1", namespace, clusterName, labels)
  1980  	mhc1Req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: mhc1.Namespace, Name: mhc1.Name}}
  1981  	mhc2 := newMachineHealthCheckWithLabels("mhc2", namespace, clusterName, labels)
  1982  	mhc2Req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: mhc2.Namespace, Name: mhc2.Name}}
  1983  	mhc3 := newMachineHealthCheckWithLabels("mhc3", namespace, "othercluster", labels)
  1984  	mhc4 := newMachineHealthCheckWithLabels("mhc4", "othernamespace", clusterName, labels)
  1985  
  1986  	machine1 := newTestMachine("machine1", namespace, clusterName, nodeName, labels)
  1987  	machine2 := newTestMachine("machine2", namespace, clusterName, nodeName, labels)
  1988  
  1989  	node1 := &corev1.Node{
  1990  		ObjectMeta: metav1.ObjectMeta{
  1991  			Name: nodeName,
  1992  		},
  1993  	}
  1994  
  1995  	testCases := []struct {
  1996  		name        string
  1997  		mhcToCreate []clusterv1.MachineHealthCheck
  1998  		mToCreate   []clusterv1.Machine
  1999  		object      client.Object
  2000  		expected    []reconcile.Request
  2001  	}{
  2002  		{
  2003  			name:        "when no Machine exists for the Node",
  2004  			mhcToCreate: []clusterv1.MachineHealthCheck{*mhc1},
  2005  			mToCreate:   []clusterv1.Machine{},
  2006  			object:      node1,
  2007  			expected:    []reconcile.Request{},
  2008  		},
  2009  		{
  2010  			name:        "when two Machines exist for the Node",
  2011  			mhcToCreate: []clusterv1.MachineHealthCheck{*mhc1},
  2012  			mToCreate:   []clusterv1.Machine{*machine1, *machine2},
  2013  			object:      node1,
  2014  			expected:    []reconcile.Request{},
  2015  		},
  2016  		{
  2017  			name:        "when no MachineHealthCheck exists for the Node in the Machine's namespace",
  2018  			mhcToCreate: []clusterv1.MachineHealthCheck{*mhc4},
  2019  			mToCreate:   []clusterv1.Machine{*machine1},
  2020  			object:      node1,
  2021  			expected:    []reconcile.Request{},
  2022  		},
  2023  		{
  2024  			name:        "when a MachineHealthCheck exists for the Node in the Machine's namespace",
  2025  			mhcToCreate: []clusterv1.MachineHealthCheck{*mhc1},
  2026  			mToCreate:   []clusterv1.Machine{*machine1},
  2027  			object:      node1,
  2028  			expected:    []reconcile.Request{mhc1Req},
  2029  		},
  2030  		{
  2031  			name:        "when two MachineHealthChecks exist for the Node in the Machine's namespace",
  2032  			mhcToCreate: []clusterv1.MachineHealthCheck{*mhc1, *mhc2},
  2033  			mToCreate:   []clusterv1.Machine{*machine1},
  2034  			object:      node1,
  2035  			expected:    []reconcile.Request{mhc1Req, mhc2Req},
  2036  		},
  2037  		{
  2038  			name:        "when a MachineHealthCheck exists for the Node, but not in the Machine's cluster",
  2039  			mhcToCreate: []clusterv1.MachineHealthCheck{*mhc3},
  2040  			mToCreate:   []clusterv1.Machine{*machine1},
  2041  			object:      node1,
  2042  			expected:    []reconcile.Request{},
  2043  		},
  2044  	}
  2045  
  2046  	for _, tc := range testCases {
  2047  		t.Run(tc.name, func(t *testing.T) {
  2048  			gs := NewWithT(t)
  2049  
  2050  			for _, obj := range tc.mhcToCreate {
  2051  				o := obj
  2052  				gs.Expect(r.Client.Create(ctx, &o)).To(Succeed())
  2053  				defer func() {
  2054  					gs.Expect(r.Client.Delete(ctx, &o)).To(Succeed())
  2055  				}()
  2056  				// Check the cache is populated
  2057  				key := util.ObjectKey(&o)
  2058  				getObj := func() error {
  2059  					return r.Client.Get(ctx, key, &clusterv1.MachineHealthCheck{})
  2060  				}
  2061  				gs.Eventually(getObj).Should(Succeed())
  2062  			}
  2063  			for _, obj := range tc.mToCreate {
  2064  				o := obj
  2065  				gs.Expect(r.Client.Create(ctx, &o)).To(Succeed())
  2066  				defer func() {
  2067  					gs.Expect(r.Client.Delete(ctx, &o)).To(Succeed())
  2068  				}()
  2069  				// Ensure the status is set (required for matching node to machine)
  2070  				o.Status = obj.Status
  2071  				gs.Expect(r.Client.Status().Update(ctx, &o)).To(Succeed())
  2072  
  2073  				// Check the cache is up to date with the status update
  2074  				key := util.ObjectKey(&o)
  2075  				checkStatus := func() clusterv1.MachineStatus {
  2076  					m := &clusterv1.Machine{}
  2077  					err := r.Client.Get(ctx, key, m)
  2078  					if err != nil {
  2079  						return clusterv1.MachineStatus{}
  2080  					}
  2081  					return m.Status
  2082  				}
  2083  				gs.Eventually(checkStatus).Should(BeComparableTo(o.Status))
  2084  			}
  2085  
  2086  			got := r.nodeToMachineHealthCheck(ctx, tc.object)
  2087  			gs.Expect(got).To(ConsistOf(tc.expected))
  2088  		})
  2089  	}
  2090  }
  2091  
  2092  func TestIsAllowedRemediation(t *testing.T) {
  2093  	testCases := []struct {
  2094  		name               string
  2095  		maxUnhealthy       *intstr.IntOrString
  2096  		expectedMachines   int32
  2097  		currentHealthy     int32
  2098  		allowed            bool
  2099  		observedGeneration int64
  2100  	}{
  2101  		{
  2102  			name:             "when maxUnhealthy is not set",
  2103  			maxUnhealthy:     nil,
  2104  			expectedMachines: int32(3),
  2105  			currentHealthy:   int32(0),
  2106  			allowed:          false,
  2107  		},
  2108  		{
  2109  			name:             "when maxUnhealthy is not an int or percentage",
  2110  			maxUnhealthy:     &intstr.IntOrString{Type: intstr.String, StrVal: "abcdef"},
  2111  			expectedMachines: int32(5),
  2112  			currentHealthy:   int32(2),
  2113  			allowed:          false,
  2114  		},
  2115  		{
  2116  			name:             "when maxUnhealthy is an int less than current unhealthy",
  2117  			maxUnhealthy:     &intstr.IntOrString{Type: intstr.Int, IntVal: int32(1)},
  2118  			expectedMachines: int32(3),
  2119  			currentHealthy:   int32(1),
  2120  			allowed:          false,
  2121  		},
  2122  		{
  2123  			name:             "when maxUnhealthy is an int equal to current unhealthy",
  2124  			maxUnhealthy:     &intstr.IntOrString{Type: intstr.Int, IntVal: int32(2)},
  2125  			expectedMachines: int32(3),
  2126  			currentHealthy:   int32(1),
  2127  			allowed:          true,
  2128  		},
  2129  		{
  2130  			name:             "when maxUnhealthy is an int greater than current unhealthy",
  2131  			maxUnhealthy:     &intstr.IntOrString{Type: intstr.Int, IntVal: int32(3)},
  2132  			expectedMachines: int32(3),
  2133  			currentHealthy:   int32(1),
  2134  			allowed:          true,
  2135  		},
  2136  		{
  2137  			name:             "when maxUnhealthy is a percentage less than current unhealthy",
  2138  			maxUnhealthy:     &intstr.IntOrString{Type: intstr.String, StrVal: "50%"},
  2139  			expectedMachines: int32(5),
  2140  			currentHealthy:   int32(2),
  2141  			allowed:          false,
  2142  		},
  2143  		{
  2144  			name:             "when maxUnhealthy is a percentage equal to current unhealthy",
  2145  			maxUnhealthy:     &intstr.IntOrString{Type: intstr.String, StrVal: "60%"},
  2146  			expectedMachines: int32(5),
  2147  			currentHealthy:   int32(2),
  2148  			allowed:          true,
  2149  		},
  2150  		{
  2151  			name:             "when maxUnhealthy is a percentage greater than current unhealthy",
  2152  			maxUnhealthy:     &intstr.IntOrString{Type: intstr.String, StrVal: "70%"},
  2153  			expectedMachines: int32(5),
  2154  			currentHealthy:   int32(2),
  2155  			allowed:          true,
  2156  		},
  2157  	}
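        	// Worked example for the percentage rows above: with 5 expected Machines and
        	// 2 healthy, 3 are unhealthy. The rows are consistent with maxUnhealthy being
        	// scaled against ExpectedMachines and rounded down: "50%" permits 2 unhealthy
        	// Machines (remediation denied), while "60%" and "70%" permit 3 (allowed).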
  2158  
  2159  	for _, tc := range testCases {
  2160  		t.Run(tc.name, func(t *testing.T) {
  2161  			g := NewWithT(t)
  2162  
  2163  			mhc := &clusterv1.MachineHealthCheck{
  2164  				Spec: clusterv1.MachineHealthCheckSpec{
  2165  					MaxUnhealthy:       tc.maxUnhealthy,
  2166  					NodeStartupTimeout: &metav1.Duration{Duration: 1 * time.Millisecond},
  2167  				},
  2168  				Status: clusterv1.MachineHealthCheckStatus{
  2169  					ExpectedMachines:   tc.expectedMachines,
  2170  					CurrentHealthy:     tc.currentHealthy,
  2171  					ObservedGeneration: tc.observedGeneration,
  2172  				},
  2173  			}
  2174  
  2175  			remediationAllowed, _, _ := isAllowedRemediation(mhc)
  2176  			g.Expect(remediationAllowed).To(Equal(tc.allowed))
  2177  		})
  2178  	}
  2179  }
  2180  
  2181  func TestGetMaxUnhealthy(t *testing.T) {
  2182  	testCases := []struct {
  2183  		name                 string
  2184  		maxUnhealthy         *intstr.IntOrString
  2185  		expectedMaxUnhealthy int
  2186  		actualMachineCount   int32
  2187  		expectedErr          error
  2188  	}{
  2189  		{
  2190  			name:                 "when maxUnhealthy is nil",
  2191  			maxUnhealthy:         nil,
  2192  			expectedMaxUnhealthy: 0,
  2193  			actualMachineCount:   7,
  2194  			expectedErr:          errors.New("spec.maxUnhealthy must be set"),
  2195  		},
  2196  		{
  2197  			name:                 "when maxUnhealthy is not an int or percentage",
  2198  			maxUnhealthy:         &intstr.IntOrString{Type: intstr.String, StrVal: "abcdef"},
  2199  			expectedMaxUnhealthy: 0,
  2200  			actualMachineCount:   3,
  2201  			expectedErr:          errors.New("invalid value for IntOrString: invalid type: string is not a percentage"),
  2202  		},
  2203  		{
  2204  			name:                 "when maxUnhealthy is an int",
  2205  			maxUnhealthy:         &intstr.IntOrString{Type: intstr.Int, IntVal: 3},
  2206  			actualMachineCount:   2,
  2207  			expectedMaxUnhealthy: 3,
  2208  			expectedErr:          nil,
  2209  		},
  2210  		{
  2211  			name:                 "when maxUnhealthy is 40% (of 5)",
  2212  			maxUnhealthy:         &intstr.IntOrString{Type: intstr.String, StrVal: "40%"},
  2213  			actualMachineCount:   5,
  2214  			expectedMaxUnhealthy: 2,
  2215  			expectedErr:          nil,
  2216  		},
  2217  		{
  2218  			name:                 "when maxUnhealthy is 60% (of 7)",
  2219  			maxUnhealthy:         &intstr.IntOrString{Type: intstr.String, StrVal: "60%"},
  2220  			actualMachineCount:   7,
  2221  			expectedMaxUnhealthy: 4,
  2222  			expectedErr:          nil,
  2223  		},
  2224  	}
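        	// Note on the percentage rows: the value is scaled against the machine count
        	// and rounded down, so 40% of 5 yields 2 and 60% of 7 yields 4 (4.2 truncated),
        	// matching the expectations above.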
  2225  
  2226  	for _, tc := range testCases {
  2227  		t.Run(tc.name, func(t *testing.T) {
  2228  			g := NewWithT(t)
  2229  
  2230  			mhc := &clusterv1.MachineHealthCheck{
  2231  				Spec: clusterv1.MachineHealthCheckSpec{
  2232  					MaxUnhealthy: tc.maxUnhealthy,
  2233  				},
  2234  				Status: clusterv1.MachineHealthCheckStatus{
  2235  					ExpectedMachines: tc.actualMachineCount,
  2236  				},
  2237  			}
  2238  
  2239  			maxUnhealthy, err := getMaxUnhealthy(mhc)
  2240  			if tc.expectedErr != nil {
  2241  				g.Expect(err).To(MatchError(tc.expectedErr.Error()))
  2242  			} else {
  2243  				g.Expect(err).ToNot(HaveOccurred())
  2244  			}
  2245  			g.Expect(maxUnhealthy).To(Equal(tc.expectedMaxUnhealthy))
  2246  		})
  2247  	}
  2248  }
  2249  
  2250  func ownerReferenceForCluster(ctx context.Context, g *WithT, c *clusterv1.Cluster) metav1.OwnerReference {
  2251  	// Fetch the cluster to populate the UID
  2252  	cc := &clusterv1.Cluster{}
  2253  	g.Expect(env.Get(ctx, util.ObjectKey(c), cc)).To(Succeed())
  2254  
  2255  	return metav1.OwnerReference{
  2256  		APIVersion: clusterv1.GroupVersion.String(),
  2257  		Kind:       "Cluster",
  2258  		Name:       cc.Name,
  2259  		UID:        cc.UID,
  2260  	}
  2261  }
  2262  
  2263  // createCluster creates a Cluster and a kubeconfig Secret for that Cluster in the given namespace.
  2264  func createCluster(g *WithT, namespaceName string) *clusterv1.Cluster {
  2265  	cluster := &clusterv1.Cluster{
  2266  		ObjectMeta: metav1.ObjectMeta{
  2267  			GenerateName: "test-cluster-",
  2268  			Namespace:    namespaceName,
  2269  		},
  2270  	}
  2271  
  2272  	g.Expect(env.Create(ctx, cluster)).To(Succeed())
  2273  
  2274  	// Make sure the cluster is in the cache before proceeding
  2275  	g.Eventually(func() error {
  2276  		var cl clusterv1.Cluster
  2277  		return env.Get(ctx, util.ObjectKey(cluster), &cl)
  2278  	}, timeout, 100*time.Millisecond).Should(Succeed())
  2279  
  2280  	// Mark InfrastructureReady on the Cluster; this is required for the MHC to perform checks.
  2281  	patchHelper, err := patch.NewHelper(cluster, env.Client)
  2282  	g.Expect(err).ToNot(HaveOccurred())
  2283  	conditions.MarkTrue(cluster, clusterv1.InfrastructureReadyCondition)
  2284  	g.Expect(patchHelper.Patch(ctx, cluster)).To(Succeed())
  2285  
  2286  	// Wait for cluster in cache to be updated post-patch
  2287  	g.Eventually(func() bool {
  2288  		err := env.Get(ctx, util.ObjectKey(cluster), cluster)
  2289  		if err != nil {
  2290  			return false
  2291  		}
  2292  
  2293  		return conditions.IsTrue(cluster, clusterv1.InfrastructureReadyCondition)
  2294  	}, timeout, 100*time.Millisecond).Should(BeTrue())
  2295  
  2296  	g.Expect(env.CreateKubeconfigSecret(ctx, cluster)).To(Succeed())
  2297  
  2298  	return cluster
  2299  }
  2300  
  2301  // newRunningMachine creates a Machine object whose Status.Phase is Running.
  2302  func newRunningMachine(c *clusterv1.Cluster, labels map[string]string) *clusterv1.Machine {
  2303  	return &clusterv1.Machine{
  2304  		TypeMeta: metav1.TypeMeta{
  2305  			APIVersion: clusterv1.GroupVersion.String(),
  2306  			Kind:       "Machine",
  2307  		},
  2308  		ObjectMeta: metav1.ObjectMeta{
  2309  			GenerateName: "test-mhc-machine-",
  2310  			Namespace:    c.Namespace,
  2311  			Labels:       labels,
  2312  		},
  2313  		Spec: clusterv1.MachineSpec{
  2314  			ClusterName: c.Name,
  2315  			Bootstrap: clusterv1.Bootstrap{
  2316  				DataSecretName: ptr.To("data-secret-name"),
  2317  			},
  2318  		},
  2319  		Status: clusterv1.MachineStatus{
  2320  			InfrastructureReady: true,
  2321  			BootstrapReady:      true,
  2322  			Phase:               string(clusterv1.MachinePhaseRunning),
  2323  			ObservedGeneration:  1,
  2324  		},
  2325  	}
  2326  }
  2327  
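        // newInfraMachine returns an unstructured GenericInfrastructureMachine with a
        // freshly generated providerID, plus that providerID so the caller can create a
        // Node for the Machine to be matched against.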
  2328  func newInfraMachine(machine *clusterv1.Machine) (*unstructured.Unstructured, string) {
  2329  	providerID := fmt.Sprintf("test:////%v", uuid.NewUUID())
  2330  	return &unstructured.Unstructured{
  2331  		Object: map[string]interface{}{
  2332  			"apiVersion": "infrastructure.cluster.x-k8s.io/v1beta1",
  2333  			"kind":       "GenericInfrastructureMachine",
  2334  			"metadata": map[string]interface{}{
  2335  				"generateName": "test-mhc-machine-infra-",
  2336  				"namespace":    machine.Namespace,
  2337  			},
  2338  			"spec": map[string]interface{}{
  2339  				"providerID": providerID,
  2340  			},
  2341  		},
  2342  	}, providerID
  2343  }
  2344  
  2345  type machinesWithNodes struct {
  2346  	count                      int
  2347  	nodeStatus                 corev1.ConditionStatus
  2348  	createNodeRefForMachine    bool
  2349  	firstMachineAsControlPlane bool
  2350  	labels                     map[string]string
  2351  	failureReason              string
  2352  	failureMessage             string
  2353  }
  2354  
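        // machineWithNodesOption follows the functional options pattern: each helper
        // below returns a closure that mutates the machinesWithNodes configuration
        // consumed by createMachinesWithNodes.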
  2355  type machineWithNodesOption func(m *machinesWithNodes)
  2356  
  2357  func count(n int) machineWithNodesOption {
  2358  	return func(m *machinesWithNodes) {
  2359  		m.count = n
  2360  	}
  2361  }
  2362  
  2363  func firstMachineAsControlPlane() machineWithNodesOption {
  2364  	return func(m *machinesWithNodes) {
  2365  		m.firstMachineAsControlPlane = true
  2366  	}
  2367  }
  2368  
  2369  func nodeStatus(s corev1.ConditionStatus) machineWithNodesOption {
  2370  	return func(m *machinesWithNodes) {
  2371  		m.nodeStatus = s
  2372  	}
  2373  }
  2374  
  2375  func createNodeRefForMachine(b bool) machineWithNodesOption {
  2376  	return func(m *machinesWithNodes) {
  2377  		m.createNodeRefForMachine = b
  2378  	}
  2379  }
  2380  
  2381  func machineLabels(l map[string]string) machineWithNodesOption {
  2382  	return func(m *machinesWithNodes) {
  2383  		m.labels = l
  2384  	}
  2385  }
  2386  
  2387  func machineFailureReason(s string) machineWithNodesOption {
  2388  	return func(m *machinesWithNodes) {
  2389  		m.failureReason = s
  2390  	}
  2391  }
  2392  
  2393  func machineFailureMessage(s string) machineWithNodesOption {
  2394  	return func(m *machinesWithNodes) {
  2395  		m.failureMessage = s
  2396  	}
  2397  }
  2398  
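        // createMachinesWithNodes creates the requested number of running Machines, each
        // backed by a GenericInfrastructureMachine and, optionally, a Node whose Ready
        // condition is controlled by nodeStatus. It returns the created Nodes and
        // Machines together with a cleanup function. A typical invocation, mirroring the
        // tests above:
        //
        //	nodes, machines, cleanup := createMachinesWithNodes(g, cluster,
        //		count(1),
        //		firstMachineAsControlPlane(),
        //		createNodeRefForMachine(true),
        //		nodeStatus(corev1.ConditionTrue),
        //		machineLabels(mhc.Spec.Selector.MatchLabels),
        //	)
        //	defer cleanup()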
  2399  func createMachinesWithNodes(
  2400  	g *WithT,
  2401  	c *clusterv1.Cluster,
  2402  	opts ...machineWithNodesOption,
  2403  ) ([]*corev1.Node, []*clusterv1.Machine, func()) {
  2404  	o := &machinesWithNodes{}
  2405  	for _, op := range opts {
  2406  		op(o)
  2407  	}
  2408  
  2409  	var (
  2410  		nodes         []*corev1.Node
  2411  		machines      []*clusterv1.Machine
  2412  		infraMachines []*unstructured.Unstructured
  2413  	)
  2414  
  2415  	for i := 0; i < o.count; i++ {
  2416  		machine := newRunningMachine(c, o.labels)
  2417  		if i == 0 && o.firstMachineAsControlPlane {
  2418  			if machine.Labels == nil {
  2419  				machine.Labels = make(map[string]string)
  2420  			}
  2421  			machine.Labels[clusterv1.MachineControlPlaneLabel] = ""
  2422  		}
  2423  		infraMachine, providerID := newInfraMachine(machine)
  2424  		g.Expect(env.Create(ctx, infraMachine)).To(Succeed())
  2425  		infraMachines = append(infraMachines, infraMachine)
  2426  		fmt.Printf("inframachine created: %s\n", infraMachine.GetName())
  2427  		// Patch the status of the InfraMachine and mark it as ready.
  2428  		// NB. Status cannot be set during object creation so we need to patch
  2429  		// it separately.
  2430  		infraMachinePatch := client.MergeFrom(infraMachine.DeepCopy())
  2431  		g.Expect(unstructured.SetNestedField(infraMachine.Object, true, "status", "ready")).To(Succeed())
  2432  		g.Expect(env.Status().Patch(ctx, infraMachine, infraMachinePatch)).To(Succeed())
  2433  
  2434  		machine.Spec.InfrastructureRef = corev1.ObjectReference{
  2435  			APIVersion: infraMachine.GetAPIVersion(),
  2436  			Kind:       infraMachine.GetKind(),
  2437  			Name:       infraMachine.GetName(),
  2438  		}
  2439  		g.Expect(env.Create(ctx, machine)).To(Succeed())
  2440  		fmt.Printf("machine created: %s\n", machine.GetName())
  2441  
  2442  		// Before moving on we want to ensure that the machine has a valid
  2443  		// status. That is, LastUpdated should not be nil.
  2444  		g.Eventually(func() *metav1.Time {
  2445  			k := client.ObjectKey{
  2446  				Name:      machine.GetName(),
  2447  				Namespace: machine.GetNamespace(),
  2448  			}
  2449  			err := env.Get(ctx, k, machine)
  2450  			if err != nil {
  2451  				return nil
  2452  			}
  2453  			return machine.Status.LastUpdated
  2454  		}, timeout, 100*time.Millisecond).ShouldNot(BeNil())
  2455  
  2456  		machinePatchHelper, err := patch.NewHelper(machine, env.Client)
  2457  		g.Expect(err).ToNot(HaveOccurred())
  2458  
  2459  		if o.createNodeRefForMachine {
  2460  			// Create node
  2461  			node := &corev1.Node{
  2462  				ObjectMeta: metav1.ObjectMeta{
  2463  					GenerateName: "test-mhc-node-",
  2464  				},
  2465  				Spec: corev1.NodeSpec{
  2466  					ProviderID: providerID,
  2467  				},
  2468  			}
  2469  
  2470  			g.Expect(env.Create(ctx, node)).To(Succeed())
  2471  			fmt.Printf("node created: %s\n", node.GetName())
  2472  
  2473  			// Patch node status
  2474  			nodePatchHelper, err := patch.NewHelper(node, env.Client)
  2475  			g.Expect(err).ToNot(HaveOccurred())
  2476  
  2477  			node.Status.Conditions = []corev1.NodeCondition{
  2478  				{
  2479  					Type:               corev1.NodeReady,
  2480  					Status:             o.nodeStatus,
  2481  					LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)),
  2482  				},
  2483  			}
  2484  
  2485  			g.Expect(nodePatchHelper.Patch(ctx, node)).To(Succeed())
  2486  
  2487  			nodes = append(nodes, node)
  2488  
  2489  			machine.Status.NodeRef = &corev1.ObjectReference{
  2490  				Name: node.Name,
  2491  			}
  2492  		}
  2493  
  2494  		if o.failureReason != "" {
  2495  			failureReason := capierrors.MachineStatusError(o.failureReason)
  2496  			machine.Status.FailureReason = &failureReason
  2497  		}
  2498  		if o.failureMessage != "" {
  2499  			machine.Status.FailureMessage = ptr.To(o.failureMessage)
  2500  		}
  2501  
  2502  		// Add one second to guarantee the timestamp differs from the
  2503  		// original, so the patch is not dropped as a no-op even if
  2504  		// precision is lost during conversions.
  2505  		lastUp := metav1.NewTime(machine.Status.LastUpdated.Add(time.Second))
  2506  		machine.Status.LastUpdated = &lastUp
  2507  
  2508  		// Patch the machine to record the status changes
  2509  		g.Expect(machinePatchHelper.Patch(ctx, machine)).To(Succeed())
  2510  
  2511  		machines = append(machines, machine)
  2512  	}
  2513  
  2514  	cleanup := func() {
  2515  		fmt.Println("Cleaning up nodes, machines and infra machines.")
  2516  		for _, n := range nodes {
  2517  			if err := env.Delete(ctx, n); !apierrors.IsNotFound(err) {
  2518  				g.Expect(err).ToNot(HaveOccurred())
  2519  			}
  2520  		}
  2521  		for _, m := range machines {
  2522  			g.Expect(env.Delete(ctx, m)).To(Succeed())
  2523  		}
  2524  		for _, im := range infraMachines {
  2525  			if err := env.Delete(ctx, im); !apierrors.IsNotFound(err) {
  2526  				g.Expect(err).ToNot(HaveOccurred())
  2527  			}
  2528  		}
  2529  	}
  2530  
  2531  	return nodes, machines, cleanup
  2532  }
  2533  
  2534  func newMachineHealthCheckWithLabels(name, namespace, cluster string, labels map[string]string) *clusterv1.MachineHealthCheck {
  2535  	l := make(map[string]string, len(labels))
  2536  	for k, v := range labels {
  2537  		l[k] = v
  2538  	}
  2539  	l[clusterv1.ClusterNameLabel] = cluster
  2540  
  2541  	mhc := newMachineHealthCheck(namespace, cluster)
  2542  	mhc.SetName(name)
  2543  	mhc.Labels = l
  2544  	mhc.Spec.Selector.MatchLabels = l
  2545  
  2546  	return mhc
  2547  }
  2548  
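        // newMachineHealthCheck returns a MachineHealthCheck with a unique, randomly
        // generated selector (so concurrent tests never match each other's Machines),
        // maxUnhealthy of 100%, a 1ms node startup timeout, and a single unhealthy
        // condition: NodeReady Unknown for more than 5 minutes.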
  2549  func newMachineHealthCheck(namespace, clusterName string) *clusterv1.MachineHealthCheck {
  2550  	maxUnhealthy := intstr.FromString("100%")
  2551  	return &clusterv1.MachineHealthCheck{
  2552  		ObjectMeta: metav1.ObjectMeta{
  2553  			GenerateName: "test-mhc-",
  2554  			Namespace:    namespace,
  2555  		},
  2556  		Spec: clusterv1.MachineHealthCheckSpec{
  2557  			ClusterName: clusterName,
  2558  			Selector: metav1.LabelSelector{
  2559  				MatchLabels: map[string]string{
  2560  					"selector": string(uuid.NewUUID()),
  2561  				},
  2562  			},
  2563  			MaxUnhealthy:       &maxUnhealthy,
  2564  			NodeStartupTimeout: &metav1.Duration{Duration: 1 * time.Millisecond},
  2565  			UnhealthyConditions: []clusterv1.UnhealthyCondition{
  2566  				{
  2567  					Type:    corev1.NodeReady,
  2568  					Status:  corev1.ConditionUnknown,
  2569  					Timeout: metav1.Duration{Duration: 5 * time.Minute},
  2570  				},
  2571  			},
  2572  		},
  2573  	}
  2574  }
  2575  
  2576  func TestPatchTargets(t *testing.T) {
  2577  	g := NewWithT(t)
  2578  
  2579  	namespace := metav1.NamespaceDefault
  2580  	clusterName := testClusterName
  2581  	defaultCluster := &clusterv1.Cluster{
  2582  		ObjectMeta: metav1.ObjectMeta{
  2583  			Name:      clusterName,
  2584  			Namespace: namespace,
  2585  		},
  2586  	}
  2587  	labels := map[string]string{"cluster": "foo", "nodepool": "bar"}
  2588  
  2589  	mhc := newMachineHealthCheckWithLabels("mhc", namespace, clusterName, labels)
  2590  	machine1 := newTestMachine("machine1", namespace, clusterName, "nodeName", labels)
  2591  	machine1.ResourceVersion = "999"
  2592  	conditions.MarkTrue(machine1, clusterv1.MachineHealthCheckSucceededCondition)
  2593  	machine2 := machine1.DeepCopy()
  2594  	machine2.Name = "machine2"
  2595  
  2596  	cl := fake.NewClientBuilder().WithObjects(
  2597  		machine1,
  2598  		machine2,
  2599  		mhc,
  2600  	).WithStatusSubresource(&clusterv1.MachineHealthCheck{}, &clusterv1.Machine{}).Build()
  2601  	r := &Reconciler{
  2602  		Client:   cl,
  2603  		recorder: record.NewFakeRecorder(32),
  2604  		Tracker:  remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), cl, cl, scheme.Scheme, client.ObjectKey{Name: clusterName, Namespace: namespace}, "machinehealthcheck-watchClusterNodes"),
  2605  	}
  2606  
  2607  	// To make the patch fail, create patchHelper with a different client.
  2608  	fakeMachine := machine1.DeepCopy()
  2609  	fakeMachine.Name = "fake"
  2610  	patchHelper, err := patch.NewHelper(fakeMachine, fake.NewClientBuilder().WithObjects(fakeMachine).Build())
  2611  	g.Expect(err).ToNot(HaveOccurred())
  2612  	// healthCheckTarget with the fake patchHelper; patching should fail for this target.
  2613  	target1 := healthCheckTarget{
  2614  		MHC:         mhc,
  2615  		Machine:     machine1,
  2616  		patchHelper: patchHelper,
  2617  		Node:        &corev1.Node{},
  2618  	}
  2619  
  2620  	// healthCheckTarget with correct patchHelper.
  2621  	patchHelper2, err := patch.NewHelper(machine2, cl)
  2622  	g.Expect(err).ToNot(HaveOccurred())
  2623  	target3 := healthCheckTarget{
  2624  		MHC:         mhc,
  2625  		Machine:     machine2,
  2626  		patchHelper: patchHelper2,
  2627  		Node:        &corev1.Node{},
  2628  	}
  2629  
  2630  	// The target with the wrong patch helper fails, but the other one is patched and marked for remediation.
  2631  	g.Expect(r.patchUnhealthyTargets(context.TODO(), logr.New(log.NullLogSink{}), []healthCheckTarget{target1, target3}, defaultCluster, mhc)).ToNot(BeEmpty())
  2632  	g.Expect(cl.Get(ctx, client.ObjectKey{Name: machine2.Name, Namespace: machine2.Namespace}, machine2)).ToNot(HaveOccurred())
  2633  	g.Expect(conditions.Get(machine2, clusterv1.MachineOwnerRemediatedCondition).Status).To(Equal(corev1.ConditionFalse))
  2634  
  2635  	// Likewise for patchHealthyTargets: the target with the wrong patch helper fails while the other is patched.
  2636  	g.Expect(r.patchHealthyTargets(context.TODO(), logr.New(log.NullLogSink{}), []healthCheckTarget{target1, target3}, mhc)).ToNot(BeEmpty())
  2637  }