sigs.k8s.io/cluster-api@v1.7.1/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package machinehealthcheck
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sort"
    23  	"strconv"
    24  	"strings"
    25  	"time"
    26  
    27  	"github.com/go-logr/logr"
    28  	"github.com/pkg/errors"
    29  	corev1 "k8s.io/api/core/v1"
    30  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    31  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    32  	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    33  	"k8s.io/apimachinery/pkg/types"
    34  	kerrors "k8s.io/apimachinery/pkg/util/errors"
    35  	"k8s.io/apimachinery/pkg/util/intstr"
    36  	"k8s.io/client-go/tools/record"
    37  	"k8s.io/klog/v2"
    38  	ctrl "sigs.k8s.io/controller-runtime"
    39  	"sigs.k8s.io/controller-runtime/pkg/builder"
    40  	"sigs.k8s.io/controller-runtime/pkg/client"
    41  	"sigs.k8s.io/controller-runtime/pkg/controller"
    42  	"sigs.k8s.io/controller-runtime/pkg/handler"
    43  	"sigs.k8s.io/controller-runtime/pkg/reconcile"
    44  
    45  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    46  	"sigs.k8s.io/cluster-api/api/v1beta1/index"
    47  	"sigs.k8s.io/cluster-api/controllers/external"
    48  	"sigs.k8s.io/cluster-api/controllers/remote"
    49  	"sigs.k8s.io/cluster-api/internal/controllers/machine"
    50  	"sigs.k8s.io/cluster-api/util"
    51  	"sigs.k8s.io/cluster-api/util/annotations"
    52  	"sigs.k8s.io/cluster-api/util/conditions"
    53  	"sigs.k8s.io/cluster-api/util/patch"
    54  	"sigs.k8s.io/cluster-api/util/predicates"
    55  )
    56  
    57  const (
    58  	// Event types.
    59  
     60  	// EventRemediationRestricted is emitted when machine remediation
     61  	// is restricted by the remediation short-circuiting logic.
    62  	EventRemediationRestricted string = "RemediationRestricted"
    63  
    64  	maxUnhealthyKeyLog     = "max unhealthy"
    65  	unhealthyTargetsKeyLog = "unhealthy targets"
    66  	unhealthyRangeKeyLog   = "unhealthy range"
    67  	totalTargetKeyLog      = "total target"
    68  )
    69  
    70  // +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch
    71  // +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch
    72  // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch;delete
    73  // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinehealthchecks;machinehealthchecks/status;machinehealthchecks/finalizers,verbs=get;list;watch;update;patch
    74  
    75  // Reconciler reconciles a MachineHealthCheck object.
    76  type Reconciler struct {
    77  	Client  client.Client
    78  	Tracker *remote.ClusterCacheTracker
    79  
    80  	// WatchFilterValue is the label value used to filter events prior to reconciliation.
    81  	WatchFilterValue string
    82  
    83  	controller controller.Controller
    84  	recorder   record.EventRecorder
    85  }
    86  
    87  func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
    88  	c, err := ctrl.NewControllerManagedBy(mgr).
    89  		For(&clusterv1.MachineHealthCheck{}).
    90  		Watches(
    91  			&clusterv1.Machine{},
    92  			handler.EnqueueRequestsFromMapFunc(r.machineToMachineHealthCheck),
    93  		).
    94  		WithOptions(options).
    95  		WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue)).
    96  		Watches(
    97  			&clusterv1.Cluster{},
    98  			handler.EnqueueRequestsFromMapFunc(r.clusterToMachineHealthCheck),
    99  			builder.WithPredicates(
   100  				// TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources?
   101  				predicates.All(ctrl.LoggerFrom(ctx),
   102  					predicates.ClusterUnpaused(ctrl.LoggerFrom(ctx)),
   103  					predicates.ResourceHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue),
   104  				),
   105  			),
   106  		).Build(r)
   107  	if err != nil {
   108  		return errors.Wrap(err, "failed setting up with a controller manager")
   109  	}
   110  
   111  	r.controller = c
   112  	r.recorder = mgr.GetEventRecorderFor("machinehealthcheck-controller")
   113  	return nil
   114  }
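
// The wiring above watches MachineHealthCheck objects directly, re-enqueues the
// MachineHealthChecks whose selector matches a changed Machine, and re-enqueues a Cluster's
// MachineHealthChecks on unpaused Cluster events. A minimal sketch of how a caller might
// register this reconciler (hypothetical, simplified entrypoint; the real manager setup also
// configures indexes, the ClusterCacheTracker, webhooks, etc.):
//
//	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
//	if err != nil {
//		// handle error
//	}
//	r := &Reconciler{
//		Client:  mgr.GetClient(),
//		Tracker: tracker, // a *remote.ClusterCacheTracker built elsewhere
//	}
//	if err := r.SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: 10}); err != nil {
//		// handle error
//	}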
   115  
   116  func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
   117  	log := ctrl.LoggerFrom(ctx)
   118  
   119  	// Fetch the MachineHealthCheck instance
   120  	m := &clusterv1.MachineHealthCheck{}
   121  	if err := r.Client.Get(ctx, req.NamespacedName, m); err != nil {
   122  		if apierrors.IsNotFound(err) {
   123  			// Object not found, return.  Created objects are automatically garbage collected.
   124  			// For additional cleanup logic use finalizers.
   125  			return ctrl.Result{}, nil
   126  		}
   127  
   128  		// Error reading the object - requeue the request.
   129  		log.Error(err, "Failed to fetch MachineHealthCheck")
   130  		return ctrl.Result{}, err
   131  	}
   132  
   133  	log = log.WithValues("Cluster", klog.KRef(m.Namespace, m.Spec.ClusterName))
   134  	ctx = ctrl.LoggerInto(ctx, log)
   135  
   136  	cluster, err := util.GetClusterByName(ctx, r.Client, m.Namespace, m.Spec.ClusterName)
   137  	if err != nil {
   138  		log.Error(err, "Failed to fetch Cluster for MachineHealthCheck")
   139  		return ctrl.Result{}, err
   140  	}
   141  
   142  	// Return early if the object or Cluster is paused.
   143  	if annotations.IsPaused(cluster, m) {
   144  		log.Info("Reconciliation is paused for this object")
   145  		return ctrl.Result{}, nil
   146  	}
   147  
   148  	// Initialize the patch helper
   149  	patchHelper, err := patch.NewHelper(m, r.Client)
   150  	if err != nil {
   151  		return ctrl.Result{}, err
   152  	}
   153  
   154  	defer func() {
   155  		// Always attempt to patch the object and status after each reconciliation.
   156  		// Patch ObservedGeneration only if the reconciliation completed successfully
   157  		patchOpts := []patch.Option{}
   158  		if reterr == nil {
   159  			patchOpts = append(patchOpts, patch.WithStatusObservedGeneration{})
   160  		}
   161  		if err := patchHelper.Patch(ctx, m, patchOpts...); err != nil {
   162  			reterr = kerrors.NewAggregate([]error{reterr, err})
   163  		}
   164  	}()
   165  
   166  	// Reconcile labels.
   167  	if m.Labels == nil {
   168  		m.Labels = make(map[string]string)
   169  	}
   170  	m.Labels[clusterv1.ClusterNameLabel] = m.Spec.ClusterName
   171  
   172  	result, err := r.reconcile(ctx, log, cluster, m)
   173  	if err != nil {
    174  		// Requeue if the reconcile failed because the ClusterCacheTracker was locked for
    175  		// the current cluster due to concurrent access.
   176  		if errors.Is(err, remote.ErrClusterLocked) {
   177  			log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker")
   178  			return ctrl.Result{RequeueAfter: time.Minute}, nil
   179  		}
   180  		log.Error(err, "Failed to reconcile MachineHealthCheck")
   181  		r.recorder.Eventf(m, corev1.EventTypeWarning, "ReconcileError", "%v", err)
   182  
   183  		// Requeue immediately if any errors occurred
   184  		return ctrl.Result{}, err
   185  	}
   186  
   187  	return result, nil
   188  }
   189  
   190  func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) (ctrl.Result, error) {
   191  	// Ensure the MachineHealthCheck is owned by the Cluster it belongs to
   192  	m.SetOwnerReferences(util.EnsureOwnerRef(m.GetOwnerReferences(), metav1.OwnerReference{
   193  		APIVersion: clusterv1.GroupVersion.String(),
   194  		Kind:       "Cluster",
   195  		Name:       cluster.Name,
   196  		UID:        cluster.UID,
   197  	}))
   198  
   199  	// If the cluster is already initialized, get the remote cluster cache to use as a client.Reader.
   200  	var remoteClient client.Client
   201  	if conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) {
   202  		var err error
   203  		remoteClient, err = r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
   204  		if err != nil {
   205  			logger.Error(err, "error creating remote cluster cache")
   206  			return ctrl.Result{}, err
   207  		}
   208  
   209  		if err := r.watchClusterNodes(ctx, cluster); err != nil {
   210  			return ctrl.Result{}, err
   211  		}
   212  	}
   213  
   214  	// fetch all targets
   215  	logger.V(3).Info("Finding targets")
   216  	targets, err := r.getTargetsFromMHC(ctx, logger, remoteClient, cluster, m)
   217  	if err != nil {
   218  		logger.Error(err, "Failed to fetch targets from MachineHealthCheck")
   219  		return ctrl.Result{}, err
   220  	}
   221  	totalTargets := len(targets)
   222  	m.Status.ExpectedMachines = int32(totalTargets)
   223  	m.Status.Targets = make([]string, totalTargets)
   224  	for i, t := range targets {
   225  		m.Status.Targets[i] = t.Machine.Name
   226  	}
    227  	// Sort the targets so m.Status does not keep changing, as the machines are not returned in a deterministic order.
   228  	sort.Strings(m.Status.Targets)
   229  
   230  	nodeStartupTimeout := m.Spec.NodeStartupTimeout
   231  	if nodeStartupTimeout == nil {
   232  		nodeStartupTimeout = &clusterv1.DefaultNodeStartupTimeout
   233  	}
   234  
   235  	// health check all targets and reconcile mhc status
   236  	healthy, unhealthy, nextCheckTimes := r.healthCheckTargets(targets, logger, *nodeStartupTimeout)
   237  	m.Status.CurrentHealthy = int32(len(healthy))
   238  
   239  	// check MHC current health against MaxUnhealthy
   240  	remediationAllowed, remediationCount, err := isAllowedRemediation(m)
   241  	if err != nil {
    242  		return ctrl.Result{}, errors.Wrap(err, "error checking if remediation is allowed")
   243  	}
   244  
   245  	if !remediationAllowed {
   246  		var message string
   247  
   248  		if m.Spec.UnhealthyRange == nil {
   249  			logger.V(3).Info(
   250  				"Short-circuiting remediation",
   251  				totalTargetKeyLog, totalTargets,
   252  				maxUnhealthyKeyLog, m.Spec.MaxUnhealthy,
   253  				unhealthyTargetsKeyLog, len(unhealthy),
   254  			)
   255  			message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: %v, unhealthy: %v, maxUnhealthy: %v)",
   256  				totalTargets,
   257  				len(unhealthy),
   258  				m.Spec.MaxUnhealthy)
   259  		} else {
   260  			logger.V(3).Info(
   261  				"Short-circuiting remediation",
   262  				totalTargetKeyLog, totalTargets,
   263  				unhealthyRangeKeyLog, *m.Spec.UnhealthyRange,
   264  				unhealthyTargetsKeyLog, len(unhealthy),
   265  			)
   266  			message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: %v, unhealthy: %v, unhealthyRange: %v)",
   267  				totalTargets,
   268  				len(unhealthy),
   269  				*m.Spec.UnhealthyRange)
   270  		}
   271  
    272  		// Remediation is not allowed: the number of not-started or unhealthy machines either exceeds maxUnhealthy or falls outside unhealthyRange.
   273  		m.Status.RemediationsAllowed = 0
   274  		conditions.Set(m, &clusterv1.Condition{
   275  			Type:     clusterv1.RemediationAllowedCondition,
   276  			Status:   corev1.ConditionFalse,
   277  			Severity: clusterv1.ConditionSeverityWarning,
   278  			Reason:   clusterv1.TooManyUnhealthyReason,
   279  			Message:  message,
   280  		})
   281  
   282  		r.recorder.Event(
   283  			m,
   284  			corev1.EventTypeWarning,
   285  			EventRemediationRestricted,
   286  			message,
   287  		)
   288  		errList := []error{}
   289  		for _, t := range append(healthy, unhealthy...) {
   290  			if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
   291  				errList = append(errList, errors.Wrapf(err, "failed to patch machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
   292  				continue
   293  			}
   294  		}
   295  		if len(errList) > 0 {
   296  			return ctrl.Result{}, kerrors.NewAggregate(errList)
   297  		}
   298  		return reconcile.Result{Requeue: true}, nil
   299  	}
   300  
   301  	if m.Spec.UnhealthyRange == nil {
   302  		logger.V(3).Info(
   303  			"Remediations are allowed",
   304  			totalTargetKeyLog, totalTargets,
   305  			maxUnhealthyKeyLog, m.Spec.MaxUnhealthy,
   306  			unhealthyTargetsKeyLog, len(unhealthy),
   307  		)
   308  	} else {
   309  		logger.V(3).Info(
   310  			"Remediations are allowed",
   311  			totalTargetKeyLog, totalTargets,
   312  			unhealthyRangeKeyLog, *m.Spec.UnhealthyRange,
   313  			unhealthyTargetsKeyLog, len(unhealthy),
   314  		)
   315  	}
   316  
    317  	// Remediation is allowed: unhealthyMachineCount is within unhealthyRange, or maxUnhealthy - unhealthyMachineCount >= 0.
   318  	m.Status.RemediationsAllowed = remediationCount
   319  	conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition)
   320  
   321  	errList := r.patchUnhealthyTargets(ctx, logger, unhealthy, cluster, m)
   322  	errList = append(errList, r.patchHealthyTargets(ctx, logger, healthy, m)...)
   323  
   324  	// handle update errors
   325  	if len(errList) > 0 {
   326  		logger.V(3).Info("Error(s) marking machine, requeuing")
   327  		return reconcile.Result{}, kerrors.NewAggregate(errList)
   328  	}
   329  
   330  	if minNextCheck := minDuration(nextCheckTimes); minNextCheck > 0 {
   331  		logger.V(3).Info("Some targets might go unhealthy. Ensuring a requeue happens", "requeueIn", minNextCheck.Truncate(time.Second).String())
   332  		return ctrl.Result{RequeueAfter: minNextCheck}, nil
   333  	}
   334  
   335  	logger.V(3).Info("No more targets meet unhealthy criteria")
   336  
   337  	return ctrl.Result{}, nil
   338  }
   339  
   340  // patchHealthyTargets patches healthy machines with MachineHealthCheckSucceededCondition.
   341  func (r *Reconciler) patchHealthyTargets(ctx context.Context, logger logr.Logger, healthy []healthCheckTarget, m *clusterv1.MachineHealthCheck) []error {
   342  	errList := []error{}
   343  	for _, t := range healthy {
   344  		if m.Spec.RemediationTemplate != nil {
   345  			// Get remediation request object
   346  			obj, err := r.getExternalRemediationRequest(ctx, m, t.Machine.Name)
   347  			if err != nil {
   348  				if !apierrors.IsNotFound(errors.Cause(err)) {
   349  					wrappedErr := errors.Wrapf(err, "failed to fetch remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.Spec.ClusterName)
   350  					errList = append(errList, wrappedErr)
   351  				}
   352  				continue
   353  			}
    354  			// Check that obj has no DeletionTimestamp to avoid a hot loop.
   355  			if obj.GetDeletionTimestamp() == nil {
   356  				// Issue a delete for remediation request.
   357  				if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) {
   358  					errList = append(errList, errors.Wrapf(err, "failed to delete %v %q for Machine %q", obj.GroupVersionKind(), obj.GetName(), t.Machine.Name))
   359  					continue
   360  				}
   361  			}
   362  		}
   363  
   364  		if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
   365  			logger.Error(err, "failed to patch healthy machine status for machine", "machine", t.Machine.GetName())
   366  			errList = append(errList, errors.Wrapf(err, "failed to patch healthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
   367  		}
   368  	}
   369  	return errList
   370  }
   371  
   372  // patchUnhealthyTargets patches machines with MachineOwnerRemediatedCondition for remediation.
   373  func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logger, unhealthy []healthCheckTarget, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) []error {
   374  	// mark for remediation
   375  	errList := []error{}
   376  	for _, t := range unhealthy {
   377  		condition := conditions.Get(t.Machine, clusterv1.MachineHealthCheckSucceededCondition)
   378  
   379  		if annotations.IsPaused(cluster, t.Machine) {
   380  			logger.Info("Machine has failed health check, but machine is paused so skipping remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
   381  		} else {
   382  			if m.Spec.RemediationTemplate != nil {
    383  				// If an external remediation request already exists,
    384  				// return early.
   385  				if r.externalRemediationRequestExists(ctx, m, t.Machine.Name) {
   386  					return errList
   387  				}
   388  
   389  				cloneOwnerRef := &metav1.OwnerReference{
   390  					APIVersion: clusterv1.GroupVersion.String(),
   391  					Kind:       "Machine",
   392  					Name:       t.Machine.Name,
   393  					UID:        t.Machine.UID,
   394  				}
   395  
   396  				from, err := external.Get(ctx, r.Client, m.Spec.RemediationTemplate, t.Machine.Namespace)
   397  				if err != nil {
   398  					conditions.MarkFalse(m, clusterv1.ExternalRemediationTemplateAvailableCondition, clusterv1.ExternalRemediationTemplateNotFoundReason, clusterv1.ConditionSeverityError, err.Error())
   399  					errList = append(errList, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName))
   400  					return errList
   401  				}
   402  
   403  				generateTemplateInput := &external.GenerateTemplateInput{
   404  					Template:    from,
   405  					TemplateRef: m.Spec.RemediationTemplate,
   406  					Namespace:   t.Machine.Namespace,
   407  					ClusterName: t.Machine.Spec.ClusterName,
   408  					OwnerRef:    cloneOwnerRef,
   409  				}
   410  				to, err := external.GenerateTemplate(generateTemplateInput)
   411  				if err != nil {
   412  					errList = append(errList, errors.Wrapf(err, "failed to create template for remediation request %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName))
   413  					return errList
   414  				}
   415  
    416  				// Set the remediation request name to match the Machine name; the name is used to
   417  				// guarantee uniqueness between runs. A Machine should only ever have a single
   418  				// remediation object of a specific GVK created.
   419  				//
   420  				// NOTE: This doesn't guarantee uniqueness across different MHC objects watching
   421  				// the same Machine, users are in charge of setting health checks and remediation properly.
   422  				to.SetName(t.Machine.Name)
   423  
   424  				logger.Info("Target has failed health check, creating an external remediation request", "remediation request name", to.GetName(), "target", t.string(), "reason", condition.Reason, "message", condition.Message)
   425  				// Create the external clone.
   426  				if err := r.Client.Create(ctx, to); err != nil {
   427  					conditions.MarkFalse(m, clusterv1.ExternalRemediationRequestAvailableCondition, clusterv1.ExternalRemediationRequestCreationFailedReason, clusterv1.ConditionSeverityError, err.Error())
   428  					errList = append(errList, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.Spec.ClusterName))
   429  					return errList
   430  				}
   431  			} else {
   432  				logger.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
    433  				// NOTE: MHC is responsible for creating MachineOwnerRemediatedCondition if it is missing, or for triggering another remediation if the previous one has completed;
    434  				// if a remediation is already in progress, the remediation owner is instead responsible for completing the process and MHC should not overwrite the condition.
   435  				if !conditions.Has(t.Machine, clusterv1.MachineOwnerRemediatedCondition) || conditions.IsTrue(t.Machine, clusterv1.MachineOwnerRemediatedCondition) {
   436  					conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
   437  				}
   438  			}
   439  		}
   440  
   441  		if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
   442  			errList = append(errList, errors.Wrapf(err, "failed to patch unhealthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
   443  			continue
   444  		}
   445  		r.recorder.Eventf(
   446  			t.Machine,
   447  			corev1.EventTypeNormal,
   448  			EventMachineMarkedUnhealthy,
   449  			"Machine %v has been marked as unhealthy",
   450  			t.string(),
   451  		)
   452  	}
   453  	return errList
   454  }
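
// External remediation is opted into via Spec.RemediationTemplate. A hedged sketch of what
// that configuration might look like (the group/version, kind, and names below are hypothetical):
//
//	mhc.Spec.RemediationTemplate = &corev1.ObjectReference{
//		APIVersion: "remediation.example.com/v1alpha1",
//		Kind:       "ExampleRemediationTemplate", // by convention the kind ends in "Template"
//		Name:       "example-remediation-template",
//	}
//
// For each unhealthy Machine, patchUnhealthyTargets then clones that template into an
// "ExampleRemediation" object named after the Machine and owned by it, instead of marking
// MachineOwnerRemediatedCondition on the Machine.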
   455  
   456  // clusterToMachineHealthCheck maps events from Cluster objects to
   457  // MachineHealthCheck objects that belong to the Cluster.
   458  func (r *Reconciler) clusterToMachineHealthCheck(ctx context.Context, o client.Object) []reconcile.Request {
   459  	c, ok := o.(*clusterv1.Cluster)
   460  	if !ok {
   461  		panic(fmt.Sprintf("Expected a Cluster, got %T", o))
   462  	}
   463  
   464  	mhcList := &clusterv1.MachineHealthCheckList{}
   465  	if err := r.Client.List(
   466  		ctx,
   467  		mhcList,
   468  		client.InNamespace(c.Namespace),
   469  		client.MatchingLabels{clusterv1.ClusterNameLabel: c.Name},
   470  	); err != nil {
   471  		return nil
   472  	}
   473  
   474  	// This list should only contain MachineHealthChecks which belong to the given Cluster
   475  	requests := []reconcile.Request{}
   476  	for _, mhc := range mhcList.Items {
   477  		key := types.NamespacedName{Namespace: mhc.Namespace, Name: mhc.Name}
   478  		requests = append(requests, reconcile.Request{NamespacedName: key})
   479  	}
   480  	return requests
   481  }
   482  
   483  // machineToMachineHealthCheck maps events from Machine objects to
   484  // MachineHealthCheck objects that monitor the given machine.
   485  func (r *Reconciler) machineToMachineHealthCheck(ctx context.Context, o client.Object) []reconcile.Request {
   486  	m, ok := o.(*clusterv1.Machine)
   487  	if !ok {
   488  		panic(fmt.Sprintf("Expected a Machine, got %T", o))
   489  	}
   490  
   491  	mhcList := &clusterv1.MachineHealthCheckList{}
   492  	if err := r.Client.List(
   493  		ctx,
   494  		mhcList,
   495  		client.InNamespace(m.Namespace),
   496  		client.MatchingLabels{clusterv1.ClusterNameLabel: m.Spec.ClusterName},
   497  	); err != nil {
   498  		return nil
   499  	}
   500  
   501  	var requests []reconcile.Request
   502  	for k := range mhcList.Items {
   503  		mhc := &mhcList.Items[k]
   504  		if machine.HasMatchingLabels(mhc.Spec.Selector, m.Labels) {
   505  			key := util.ObjectKey(mhc)
   506  			requests = append(requests, reconcile.Request{NamespacedName: key})
   507  		}
   508  	}
   509  	return requests
   510  }
   511  
   512  func (r *Reconciler) nodeToMachineHealthCheck(ctx context.Context, o client.Object) []reconcile.Request {
   513  	node, ok := o.(*corev1.Node)
   514  	if !ok {
   515  		panic(fmt.Sprintf("Expected a corev1.Node, got %T", o))
   516  	}
   517  
   518  	machine, err := getMachineFromNode(ctx, r.Client, node.Name)
   519  	if machine == nil || err != nil {
   520  		return nil
   521  	}
   522  
   523  	return r.machineToMachineHealthCheck(ctx, machine)
   524  }
   525  
   526  func (r *Reconciler) watchClusterNodes(ctx context.Context, cluster *clusterv1.Cluster) error {
   527  	// If there is no tracker, don't watch remote nodes
   528  	if r.Tracker == nil {
   529  		return nil
   530  	}
   531  
   532  	return r.Tracker.Watch(ctx, remote.WatchInput{
   533  		Name:         "machinehealthcheck-watchClusterNodes",
   534  		Cluster:      util.ObjectKey(cluster),
   535  		Watcher:      r.controller,
   536  		Kind:         &corev1.Node{},
   537  		EventHandler: handler.EnqueueRequestsFromMapFunc(r.nodeToMachineHealthCheck),
   538  	})
   539  }
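
// With this watch in place, a Node event in the workload cluster is mapped back to the
// management cluster: nodeToMachineHealthCheck looks up the backing Machine via the
// MachineNodeNameField index (getMachineFromNode) and then enqueues every MachineHealthCheck
// whose selector matches that Machine (machineToMachineHealthCheck).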
   540  
    541  // getMachineFromNode retrieves the Machine whose nodeRef points to nodeName.
    542  // There should be at most one Machine with a given nodeRef; an error is returned otherwise.
   543  func getMachineFromNode(ctx context.Context, c client.Client, nodeName string) (*clusterv1.Machine, error) {
   544  	machineList := &clusterv1.MachineList{}
   545  	if err := c.List(
   546  		ctx,
   547  		machineList,
   548  		client.MatchingFields{index.MachineNodeNameField: nodeName},
   549  	); err != nil {
   550  		return nil, errors.Wrap(err, "failed getting machine list")
   551  	}
   552  	// TODO(vincepri): Remove this loop once controller runtime fake client supports
   553  	// adding indexes on objects.
   554  	items := []*clusterv1.Machine{}
   555  	for i := range machineList.Items {
   556  		machine := &machineList.Items[i]
   557  		if machine.Status.NodeRef != nil && machine.Status.NodeRef.Name == nodeName {
   558  			items = append(items, machine)
   559  		}
   560  	}
   561  	if len(items) != 1 {
   562  		return nil, errors.Errorf("expecting one machine for node %v, got %v", nodeName, machineNames(items))
   563  	}
   564  	return items[0], nil
   565  }
   566  
   567  func machineNames(machines []*clusterv1.Machine) []string {
   568  	result := make([]string, 0, len(machines))
   569  	for _, m := range machines {
   570  		result = append(result, m.Name)
   571  	}
   572  	return result
   573  }
   574  
    575  // isAllowedRemediation checks the UnhealthyRange field if set, otherwise the MaxUnhealthy field,
    576  // and returns whether remediation is allowed, the remediation count, and an error if any.
   577  func isAllowedRemediation(mhc *clusterv1.MachineHealthCheck) (bool, int32, error) {
   578  	var remediationAllowed bool
   579  	var remediationCount int32
   580  	if mhc.Spec.UnhealthyRange != nil {
   581  		min, max, err := getUnhealthyRange(mhc)
   582  		if err != nil {
   583  			return false, 0, err
   584  		}
   585  		unhealthyMachineCount := unhealthyMachineCount(mhc)
   586  		remediationAllowed = unhealthyMachineCount >= min && unhealthyMachineCount <= max
   587  		remediationCount = int32(max - unhealthyMachineCount)
   588  		return remediationAllowed, remediationCount, nil
   589  	}
   590  
   591  	maxUnhealthy, err := getMaxUnhealthy(mhc)
   592  	if err != nil {
   593  		return false, 0, err
   594  	}
   595  
    596  	// Remediation is not allowed if the unhealthy machine count is above maxUnhealthy.
   597  	unhealthyMachineCount := unhealthyMachineCount(mhc)
   598  	remediationAllowed = unhealthyMachineCount <= maxUnhealthy
   599  	remediationCount = int32(maxUnhealthy - unhealthyMachineCount)
   600  	return remediationAllowed, remediationCount, nil
   601  }
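
// A worked example of the maxUnhealthy path, using illustrative status values:
//
//	maxUnhealthy := intstr.FromString("40%")
//	mhc := &clusterv1.MachineHealthCheck{
//		Spec:   clusterv1.MachineHealthCheckSpec{MaxUnhealthy: &maxUnhealthy},
//		Status: clusterv1.MachineHealthCheckStatus{ExpectedMachines: 5, CurrentHealthy: 3},
//	}
//	allowed, count, _ := isAllowedRemediation(mhc)
//	// 40% of 5 machines rounds down to maxUnhealthy=2, and unhealthyMachineCount is 5-3=2,
//	// so allowed=true and count=0: remediation may proceed, but one more unhealthy machine
//	// would trip the short-circuit.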
   602  
    603  // getUnhealthyRange parses an integer range and returns the min and max values.
    604  // E.g. [2-5] will return (2,5,nil).
   605  func getUnhealthyRange(mhc *clusterv1.MachineHealthCheck) (int, int, error) {
   606  	// remove '[' and ']'
   607  	unhealthyRange := (*(mhc.Spec.UnhealthyRange))[1 : len(*mhc.Spec.UnhealthyRange)-1]
   608  
   609  	parts := strings.Split(unhealthyRange, "-")
   610  
   611  	min, err := strconv.ParseUint(parts[0], 10, 32)
   612  	if err != nil {
   613  		return 0, 0, err
   614  	}
   615  
   616  	max, err := strconv.ParseUint(parts[1], 10, 32)
   617  	if err != nil {
   618  		return 0, 0, err
   619  	}
   620  
   621  	if max < min {
   622  		return 0, 0, errors.Errorf("max value %d cannot be less than min value %d for unhealthyRange", max, min)
   623  	}
   624  
   625  	return int(min), int(max), nil
   626  }
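
// When Spec.UnhealthyRange is set it takes precedence over MaxUnhealthy. For example, with
// UnhealthyRange "[3-5]", ExpectedMachines=5, and CurrentHealthy=1 (illustrative values):
// getUnhealthyRange returns (3, 5, nil), unhealthyMachineCount is 4, so remediation is
// allowed (3 <= 4 <= 5) and RemediationsAllowed is reported as 5-4 = 1.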
   627  
   628  func getMaxUnhealthy(mhc *clusterv1.MachineHealthCheck) (int, error) {
   629  	if mhc.Spec.MaxUnhealthy == nil {
   630  		return 0, errors.New("spec.maxUnhealthy must be set")
   631  	}
   632  	maxUnhealthy, err := intstr.GetScaledValueFromIntOrPercent(mhc.Spec.MaxUnhealthy, int(mhc.Status.ExpectedMachines), false)
   633  	if err != nil {
   634  		return 0, err
   635  	}
   636  	return maxUnhealthy, nil
   637  }
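
// MaxUnhealthy accepts either an absolute number or a percentage string, and percentages are
// rounded down (the scaling call passes roundUp=false). For example (illustrative values):
//
//	intstr.FromInt(1)        // with 4 expected machines -> maxUnhealthy = 1
//	intstr.FromString("25%") // with 4 expected machines -> maxUnhealthy = 1
//	intstr.FromString("30%") // with 5 expected machines -> maxUnhealthy = 1 (1.5 rounded down)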
   638  
    639  // unhealthyMachineCount calculates the number of presently unhealthy or missing machines,
    640  // i.e. the delta between the expected number of machines and the current number deemed healthy.
   641  func unhealthyMachineCount(mhc *clusterv1.MachineHealthCheck) int {
   642  	return int(mhc.Status.ExpectedMachines - mhc.Status.CurrentHealthy)
   643  }
   644  
    645  // getExternalRemediationRequest gets a reference to the external remediation request as an unstructured object.
   646  func (r *Reconciler) getExternalRemediationRequest(ctx context.Context, m *clusterv1.MachineHealthCheck, machineName string) (*unstructured.Unstructured, error) {
   647  	remediationRef := &corev1.ObjectReference{
   648  		APIVersion: m.Spec.RemediationTemplate.APIVersion,
   649  		Kind:       strings.TrimSuffix(m.Spec.RemediationTemplate.Kind, clusterv1.TemplateSuffix),
   650  		Name:       machineName,
   651  	}
   652  	remediationReq, err := external.Get(ctx, r.Client, remediationRef, m.Namespace)
   653  	if err != nil {
   654  		return nil, errors.Wrapf(err, "failed to retrieve external remediation request object")
   655  	}
   656  	return remediationReq, nil
   657  }
   658  
    659  // externalRemediationRequestExists checks whether an external remediation request has been
    660  // created for the machine.
   661  func (r *Reconciler) externalRemediationRequestExists(ctx context.Context, m *clusterv1.MachineHealthCheck, machineName string) bool {
   662  	remediationReq, err := r.getExternalRemediationRequest(ctx, m, machineName)
   663  	if err != nil {
   664  		return false
   665  	}
   666  	return remediationReq != nil
   667  }