sigs.k8s.io/cluster-api@v1.6.3/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package machinehealthcheck

import (
	"context"
	"fmt"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/go-logr/logr"
	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/types"
	kerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/client-go/tools/record"
	"k8s.io/klog/v2"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/builder"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/api/v1beta1/index"
	"sigs.k8s.io/cluster-api/controllers/external"
	"sigs.k8s.io/cluster-api/controllers/remote"
	"sigs.k8s.io/cluster-api/internal/controllers/machine"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/cluster-api/util/conditions"
	"sigs.k8s.io/cluster-api/util/patch"
	"sigs.k8s.io/cluster-api/util/predicates"
)

const (
	// Event types.

	// EventRemediationRestricted is emitted when machine remediation is restricted
	// by the remediation short-circuiting logic.
	EventRemediationRestricted string = "RemediationRestricted"

	maxUnhealthyKeyLog     = "max unhealthy"
	unhealthyTargetsKeyLog = "unhealthy targets"
	unhealthyRangeKeyLog   = "unhealthy range"
	totalTargetKeyLog      = "total target"
)

// +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch;delete
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinehealthchecks;machinehealthchecks/status;machinehealthchecks/finalizers,verbs=get;list;watch;update;patch

// Reconciler reconciles a MachineHealthCheck object.
type Reconciler struct {
	Client  client.Client
	Tracker *remote.ClusterCacheTracker

	// WatchFilterValue is the label value used to filter events prior to reconciliation.
	WatchFilterValue string

	controller controller.Controller
	recorder   record.EventRecorder
}

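// SetupWithManager sets up the controller with the Manager. It watches MachineHealthChecks
// directly and maps Machine and Cluster events back to the MachineHealthChecks that cover them.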
func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
	c, err := ctrl.NewControllerManagedBy(mgr).
		For(&clusterv1.MachineHealthCheck{}).
		Watches(
			&clusterv1.Machine{},
			handler.EnqueueRequestsFromMapFunc(r.machineToMachineHealthCheck),
		).
		WithOptions(options).
		WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue)).
		Watches(
			&clusterv1.Cluster{},
			handler.EnqueueRequestsFromMapFunc(r.clusterToMachineHealthCheck),
			builder.WithPredicates(
				// TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources?
				predicates.All(ctrl.LoggerFrom(ctx),
					predicates.ClusterUnpaused(ctrl.LoggerFrom(ctx)),
					predicates.ResourceHasFilterLabel(ctrl.LoggerFrom(ctx), r.WatchFilterValue),
				),
			),
		).Build(r)
	if err != nil {
		return errors.Wrap(err, "failed setting up with a controller manager")
	}

	r.controller = c
	r.recorder = mgr.GetEventRecorderFor("machinehealthcheck-controller")
	return nil
}

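// Reconcile fetches the MachineHealthCheck and its owning Cluster, ensures the cluster name
// label is set, and delegates the health checking to reconcile. The object and its status are
// always patched on the way out, even when reconciliation returns an error.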
func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
	log := ctrl.LoggerFrom(ctx)

	// Fetch the MachineHealthCheck instance.
	m := &clusterv1.MachineHealthCheck{}
	if err := r.Client.Get(ctx, req.NamespacedName, m); err != nil {
		if apierrors.IsNotFound(err) {
			// Object not found, return. Created objects are automatically garbage collected.
			// For additional cleanup logic use finalizers.
			return ctrl.Result{}, nil
		}

		// Error reading the object - requeue the request.
		log.Error(err, "Failed to fetch MachineHealthCheck")
		return ctrl.Result{}, err
	}

	log = log.WithValues("Cluster", klog.KRef(m.Namespace, m.Spec.ClusterName))
	ctx = ctrl.LoggerInto(ctx, log)

	cluster, err := util.GetClusterByName(ctx, r.Client, m.Namespace, m.Spec.ClusterName)
	if err != nil {
		log.Error(err, "Failed to fetch Cluster for MachineHealthCheck")
		return ctrl.Result{}, err
	}

	// Return early if the object or Cluster is paused.
	if annotations.IsPaused(cluster, m) {
		log.Info("Reconciliation is paused for this object")
		return ctrl.Result{}, nil
	}

	// Initialize the patch helper.
	patchHelper, err := patch.NewHelper(m, r.Client)
	if err != nil {
		log.Error(err, "Failed to build patch helper")
		return ctrl.Result{}, err
	}

	defer func() {
		// Always attempt to patch the object and status after each reconciliation.
		// Patch ObservedGeneration only if the reconciliation completed successfully.
		patchOpts := []patch.Option{}
		if reterr == nil {
			patchOpts = append(patchOpts, patch.WithStatusObservedGeneration{})
		}
		if err := patchHelper.Patch(ctx, m, patchOpts...); err != nil {
			reterr = kerrors.NewAggregate([]error{reterr, err})
		}
	}()

	// Reconcile labels.
	if m.Labels == nil {
		m.Labels = make(map[string]string)
	}
	m.Labels[clusterv1.ClusterNameLabel] = m.Spec.ClusterName

	result, err := r.reconcile(ctx, log, cluster, m)
	if err != nil {
		// Requeue if the reconcile failed because the ClusterCacheTracker was locked for
		// the current cluster due to concurrent access.
		if errors.Is(err, remote.ErrClusterLocked) {
			log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker")
			return ctrl.Result{Requeue: true}, nil
		}
		log.Error(err, "Failed to reconcile MachineHealthCheck")
		r.recorder.Eventf(m, corev1.EventTypeWarning, "ReconcileError", "%v", err)

		// Requeue immediately if any errors occurred.
		return ctrl.Result{}, err
	}

	return result, nil
}

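// reconcile ensures the Cluster owner reference, collects the health check targets, decides
// whether remediation is allowed (based on maxUnhealthy or unhealthyRange), and patches the
// healthy and unhealthy targets accordingly.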
func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) (ctrl.Result, error) {
	// Ensure the MachineHealthCheck is owned by the Cluster it belongs to.
	m.SetOwnerReferences(util.EnsureOwnerRef(m.GetOwnerReferences(), metav1.OwnerReference{
		APIVersion: clusterv1.GroupVersion.String(),
		Kind:       "Cluster",
		Name:       cluster.Name,
		UID:        cluster.UID,
	}))

	// If the cluster is already initialized, get the remote cluster cache to use as a client.Reader.
	var remoteClient client.Client
	if conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) {
		var err error
		remoteClient, err = r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
		if err != nil {
			logger.Error(err, "error creating remote cluster cache")
			return ctrl.Result{}, err
		}

		if err := r.watchClusterNodes(ctx, cluster); err != nil {
			return ctrl.Result{}, err
		}
	}

	// Fetch all targets.
	logger.V(3).Info("Finding targets")
	targets, err := r.getTargetsFromMHC(ctx, logger, remoteClient, cluster, m)
	if err != nil {
		logger.Error(err, "Failed to fetch targets from MachineHealthCheck")
		return ctrl.Result{}, err
	}
	totalTargets := len(targets)
	m.Status.ExpectedMachines = int32(totalTargets)
	m.Status.Targets = make([]string, totalTargets)
	for i, t := range targets {
		m.Status.Targets[i] = t.Machine.Name
	}
	// Sort the targets to avoid churning m.Status, as the returned machines are not in a stable order.
	sort.Strings(m.Status.Targets)

	nodeStartupTimeout := m.Spec.NodeStartupTimeout
	if nodeStartupTimeout == nil {
		nodeStartupTimeout = &clusterv1.DefaultNodeStartupTimeout
	}

	// Health check all targets and reconcile the MachineHealthCheck status.
	healthy, unhealthy, nextCheckTimes := r.healthCheckTargets(targets, logger, *nodeStartupTimeout)
	m.Status.CurrentHealthy = int32(len(healthy))

	// Check the MachineHealthCheck's current health against MaxUnhealthy.
	remediationAllowed, remediationCount, err := isAllowedRemediation(m)
	if err != nil {
		return ctrl.Result{}, errors.Wrapf(err, "error checking if remediation is allowed")
	}

	if !remediationAllowed {
		var message string

		if m.Spec.UnhealthyRange == nil {
			logger.V(3).Info(
				"Short-circuiting remediation",
				totalTargetKeyLog, totalTargets,
				maxUnhealthyKeyLog, m.Spec.MaxUnhealthy,
				unhealthyTargetsKeyLog, len(unhealthy),
			)
			message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: %v, unhealthy: %v, maxUnhealthy: %v)",
				totalTargets,
				len(unhealthy),
				m.Spec.MaxUnhealthy)
		} else {
			logger.V(3).Info(
				"Short-circuiting remediation",
				totalTargetKeyLog, totalTargets,
				unhealthyRangeKeyLog, *m.Spec.UnhealthyRange,
				unhealthyTargetsKeyLog, len(unhealthy),
			)
			message = fmt.Sprintf("Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: %v, unhealthy: %v, unhealthyRange: %v)",
				totalTargets,
				len(unhealthy),
				*m.Spec.UnhealthyRange)
		}

		// Remediation is not allowed: the number of not started or unhealthy machines either exceeds maxUnhealthy or falls outside unhealthyRange.
		m.Status.RemediationsAllowed = 0
		conditions.Set(m, &clusterv1.Condition{
			Type:     clusterv1.RemediationAllowedCondition,
			Status:   corev1.ConditionFalse,
			Severity: clusterv1.ConditionSeverityWarning,
			Reason:   clusterv1.TooManyUnhealthyReason,
			Message:  message,
		})

		r.recorder.Event(
			m,
			corev1.EventTypeWarning,
			EventRemediationRestricted,
			message,
		)
		errList := []error{}
		for _, t := range append(healthy, unhealthy...) {
			if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
				errList = append(errList, errors.Wrapf(err, "failed to patch machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
				continue
			}
		}
		if len(errList) > 0 {
			return ctrl.Result{}, kerrors.NewAggregate(errList)
		}
		return reconcile.Result{Requeue: true}, nil
	}

	if m.Spec.UnhealthyRange == nil {
		logger.V(3).Info(
			"Remediations are allowed",
			totalTargetKeyLog, totalTargets,
			maxUnhealthyKeyLog, m.Spec.MaxUnhealthy,
			unhealthyTargetsKeyLog, len(unhealthy),
		)
	} else {
		logger.V(3).Info(
			"Remediations are allowed",
			totalTargetKeyLog, totalTargets,
			unhealthyRangeKeyLog, *m.Spec.UnhealthyRange,
			unhealthyTargetsKeyLog, len(unhealthy),
		)
	}

	// Remediation is allowed: the unhealthy machine count is within unhealthyRange, or maxUnhealthy - unhealthyMachineCount >= 0.
	m.Status.RemediationsAllowed = remediationCount
	conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition)

	errList := r.patchUnhealthyTargets(ctx, logger, unhealthy, cluster, m)
	errList = append(errList, r.patchHealthyTargets(ctx, logger, healthy, m)...)

	// Handle update errors.
	if len(errList) > 0 {
		logger.V(3).Info("Error(s) marking machine, requeuing")
		return reconcile.Result{}, kerrors.NewAggregate(errList)
	}

	if minNextCheck := minDuration(nextCheckTimes); minNextCheck > 0 {
		logger.V(3).Info("Some targets might go unhealthy. Ensuring a requeue happens", "requeueIn", minNextCheck.Truncate(time.Second).String())
		return ctrl.Result{RequeueAfter: minNextCheck}, nil
	}

	logger.V(3).Info("No more targets meet unhealthy criteria")

	return ctrl.Result{}, nil
}

// patchHealthyTargets patches healthy machines with MachineHealthCheckSucceededCondition.
func (r *Reconciler) patchHealthyTargets(ctx context.Context, logger logr.Logger, healthy []healthCheckTarget, m *clusterv1.MachineHealthCheck) []error {
	errList := []error{}
	for _, t := range healthy {
		if m.Spec.RemediationTemplate != nil {
			// Get the remediation request object.
			obj, err := r.getExternalRemediationRequest(ctx, m, t.Machine.Name)
			if err != nil {
				if !apierrors.IsNotFound(errors.Cause(err)) {
					wrappedErr := errors.Wrapf(err, "failed to fetch remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.Spec.ClusterName)
					errList = append(errList, wrappedErr)
				}
				continue
			}
			// Check that obj has no DeletionTimestamp to avoid a hot loop.
			if obj.GetDeletionTimestamp() == nil {
				// Issue a delete for the remediation request.
				if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) {
					errList = append(errList, errors.Wrapf(err, "failed to delete %v %q for Machine %q", obj.GroupVersionKind(), obj.GetName(), t.Machine.Name))
					continue
				}
			}
		}

		if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
			logger.Error(err, "failed to patch healthy machine status for machine", "machine", t.Machine.GetName())
			errList = append(errList, errors.Wrapf(err, "failed to patch healthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
		}
	}
	return errList
}

// patchUnhealthyTargets patches machines with MachineOwnerRemediatedCondition for remediation.
func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logger, unhealthy []healthCheckTarget, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) []error {
	// Mark for remediation.
	errList := []error{}
	for _, t := range unhealthy {
		condition := conditions.Get(t.Machine, clusterv1.MachineHealthCheckSucceededCondition)

		if annotations.IsPaused(cluster, t.Machine) {
			logger.Info("Machine has failed health check, but machine is paused so skipping remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
		} else {
			if m.Spec.RemediationTemplate != nil {
				// If an external remediation request already exists,
				// return early.
				if r.externalRemediationRequestExists(ctx, m, t.Machine.Name) {
					return errList
				}

				cloneOwnerRef := &metav1.OwnerReference{
					APIVersion: clusterv1.GroupVersion.String(),
					Kind:       "Machine",
					Name:       t.Machine.Name,
					UID:        t.Machine.UID,
				}

				from, err := external.Get(ctx, r.Client, m.Spec.RemediationTemplate, t.Machine.Namespace)
				if err != nil {
					conditions.MarkFalse(m, clusterv1.ExternalRemediationTemplateAvailableCondition, clusterv1.ExternalRemediationTemplateNotFoundReason, clusterv1.ConditionSeverityError, err.Error())
					errList = append(errList, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName))
					return errList
				}

				generateTemplateInput := &external.GenerateTemplateInput{
					Template:    from,
					TemplateRef: m.Spec.RemediationTemplate,
					Namespace:   t.Machine.Namespace,
					ClusterName: t.Machine.Spec.ClusterName,
					OwnerRef:    cloneOwnerRef,
				}
				to, err := external.GenerateTemplate(generateTemplateInput)
				if err != nil {
					errList = append(errList, errors.Wrapf(err, "failed to create template for remediation request %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName))
					return errList
				}

				// Set the remediation request name to match the Machine name; the name is used to
				// guarantee uniqueness between runs. A Machine should only ever have a single
				// remediation object of a specific GVK created.
				//
				// NOTE: This doesn't guarantee uniqueness across different MHC objects watching
				// the same Machine; users are in charge of setting health checks and remediation properly.
				to.SetName(t.Machine.Name)

				logger.Info("Target has failed health check, creating an external remediation request", "remediation request name", to.GetName(), "target", t.string(), "reason", condition.Reason, "message", condition.Message)
				// Create the external clone.
				if err := r.Client.Create(ctx, to); err != nil {
					conditions.MarkFalse(m, clusterv1.ExternalRemediationRequestAvailableCondition, clusterv1.ExternalRemediationRequestCreationFailedReason, clusterv1.ConditionSeverityError, err.Error())
					errList = append(errList, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.Spec.ClusterName))
					return errList
				}
			} else {
				logger.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
				// NOTE: MHC is responsible for creating MachineOwnerRemediatedCondition if missing or to trigger another remediation if the previous one is completed;
				// instead, if a remediation is already in progress, the remediation owner is responsible for completing the process and MHC should not overwrite the condition.
				if !conditions.Has(t.Machine, clusterv1.MachineOwnerRemediatedCondition) || conditions.IsTrue(t.Machine, clusterv1.MachineOwnerRemediatedCondition) {
					conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
				}
			}
		}

		if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
			errList = append(errList, errors.Wrapf(err, "failed to patch unhealthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
			continue
		}
		r.recorder.Eventf(
			t.Machine,
			corev1.EventTypeNormal,
			EventMachineMarkedUnhealthy,
			"Machine %v has been marked as unhealthy",
			t.string(),
		)
	}
	return errList
}

// clusterToMachineHealthCheck maps events from Cluster objects to
// MachineHealthCheck objects that belong to the Cluster.
func (r *Reconciler) clusterToMachineHealthCheck(ctx context.Context, o client.Object) []reconcile.Request {
	c, ok := o.(*clusterv1.Cluster)
	if !ok {
		panic(fmt.Sprintf("Expected a Cluster, got %T", o))
	}

	mhcList := &clusterv1.MachineHealthCheckList{}
	if err := r.Client.List(
		ctx,
		mhcList,
		client.InNamespace(c.Namespace),
		client.MatchingLabels{clusterv1.ClusterNameLabel: c.Name},
	); err != nil {
		return nil
	}

	// This list should only contain MachineHealthChecks which belong to the given Cluster.
	requests := []reconcile.Request{}
	for _, mhc := range mhcList.Items {
		key := types.NamespacedName{Namespace: mhc.Namespace, Name: mhc.Name}
		requests = append(requests, reconcile.Request{NamespacedName: key})
	}
	return requests
}

// machineToMachineHealthCheck maps events from Machine objects to
// MachineHealthCheck objects that monitor the given machine.
func (r *Reconciler) machineToMachineHealthCheck(ctx context.Context, o client.Object) []reconcile.Request {
	m, ok := o.(*clusterv1.Machine)
	if !ok {
		panic(fmt.Sprintf("Expected a Machine, got %T", o))
	}

	mhcList := &clusterv1.MachineHealthCheckList{}
	if err := r.Client.List(
		ctx,
		mhcList,
		client.InNamespace(m.Namespace),
		client.MatchingLabels{clusterv1.ClusterNameLabel: m.Spec.ClusterName},
	); err != nil {
		return nil
	}

	var requests []reconcile.Request
	for k := range mhcList.Items {
		mhc := &mhcList.Items[k]
		if machine.HasMatchingLabels(mhc.Spec.Selector, m.Labels) {
			key := util.ObjectKey(mhc)
			requests = append(requests, reconcile.Request{NamespacedName: key})
		}
	}
	return requests
}

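// nodeToMachineHealthCheck maps events from Node objects in the workload cluster to the
// MachineHealthChecks that monitor the Machine backing that Node.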
func (r *Reconciler) nodeToMachineHealthCheck(ctx context.Context, o client.Object) []reconcile.Request {
	node, ok := o.(*corev1.Node)
	if !ok {
		panic(fmt.Sprintf("Expected a corev1.Node, got %T", o))
	}

	machine, err := getMachineFromNode(ctx, r.Client, node.Name)
	if machine == nil || err != nil {
		return nil
	}

	return r.machineToMachineHealthCheck(ctx, machine)
}

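// watchClusterNodes adds a watch on the workload cluster's Nodes through the ClusterCacheTracker,
// so that Node status changes trigger reconciliation of the relevant MachineHealthChecks.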
func (r *Reconciler) watchClusterNodes(ctx context.Context, cluster *clusterv1.Cluster) error {
	// If there is no tracker, don't watch remote nodes.
	if r.Tracker == nil {
		return nil
	}

	return r.Tracker.Watch(ctx, remote.WatchInput{
		Name:         "machinehealthcheck-watchClusterNodes",
		Cluster:      util.ObjectKey(cluster),
		Watcher:      r.controller,
		Kind:         &corev1.Node{},
		EventHandler: handler.EnqueueRequestsFromMapFunc(r.nodeToMachineHealthCheck),
	})
}

// getMachineFromNode retrieves the machine with a nodeRef to nodeName.
// There should be at most one machine with a given nodeRef; an error is returned otherwise.
func getMachineFromNode(ctx context.Context, c client.Client, nodeName string) (*clusterv1.Machine, error) {
	machineList := &clusterv1.MachineList{}
	if err := c.List(
		ctx,
		machineList,
		client.MatchingFields{index.MachineNodeNameField: nodeName},
	); err != nil {
		return nil, errors.Wrap(err, "failed getting machine list")
	}
	// TODO(vincepri): Remove this loop once controller runtime fake client supports
	// adding indexes on objects.
	items := []*clusterv1.Machine{}
	for i := range machineList.Items {
		machine := &machineList.Items[i]
		if machine.Status.NodeRef != nil && machine.Status.NodeRef.Name == nodeName {
			items = append(items, machine)
		}
	}
	if len(items) != 1 {
		return nil, errors.Errorf("expecting one machine for node %v, got %v", nodeName, machineNames(items))
	}
	return items[0], nil
}

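// machineNames returns the names of the given machines, used for error messages.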
func machineNames(machines []*clusterv1.Machine) []string {
	result := make([]string, 0, len(machines))
	for _, m := range machines {
		result = append(result, m.Name)
	}
	return result
}

// isAllowedRemediation evaluates the MaxUnhealthy or UnhealthyRange fields and returns
// whether remediation is allowed, the number of remediations still allowed, and an error, if any.
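// For illustration (not part of the original source): with maxUnhealthy "40%" and 5 expected
// machines, up to 2 machines may be unhealthy before remediation is short-circuited; with
// unhealthyRange "[3-5]", remediation is only allowed while the unhealthy count is between
// 3 and 5 inclusive.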
func isAllowedRemediation(mhc *clusterv1.MachineHealthCheck) (bool, int32, error) {
	var remediationAllowed bool
	var remediationCount int32
	if mhc.Spec.UnhealthyRange != nil {
		min, max, err := getUnhealthyRange(mhc)
		if err != nil {
			return false, 0, err
		}
		unhealthyMachineCount := unhealthyMachineCount(mhc)
		remediationAllowed = unhealthyMachineCount >= min && unhealthyMachineCount <= max
		remediationCount = int32(max - unhealthyMachineCount)
		return remediationAllowed, remediationCount, nil
	}

	maxUnhealthy, err := getMaxUnhealthy(mhc)
	if err != nil {
		return false, 0, err
	}

	// Remediation is not allowed if the number of unhealthy machines exceeds maxUnhealthy.
	unhealthyMachineCount := unhealthyMachineCount(mhc)
	remediationAllowed = unhealthyMachineCount <= maxUnhealthy
	remediationCount = int32(maxUnhealthy - unhealthyMachineCount)
	return remediationAllowed, remediationCount, nil
}

// getUnhealthyRange parses an integer range and returns the min and max values.
// E.g. "[2-5]" will return (2, 5, nil).
func getUnhealthyRange(mhc *clusterv1.MachineHealthCheck) (int, int, error) {
	// Remove '[' and ']'.
	unhealthyRange := (*(mhc.Spec.UnhealthyRange))[1 : len(*mhc.Spec.UnhealthyRange)-1]

	parts := strings.Split(unhealthyRange, "-")

	min, err := strconv.ParseUint(parts[0], 10, 32)
	if err != nil {
		return 0, 0, err
	}

	max, err := strconv.ParseUint(parts[1], 10, 32)
	if err != nil {
		return 0, 0, err
	}

	if max < min {
		return 0, 0, errors.Errorf("max value %d cannot be less than min value %d for unhealthyRange", max, min)
	}

	return int(min), int(max), nil
}

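// getMaxUnhealthy resolves spec.maxUnhealthy, which may be an absolute integer or a percentage
// of status.expectedMachines, into an absolute machine count. For example (illustrative):
// "40%" with 3 expected machines scales down to 1, because the value is not rounded up.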
func getMaxUnhealthy(mhc *clusterv1.MachineHealthCheck) (int, error) {
	if mhc.Spec.MaxUnhealthy == nil {
		return 0, errors.New("spec.maxUnhealthy must be set")
	}
	maxUnhealthy, err := intstr.GetScaledValueFromIntOrPercent(mhc.Spec.MaxUnhealthy, int(mhc.Status.ExpectedMachines), false)
	if err != nil {
		return 0, err
	}
	return maxUnhealthy, nil
}

// unhealthyMachineCount calculates the number of presently unhealthy or missing machines,
// i.e. the delta between the expected number of machines and the current number deemed healthy.
func unhealthyMachineCount(mhc *clusterv1.MachineHealthCheck) int {
	return int(mhc.Status.ExpectedMachines - mhc.Status.CurrentHealthy)
}

// getExternalRemediationRequest fetches the external remediation request for the given machine as an unstructured object.
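// The request is looked up by the machine name and the template's kind with clusterv1.TemplateSuffix
// trimmed; e.g. (illustrative, assuming the suffix is "Template") a FooRemediationTemplate
// yields a FooRemediation object named after the machine.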
func (r *Reconciler) getExternalRemediationRequest(ctx context.Context, m *clusterv1.MachineHealthCheck, machineName string) (*unstructured.Unstructured, error) {
	remediationRef := &corev1.ObjectReference{
		APIVersion: m.Spec.RemediationTemplate.APIVersion,
		Kind:       strings.TrimSuffix(m.Spec.RemediationTemplate.Kind, clusterv1.TemplateSuffix),
		Name:       machineName,
	}
	remediationReq, err := external.Get(ctx, r.Client, remediationRef, m.Namespace)
	if err != nil {
		return nil, errors.Wrapf(err, "failed to retrieve external remediation request object")
	}
	return remediationReq, nil
}

// externalRemediationRequestExists checks whether an external remediation request has been
// created for the machine.
func (r *Reconciler) externalRemediationRequestExists(ctx context.Context, m *clusterv1.MachineHealthCheck, machineName string) bool {
	remediationReq, err := r.getExternalRemediationRequest(ctx, m, machineName)
	if err != nil {
		return false
	}
	return remediationReq != nil
}