sigs.k8s.io/cluster-api@v1.6.3/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package machinehealthcheck
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"time"
    23  
    24  	"github.com/go-logr/logr"
    25  	"github.com/pkg/errors"
    26  	corev1 "k8s.io/api/core/v1"
    27  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	"k8s.io/apimachinery/pkg/types"
    30  	"k8s.io/klog/v2"
    31  	"sigs.k8s.io/controller-runtime/pkg/client"
    32  
    33  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    34  	"sigs.k8s.io/cluster-api/util"
    35  	"sigs.k8s.io/cluster-api/util/annotations"
    36  	"sigs.k8s.io/cluster-api/util/conditions"
    37  	"sigs.k8s.io/cluster-api/util/patch"
    38  )
    39  
    40  const (
    41  	// Event types.
    42  
    43  	// EventMachineMarkedUnhealthy is emitted when a machine is successfully marked as unhealthy.
    44  	EventMachineMarkedUnhealthy string = "MachineMarkedUnhealthy"
    45  	// EventDetectedUnhealthy is emitted when a node associated with a
    46  	// machine is detected as unhealthy.
    47  	EventDetectedUnhealthy string = "DetectedUnhealthy"
    48  )
    49  
    50  var (
    51  	// We allow users to disable the nodeStartupTimeout by setting the duration to 0.
    52  	disabledNodeStartupTimeout = clusterv1.ZeroDuration
    53  )
    54  
    55  // healthCheckTarget contains the information required to perform a health check
    56  // on the node to determine if any remediation is required.
    57  type healthCheckTarget struct {
    58  	Cluster     *clusterv1.Cluster
    59  	Machine     *clusterv1.Machine
    60  	Node        *corev1.Node
    61  	MHC         *clusterv1.MachineHealthCheck
    62  	patchHelper *patch.Helper
    63  	nodeMissing bool
    64  }
    65  
    66  func (t *healthCheckTarget) string() string {
    67  	return fmt.Sprintf("%s/%s/%s/%s",
    68  		t.MHC.GetNamespace(),
    69  		t.MHC.GetName(),
    70  		t.Machine.GetName(),
    71  		t.nodeName(),
    72  	)
    73  }
    74  
    75  // Get the node name if the target has a node.
    76  func (t *healthCheckTarget) nodeName() string {
    77  	if t.Node != nil {
    78  		return t.Node.GetName()
    79  	}
    80  	return ""
    81  }
    82  
    83  // Determine whether a given target needs remediation.
    84  // The target needs remediation if any of the following are true:
    85  // - The Machine has failed for some reason
    86  // - The Machine did not get a node before `timeoutForMachineToHaveNode` elapses
    87  // - The Node has gone away
    88  // - Any condition on the node is matched for the given timeout
    89  // If the target doesn't currently need remediation, provide a duration after
    90  // which the target should next be checked.
    91  // The target should be requeued after this duration.
    92  func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) {
    93  	var nextCheckTimes []time.Duration
    94  	now := time.Now()
    95  
    96  	if t.Machine.Status.FailureReason != nil {
    97  		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", *t.Machine.Status.FailureReason)
    98  		logger.V(3).Info("Target is unhealthy", "failureReason", t.Machine.Status.FailureReason)
    99  		return true, time.Duration(0)
   100  	}
   101  
   102  	if t.Machine.Status.FailureMessage != nil {
   103  		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureMessage: %v", *t.Machine.Status.FailureMessage)
   104  		logger.V(3).Info("Target is unhealthy", "failureMessage", t.Machine.Status.FailureMessage)
   105  		return true, time.Duration(0)
   106  	}
   107  
   108  	// the node does not exist
   109  	if t.nodeMissing {
   110  		logger.V(3).Info("Target is unhealthy: node is missing")
   111  		conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityWarning, "")
   112  		return true, time.Duration(0)
   113  	}
   114  
   115  	// Don't penalize any Machine/Node if the control plane has not been initialized.
   116  	// The exception to this rule is the control plane machine itself, so that the first control plane machine can be remediated.
   117  	if !conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && !util.IsControlPlaneMachine(t.Machine) {
   118  		logger.V(3).Info("Not evaluating target health because the control plane has not yet been initialized")
   119  		// Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated.
   120  		return false, 0
   121  	}
   122  
   123  	// Don't penalize any Machine/Node if the cluster infrastructure is not ready.
   124  	if !conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) {
   125  		logger.V(3).Info("Not evaluating target health because the cluster infrastructure is not ready")
   126  		// Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated.
   127  		return false, 0
   128  	}
   129  
   130  	// the node has not been set yet
   131  	if t.Node == nil {
   132  		if timeoutForMachineToHaveNode == disabledNodeStartupTimeout {
   133  			// Startup timeout is disabled so no need to go any further.
   134  			// No node yet to check conditions, can return early here.
   135  			return false, 0
   136  		}
   137  
   138  		controlPlaneInitialized := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ControlPlaneInitializedCondition)
   139  		clusterInfraReady := conditions.GetLastTransitionTime(t.Cluster, clusterv1.InfrastructureReadyCondition)
   140  		machineCreationTime := t.Machine.CreationTimestamp.Time
   141  
   142  		// Use the latest of the 3 times
   143  		comparisonTime := machineCreationTime
   144  		logger.V(3).Info("Determining comparison time", "machineCreationTime", machineCreationTime, "clusterInfraReadyTime", clusterInfraReady, "controlPlaneInitializedTime", controlPlaneInitialized)
   145  		if conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && controlPlaneInitialized != nil && controlPlaneInitialized.Time.After(comparisonTime) {
   146  			comparisonTime = controlPlaneInitialized.Time
   147  		}
   148  		if conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) && clusterInfraReady != nil && clusterInfraReady.Time.After(comparisonTime) {
   149  			comparisonTime = clusterInfraReady.Time
   150  		}
   151  		logger.V(3).Info("Using comparison time", "time", comparisonTime)
   152  
   153  		timeoutDuration := timeoutForMachineToHaveNode.Duration
   154  		if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {
   155  			conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration)
   156  			logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutDuration)
   157  			return true, time.Duration(0)
   158  		}
   159  
   160  		durationUnhealthy := now.Sub(comparisonTime)
   161  		nextCheck := timeoutDuration - durationUnhealthy + time.Second
   162  
   163  		return false, nextCheck
   164  	}
   165  
   166  	// check conditions
   167  	for _, c := range t.MHC.Spec.UnhealthyConditions {
   168  		nodeCondition := getNodeCondition(t.Node, c.Type)
   169  
   170  		// Skip when current node condition is different from the one reported
   171  		// in the MachineHealthCheck.
   172  		if nodeCondition == nil || nodeCondition.Status != c.Status {
   173  			continue
   174  		}
   175  
   176  		// If the condition has been in the unhealthy state for longer than the
   177  		// timeout, return true with no requeue time.
   178  		if nodeCondition.LastTransitionTime.Add(c.Timeout.Duration).Before(now) {
   179  			conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.UnhealthyNodeConditionReason, clusterv1.ConditionSeverityWarning, "Condition %s on node is reporting status %s for more than %s", c.Type, c.Status, c.Timeout.Duration.String())
   180  			logger.V(3).Info("Target is unhealthy: condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", c.Timeout.Duration.String())
   181  			return true, time.Duration(0)
   182  		}
   183  
   184  		durationUnhealthy := now.Sub(nodeCondition.LastTransitionTime.Time)
   185  		nextCheck := c.Timeout.Duration - durationUnhealthy + time.Second
   186  		if nextCheck > 0 {
   187  			nextCheckTimes = append(nextCheckTimes, nextCheck)
   188  		}
   189  	}
   190  	return false, minDuration(nextCheckTimes)
   191  }
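
The timeouts consumed by needsRemediation come from the MachineHealthCheck spec: UnhealthyConditions drives the per-condition checks, and NodeStartupTimeout is what gets passed in as timeoutForMachineToHaveNode (a zero duration disables the startup check, per disabledNodeStartupTimeout above). The standalone sketch below builds a hypothetical spec with these fields; the cluster name and durations are invented, and the field names reflect the v1beta1 API as understood here.

package main

import (
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

func main() {
	// Hypothetical MachineHealthCheck: nodes whose Ready condition is False or
	// Unknown for 5 minutes are unhealthy, and a Machine gets 10 minutes to
	// report a Node before the startup-timeout branch of needsRemediation fires.
	mhc := clusterv1.MachineHealthCheck{
		Spec: clusterv1.MachineHealthCheckSpec{
			ClusterName:        "my-cluster",
			NodeStartupTimeout: &metav1.Duration{Duration: 10 * time.Minute},
			UnhealthyConditions: []clusterv1.UnhealthyCondition{
				{Type: corev1.NodeReady, Status: corev1.ConditionFalse, Timeout: metav1.Duration{Duration: 5 * time.Minute}},
				{Type: corev1.NodeReady, Status: corev1.ConditionUnknown, Timeout: metav1.Duration{Duration: 5 * time.Minute}},
			},
		},
	}
	fmt.Println(mhc.Spec.NodeStartupTimeout.Duration, len(mhc.Spec.UnhealthyConditions))
}
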
   192  
   193  // getTargetsFromMHC uses the MachineHealthCheck's selector to fetch machines
   194  // and their nodes targeted by the health check, ready for health checking.
   195  func (r *Reconciler) getTargetsFromMHC(ctx context.Context, logger logr.Logger, clusterClient client.Reader, cluster *clusterv1.Cluster, mhc *clusterv1.MachineHealthCheck) ([]healthCheckTarget, error) {
   196  	machines, err := r.getMachinesFromMHC(ctx, mhc)
   197  	if err != nil {
   198  		return nil, errors.Wrap(err, "error getting machines from MachineHealthCheck")
   199  	}
   200  	if len(machines) == 0 {
   201  		return nil, nil
   202  	}
   203  
   204  	targets := []healthCheckTarget{}
   205  	for k := range machines {
   206  		logger := logger.WithValues("Machine", klog.KObj(&machines[k]))
   207  		skip, reason := shouldSkipRemediation(&machines[k])
   208  		if skip {
   209  			logger.Info("skipping remediation", "reason", reason)
   210  			continue
   211  		}
   212  
   213  		patchHelper, err := patch.NewHelper(&machines[k], r.Client)
   214  		if err != nil {
   215  			return nil, errors.Wrap(err, "unable to initialize patch helper")
   216  		}
   217  		target := healthCheckTarget{
   218  			Cluster:     cluster,
   219  			MHC:         mhc,
   220  			Machine:     &machines[k],
   221  			patchHelper: patchHelper,
   222  		}
   223  		if clusterClient != nil {
   224  			node, err := r.getNodeFromMachine(ctx, clusterClient, target.Machine)
   225  			if err != nil {
   226  				if !apierrors.IsNotFound(err) {
   227  					return nil, errors.Wrap(err, "error getting node")
   228  				}
   229  
   230  				// A node has been seen for this machine, but it no longer exists
   231  				target.nodeMissing = true
   232  			}
   233  			target.Node = node
   234  		}
   235  		targets = append(targets, target)
   236  	}
   237  	return targets, nil
   238  }
   239  
   240  // getMachinesFromMHC fetches Machines matched by the MachineHealthCheck's
   241  // label selector.
   242  func (r *Reconciler) getMachinesFromMHC(ctx context.Context, mhc *clusterv1.MachineHealthCheck) ([]clusterv1.Machine, error) {
   243  	selector, err := metav1.LabelSelectorAsSelector(metav1.CloneSelectorAndAddLabel(
   244  		&mhc.Spec.Selector, clusterv1.ClusterNameLabel, mhc.Spec.ClusterName,
   245  	))
   246  	if err != nil {
   247  		return nil, errors.Wrap(err, "failed to build selector")
   248  	}
   249  
   250  	var machineList clusterv1.MachineList
   251  	if err := r.Client.List(
   252  		ctx,
   253  		&machineList,
   254  		client.MatchingLabelsSelector{Selector: selector},
   255  		client.InNamespace(mhc.GetNamespace()),
   256  	); err != nil {
   257  		return nil, errors.Wrap(err, "failed to list machines")
   258  	}
   259  	return machineList.Items, nil
   260  }
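
To make the selector handling above concrete, here is a minimal, hypothetical sketch of how the user-provided selector gets the cluster-name label appended before listing, so Machines from other clusters can never match (the selector values and cluster name are made up).

package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

func main() {
	// A user-provided selector as it might appear in an MHC spec (made up).
	userSelector := &metav1.LabelSelector{
		MatchLabels: map[string]string{"nodepool": "workers"},
	}

	// getMachinesFromMHC clones this selector and adds the cluster-name label
	// before listing, scoping the health check to the MHC's own cluster.
	selector, err := metav1.LabelSelectorAsSelector(metav1.CloneSelectorAndAddLabel(
		userSelector, clusterv1.ClusterNameLabel, "my-cluster",
	))
	if err != nil {
		panic(err)
	}
	// Prints something like:
	// cluster.x-k8s.io/cluster-name=my-cluster,nodepool=workers
	fmt.Println(selector.String())
}
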
   261  
   262  // getNodeFromMachine fetches the node from a local or remote cluster for a
   263  // given machine.
   264  func (r *Reconciler) getNodeFromMachine(ctx context.Context, clusterClient client.Reader, machine *clusterv1.Machine) (*corev1.Node, error) {
   265  	if machine.Status.NodeRef == nil {
   266  		return nil, nil
   267  	}
   268  
   269  	node := &corev1.Node{}
   270  	nodeKey := types.NamespacedName{
   271  		Name: machine.Status.NodeRef.Name,
   272  	}
   273  
   274  	// If the node cannot be found, return a nil node along with the error; the caller decides how to handle NotFound.
   275  	if err := clusterClient.Get(ctx, nodeKey, node); err != nil {
   276  		return nil, err
   277  	}
   278  	return node, nil
   279  }
   280  
   281  // healthCheckTargets health checks a slice of targets
   282  // and returns the data needed to measure their overall health.
   283  func (r *Reconciler) healthCheckTargets(targets []healthCheckTarget, logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) ([]healthCheckTarget, []healthCheckTarget, []time.Duration) {
   284  	var nextCheckTimes []time.Duration
   285  	var unhealthy []healthCheckTarget
   286  	var healthy []healthCheckTarget
   287  
   288  	for _, t := range targets {
   289  		logger := logger.WithValues("Target", t.string())
   290  		logger.V(3).Info("Health checking target")
   291  		needsRemediation, nextCheck := t.needsRemediation(logger, timeoutForMachineToHaveNode)
   292  
   293  		if needsRemediation {
   294  			unhealthy = append(unhealthy, t)
   295  			continue
   296  		}
   297  
   298  		if nextCheck > 0 {
   299  			logger.V(3).Info("Target is likely to go unhealthy", "timeUntilUnhealthy", nextCheck.Truncate(time.Second).String())
   300  			r.recorder.Eventf(
   301  				t.Machine,
   302  				corev1.EventTypeNormal,
   303  				EventDetectedUnhealthy,
   304  				"Machine %v has unhealthy node %v",
   305  				t.string(),
   306  				t.nodeName(),
   307  			)
   308  			nextCheckTimes = append(nextCheckTimes, nextCheck)
   309  			continue
   310  		}
   311  
   312  		if t.Machine.DeletionTimestamp.IsZero() && t.Node != nil {
   313  			conditions.MarkTrue(t.Machine, clusterv1.MachineHealthCheckSucceededCondition)
   314  			healthy = append(healthy, t)
   315  		}
   316  	}
   317  	return healthy, unhealthy, nextCheckTimes
   318  }
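
A hedged sketch of how a reconcile loop could consume the three return values: unhealthy targets would be remediated, and the soonest entry in nextCheckTimes becomes the requeue interval. Here ctrl is assumed to be sigs.k8s.io/controller-runtime, and minDurationSketch mirrors the package-local minDuration helper defined further down so the snippet stands alone; this is not the actual Reconcile implementation.

package main

import (
	"fmt"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

// minDurationSketch mirrors the minDuration helper defined later in this file.
func minDurationSketch(durations []time.Duration) time.Duration {
	if len(durations) == 0 {
		return 0
	}
	shortest := durations[0]
	for _, d := range durations[1:] {
		if d < shortest {
			shortest = d
		}
	}
	return shortest
}

func main() {
	// Hypothetical per-target durations returned by healthCheckTargets.
	nextCheckTimes := []time.Duration{4 * time.Minute, 90 * time.Second, 10 * time.Minute}

	// Requeue on the soonest upcoming check so the first target that may cross
	// its timeout is re-evaluated promptly.
	if requeueAfter := minDurationSketch(nextCheckTimes); requeueAfter > 0 {
		fmt.Printf("would requeue after %s: %+v\n", requeueAfter, ctrl.Result{RequeueAfter: requeueAfter})
	}
}
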
   319  
   320  // getNodeCondition returns the node condition with the given type, or nil if it is not present.
   321  func getNodeCondition(node *corev1.Node, conditionType corev1.NodeConditionType) *corev1.NodeCondition {
   322  	for _, cond := range node.Status.Conditions {
   323  		if cond.Type == conditionType {
   324  			return &cond
   325  		}
   326  	}
   327  	return nil
   328  }
   329  
   330  func minDuration(durations []time.Duration) time.Duration {
   331  	if len(durations) == 0 {
   332  		return time.Duration(0)
   333  	}
   334  
   335  	minDuration := durations[0]
   336  	// Ignore first element as that is already minDuration
   337  	for _, nc := range durations[1:] {
   338  		if nc < minDuration {
   339  			minDuration = nc
   340  		}
   341  	}
   342  	return minDuration
   343  }
   344  
   345  // shouldSkipRemediation checks whether remediation should be skipped for the machine.
   346  // It returns true if remediation should be skipped, along with the reason for skipping.
   347  func shouldSkipRemediation(m *clusterv1.Machine) (bool, string) {
   348  	if annotations.HasPaused(m) {
   349  		return true, fmt.Sprintf("machine has %q annotation", clusterv1.PausedAnnotation)
   350  	}
   351  
   352  	if annotations.HasSkipRemediation(m) {
   353  		return true, fmt.Sprintf("machine has %q annotation", clusterv1.MachineSkipRemediationAnnotation)
   354  	}
   355  
   356  	return false, ""
   357  }
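
As a usage note, both annotations checked by shouldSkipRemediation are opt-outs set on the Machine object itself. A small hypothetical example, assuming the exported constant and helper behave as they are used above:

package main

import (
	"fmt"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/annotations"
)

func main() {
	// A made-up Machine opted out of remediation via the skip-remediation
	// annotation; shouldSkipRemediation would report it as skipped.
	m := &clusterv1.Machine{}
	m.SetAnnotations(map[string]string{
		clusterv1.MachineSkipRemediationAnnotation: "",
	})
	fmt.Println(annotations.HasSkipRemediation(m)) // true
}
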