sigs.k8s.io/cluster-api@v1.7.1/internal/controllers/machine/machine_controller_noderef.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package machine

import (
	"context"
	"fmt"
	"strings"

	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/klog/v2"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/api/v1beta1/index"
	"sigs.k8s.io/cluster-api/internal/controllers/machinedeployment/mdutil"
	"sigs.k8s.io/cluster-api/internal/util/taints"
	"sigs.k8s.io/cluster-api/util"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/cluster-api/util/conditions"
)

var (
	// ErrNodeNotFound signals that a corev1.Node could not be found for the given provider id.
	ErrNodeNotFound = errors.New("cannot find node with matching ProviderID")
)

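// reconcileNode looks up the workload cluster Node matching the Machine's spec.providerID, sets
// Status.NodeRef and Status.NodeInfo, propagates managed labels, annotations and taints to the Node,
// and reflects the Node's health in the MachineNodeHealthy condition.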
func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result, error) {
	log := ctrl.LoggerFrom(ctx)
	cluster := s.cluster
	machine := s.machine
	infraMachine := s.infraMachine

	// Create a watch on the nodes in the Cluster.
	if err := r.watchClusterNodes(ctx, cluster); err != nil {
		return ctrl.Result{}, err
	}

	// Check that the Machine has a valid ProviderID.
	if machine.Spec.ProviderID == nil || *machine.Spec.ProviderID == "" {
		log.Info("Waiting for infrastructure provider to report spec.providerID", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Spec.InfrastructureRef.Namespace, machine.Spec.InfrastructureRef.Name))
		conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.WaitingForNodeRefReason, clusterv1.ConditionSeverityInfo, "")
		return ctrl.Result{}, nil
	}

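	// Get a client for the workload cluster to look up and patch the Node.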
	remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
	if err != nil {
		return ctrl.Result{}, err
	}

	// Even if Status.NodeRef exists, continue to do the following checks to make sure the Node is healthy.
	node, err := r.getNode(ctx, remoteClient, *machine.Spec.ProviderID)
	if err != nil {
		if err == ErrNodeNotFound {
			// If Status.NodeRef was already set, failing to get that Node means it has been deleted.
			// If Status.NodeRef was never set, the Node may still be provisioning.
			if machine.Status.NodeRef != nil {
				conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityError, "")
				return ctrl.Result{}, errors.Wrapf(err, "no matching Node for Machine %q in namespace %q", machine.Name, machine.Namespace)
			}
			conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeProvisioningReason, clusterv1.ConditionSeverityWarning, "")
			// No need to requeue here. Nodes emit an event that triggers reconciliation.
			return ctrl.Result{}, nil
		}
		r.recorder.Event(machine, corev1.EventTypeWarning, "Failed to retrieve Node by ProviderID", err.Error())
		conditions.MarkUnknown(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeInspectionFailedReason, "Failed to get the Node for this Machine by ProviderID")
		return ctrl.Result{}, err
	}

	// Set the Machine NodeRef.
	if machine.Status.NodeRef == nil {
		machine.Status.NodeRef = &corev1.ObjectReference{
			APIVersion: corev1.SchemeGroupVersion.String(),
			Kind:       "Node",
			Name:       node.Name,
			UID:        node.UID,
		}
		log.Info("Infrastructure provider reporting spec.providerID, Kubernetes node is now available", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Spec.InfrastructureRef.Namespace, machine.Spec.InfrastructureRef.Name), "providerID", *machine.Spec.ProviderID, "node", klog.KRef("", machine.Status.NodeRef.Name))
		r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name)
	}

	// Set the NodeSystemInfo.
	machine.Status.NodeInfo = &node.Status.NodeInfo

	// Compute all the annotations that CAPI is setting on nodes;
	// CAPI only enforces some annotations and never changes or removes them.
	nodeAnnotations := map[string]string{
		clusterv1.ClusterNameAnnotation:      machine.Spec.ClusterName,
		clusterv1.ClusterNamespaceAnnotation: machine.GetNamespace(),
		clusterv1.MachineAnnotation:          machine.Name,
	}
	if owner := metav1.GetControllerOfNoCopy(machine); owner != nil {
		nodeAnnotations[clusterv1.OwnerKindAnnotation] = owner.Kind
		nodeAnnotations[clusterv1.OwnerNameAnnotation] = owner.Name
	}

	// Compute labels to be propagated from Machines to nodes.
	// NOTE: CAPI should manage only a subset of node labels; everything else should be preserved.
	// NOTE: Once we reconcile node labels for the first time, the NodeUninitializedTaint is removed from the node.
	nodeLabels := getManagedLabels(machine.Labels)

	// Get interruptible instance status from the infrastructure provider and set the interruptible label on the node.
	interruptible := false
	found := false
	if infraMachine != nil {
		interruptible, found, err = unstructured.NestedBool(infraMachine.Object, "status", "interruptible")
		if err != nil {
			return ctrl.Result{}, errors.Wrapf(err, "failed to get status interruptible from infra machine %s", klog.KObj(infraMachine))
		}
		// If interruptible is set and true, add the interruptible label to the node labels.
		if found && interruptible {
			nodeLabels[clusterv1.InterruptibleLabel] = ""
		}
	}

	_, nodeHadInterruptibleLabel := node.Labels[clusterv1.InterruptibleLabel]

	// Reconcile node labels, annotations and taints.
	if err := r.patchNode(ctx, remoteClient, node, nodeLabels, nodeAnnotations, machine); err != nil {
		return ctrl.Result{}, errors.Wrapf(err, "failed to reconcile Node %s", klog.KObj(node))
	}
	if !nodeHadInterruptibleLabel && interruptible {
		// If the interruptible label is added to the node, record the event.
		// Nb. Only record the event if the node previously did not have the label to avoid recording
		// the event during every reconcile.
		r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetInterruptibleNodeLabel", node.Name)
	}

	// Do the remaining node health checks, then set the node health to true if all checks pass.
	status, message := summarizeNodeConditions(node)
	if status == corev1.ConditionFalse {
		conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, clusterv1.ConditionSeverityWarning, message)
		return ctrl.Result{}, nil
	}
	if status == corev1.ConditionUnknown {
		conditions.MarkUnknown(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, message)
		return ctrl.Result{}, nil
	}

	conditions.MarkTrue(machine, clusterv1.MachineNodeHealthyCondition)
	return ctrl.Result{}, nil
}

// getManagedLabels takes a map of labels and returns a new map containing only the labels
// managed by CAPI; everything else is filtered out.
func getManagedLabels(labels map[string]string) map[string]string {
	managedLabels := make(map[string]string)
	for key, value := range labels {
		dnsSubdomainOrName := strings.Split(key, "/")[0]
		if dnsSubdomainOrName == clusterv1.NodeRoleLabelPrefix {
			managedLabels[key] = value
		}
		if dnsSubdomainOrName == clusterv1.NodeRestrictionLabelDomain || strings.HasSuffix(dnsSubdomainOrName, "."+clusterv1.NodeRestrictionLabelDomain) {
			managedLabels[key] = value
		}
		if dnsSubdomainOrName == clusterv1.ManagedNodeLabelDomain || strings.HasSuffix(dnsSubdomainOrName, "."+clusterv1.ManagedNodeLabelDomain) {
			managedLabels[key] = value
		}
	}

	return managedLabels
}

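// Illustrative sketch (not part of the upstream file), assuming the usual values of the label
// constants ("node-role.kubernetes.io", "node-restriction.kubernetes.io", "node.cluster.x-k8s.io"):
//
//	getManagedLabels(map[string]string{
//		"node-role.kubernetes.io/worker":      "",      // kept: node-role prefix
//		"node-restriction.kubernetes.io/zone": "z1",    // kept: node-restriction domain
//		"tier.node.cluster.x-k8s.io/workload": "gpu",   // kept: subdomain of the managed node label domain
//		"app.kubernetes.io/name":              "nginx", // dropped: not managed by CAPI
//	})
//	// => returns a map containing only the first three entries.
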
// summarizeNodeConditions summarizes a Node's conditions and returns the summarized condition status
// together with the concatenated messages of the failed conditions:
// if there is at least 1 semantically-negative condition, the summarized status is False;
// if there is at least 1 semantically-positive condition and no semantically-negative condition, the summarized status is True;
// if all conditions are unknown, the summarized status is Unknown.
// (Semantically-positive conditions: NodeMemoryPressure/NodeDiskPressure/NodePIDPressure == False or Ready == True.)
func summarizeNodeConditions(node *corev1.Node) (corev1.ConditionStatus, string) {
	semanticallyFalseStatus := 0
	unknownStatus := 0

	message := ""
	for _, condition := range node.Status.Conditions {
		switch condition.Type {
		case corev1.NodeMemoryPressure, corev1.NodeDiskPressure, corev1.NodePIDPressure:
			if condition.Status != corev1.ConditionFalse {
				message += fmt.Sprintf("Node condition %s is %s", condition.Type, condition.Status) + ". "
				if condition.Status == corev1.ConditionUnknown {
					unknownStatus++
					continue
				}
				semanticallyFalseStatus++
			}
		case corev1.NodeReady:
			if condition.Status != corev1.ConditionTrue {
				message += fmt.Sprintf("Node condition %s is %s", condition.Type, condition.Status) + ". "
				if condition.Status == corev1.ConditionUnknown {
					unknownStatus++
					continue
				}
				semanticallyFalseStatus++
			}
		}
	}
	if semanticallyFalseStatus > 0 {
		return corev1.ConditionFalse, message
	}
	if semanticallyFalseStatus+unknownStatus < 4 {
		return corev1.ConditionTrue, message
	}
	return corev1.ConditionUnknown, message
}

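// Illustrative sketch (not part of the upstream file): a Node reporting Ready=False and
// MemoryPressure=Unknown is summarized as False, with both problems concatenated into the message:
//
//	node := &corev1.Node{Status: corev1.NodeStatus{Conditions: []corev1.NodeCondition{
//		{Type: corev1.NodeReady, Status: corev1.ConditionFalse},
//		{Type: corev1.NodeMemoryPressure, Status: corev1.ConditionUnknown},
//	}}}
//	status, message := summarizeNodeConditions(node)
//	// status == corev1.ConditionFalse
//	// message == "Node condition Ready is False. Node condition MemoryPressure is Unknown. "
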
// getNode looks up the Node in the workload cluster whose spec.providerID matches the given provider ID.
// It returns ErrNodeNotFound if no such Node exists.
func (r *Reconciler) getNode(ctx context.Context, c client.Reader, providerID string) (*corev1.Node, error) {
	nodeList := corev1.NodeList{}
	if err := c.List(ctx, &nodeList, client.MatchingFields{index.NodeProviderIDField: providerID}); err != nil {
		return nil, err
	}
	if len(nodeList.Items) == 0 {
		// If for whatever reason the index isn't registered or available, we fall back to listing the
		// Nodes page by page and comparing provider IDs directly.
		nl := corev1.NodeList{}
		for {
			if err := c.List(ctx, &nl, client.Continue(nl.Continue)); err != nil {
				return nil, err
			}

			for _, node := range nl.Items {
				if providerID == node.Spec.ProviderID {
					return &node, nil
				}
			}

			if nl.Continue == "" {
				break
			}
		}

		return nil, ErrNodeNotFound
	}

	if len(nodeList.Items) != 1 {
		return nil, fmt.Errorf("unexpectedly found more than one Node matching the providerID %s", providerID)
	}

	return &nodeList.Items[0], nil
}

// patchNode is required to work around an issue on Node.Status.Addresses, which is incorrectly annotated as
// patchStrategy=merge; this causes SSA patches to fail when there are two addresses with the same key.
// See https://github.com/kubernetes-sigs/cluster-api/issues/8417.
func (r *Reconciler) patchNode(ctx context.Context, remoteClient client.Client, node *corev1.Node, newLabels, newAnnotations map[string]string, m *clusterv1.Machine) error {
	newNode := node.DeepCopy()

	// Adds the annotations CAPI sets on the node.
	hasAnnotationChanges := annotations.AddAnnotations(newNode, newAnnotations)

	// Adds the labels from the Machine.
	// NOTE: in order to handle deletion we track the labels set by the Machine in an annotation.
	// At the next reconcile we use this annotation to delete labels that were previously set by the
	// Machine but are not present anymore. Labels not set by the Machine are always preserved.
	if newNode.Labels == nil {
		newNode.Labels = make(map[string]string)
	}
	hasLabelChanges := false
	labelsFromPreviousReconcile := strings.Split(newNode.Annotations[clusterv1.LabelsFromMachineAnnotation], ",")
	if len(labelsFromPreviousReconcile) == 1 && labelsFromPreviousReconcile[0] == "" {
		labelsFromPreviousReconcile = []string{}
	}
	labelsFromCurrentReconcile := []string{}
	for k, v := range newLabels {
		if cur, ok := newNode.Labels[k]; !ok || cur != v {
			newNode.Labels[k] = v
			hasLabelChanges = true
		}
		labelsFromCurrentReconcile = append(labelsFromCurrentReconcile, k)
	}
	for _, k := range labelsFromPreviousReconcile {
		if _, ok := newLabels[k]; !ok {
			delete(newNode.Labels, k)
			hasLabelChanges = true
		}
	}
	annotations.AddAnnotations(newNode, map[string]string{clusterv1.LabelsFromMachineAnnotation: strings.Join(labelsFromCurrentReconcile, ",")})

	// Drop the NodeUninitializedTaint from the node, given that we are reconciling labels.
	hasTaintChanges := taints.RemoveNodeTaint(newNode, clusterv1.NodeUninitializedTaint)

	// Set the NodeOutdatedRevisionTaint on nodes belonging to an outdated MachineSet and remove it from
	// nodes belonging to the current MachineSet.
	isOutdated, err := shouldNodeHaveOutdatedTaint(ctx, r.Client, m)
	if err != nil {
		return errors.Wrapf(err, "failed to check if Node %s is outdated", klog.KRef("", node.Name))
	}
	if isOutdated {
		hasTaintChanges = taints.EnsureNodeTaint(newNode, clusterv1.NodeOutdatedRevisionTaint) || hasTaintChanges
	} else {
		hasTaintChanges = taints.RemoveNodeTaint(newNode, clusterv1.NodeOutdatedRevisionTaint) || hasTaintChanges
	}

	if !hasAnnotationChanges && !hasLabelChanges && !hasTaintChanges {
		return nil
	}

	return remoteClient.Patch(ctx, newNode, client.StrategicMergeFrom(node))
}

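// Illustrative sketch (not part of the upstream file) of the label bookkeeping in patchNode above,
// using a made-up managed label key:
//
//	reconcile 1: newLabels = {"node.cluster.x-k8s.io/tier": "gold"}
//	             the label is added to the Node and its key is recorded in the
//	             LabelsFromMachineAnnotation annotation.
//	reconcile 2: the label was removed from the Machine, so newLabels = {}
//	             the key is still listed in the annotation, so it is deleted from the Node and the
//	             annotation is rewritten; labels never recorded there are left untouched.
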
// shouldNodeHaveOutdatedTaint returns true if the Machine is owned by a MachineSet whose revision is
// older than the revision of the owning MachineDeployment, i.e. the Node is backed by an outdated MachineSet.
func shouldNodeHaveOutdatedTaint(ctx context.Context, c client.Client, m *clusterv1.Machine) (bool, error) {
	if _, hasLabel := m.Labels[clusterv1.MachineDeploymentNameLabel]; !hasLabel {
		return false, nil
	}

	// Resolve the MachineSet name via owner references because the label value
	// could also be a hash.
	objKey, err := getOwnerMachineSetObjectKey(m.ObjectMeta)
	if err != nil {
		return false, err
	}
	ms := &clusterv1.MachineSet{}
	if err := c.Get(ctx, *objKey, ms); err != nil {
		return false, err
	}
	md := &clusterv1.MachineDeployment{}
	objKey = &client.ObjectKey{
		Namespace: m.ObjectMeta.Namespace,
		Name:      m.Labels[clusterv1.MachineDeploymentNameLabel],
	}
	if err := c.Get(ctx, *objKey, md); err != nil {
		return false, err
	}
	msRev, err := mdutil.Revision(ms)
	if err != nil {
		return false, err
	}
	mdRev, err := mdutil.Revision(md)
	if err != nil {
		return false, err
	}
	if msRev < mdRev {
		return true, nil
	}
	return false, nil
}

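// Illustrative sketch (not part of the upstream file): during a MachineDeployment rollout the old
// MachineSet may still carry revision "3" while the MachineDeployment has already moved to revision "4";
// msRev < mdRev, so Machines owned by the old MachineSet get the NodeOutdatedRevisionTaint on their Nodes.
// Once a MachineSet is current again (revisions equal), patchNode removes the taint.
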
// getOwnerMachineSetObjectKey returns the ObjectKey of the MachineSet owning the given Machine,
// derived from its owner references.
func getOwnerMachineSetObjectKey(obj metav1.ObjectMeta) (*client.ObjectKey, error) {
	for _, ref := range obj.GetOwnerReferences() {
		gv, err := schema.ParseGroupVersion(ref.APIVersion)
		if err != nil {
			return nil, err
		}
		if ref.Kind == "MachineSet" && gv.Group == clusterv1.GroupVersion.Group {
			return &client.ObjectKey{Namespace: obj.Namespace, Name: ref.Name}, nil
		}
	}
	return nil, errors.Errorf("failed to find MachineSet owner reference for Machine %s", klog.KRef(obj.GetNamespace(), obj.GetName()))
}